/* 
 *  idct_miha.c
 *
 *	Copyright (C) Alberto Vigata - December 2000  ultraflask@yahoo.com
 *                 
 *
 *  This file is part of FlasKMPEG, a free MPEG to MPEG/AVI converter
 *	
 *  FlasKMPEG is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *   
 *  FlasKMPEG is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *   
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */


/*************************************************************/
/* inverse two dimensional DCT, Chen-Wang algorithm          */
/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)                */
/*                                                           */
/* floating point conversion by Miha Peternel                */
/* x87 hand-optimized assembly by Miha Peternel              */
/*                                    27.11. - 11.12.2000    */
/*                                                           */
/* You are free to use this code in your project if:         */
/* - no changes are made to this message                     */
/* - any changes to this code are publicly available         */
/* - your project documentation contains the following text: */
/*   "This software contains fast high-quality IDCT decoder  */
/*    by Miha Peternel."                                     */
/*                                                           */
/*************************************************************/


/////////////////////////////////////////////////////
//
// TODO:
// - loops can be easily vectorized for SIMD
//
/////////////////////////////////////////////////////

//#include "config.h"

#include <math.h>
#ifndef PI
# ifdef M_PI
#  define PI M_PI
# else
#  define PI 3.14159265358979323846
# endif
#endif


/* Working type of the transform; the WIN32 assembly loads these statics as qwords. */
#define FLOAT double


/*
 * Rounding bias used by the portable column pass: 2^52 plus the offset of
 * the clip table centre (1024).
 */
static const double RC = 4503599627370496.0 /* 2^52 */ + 1024; // magic + clip center

/*
 * Everything below except D8 is zero until Initialize_Fast_IDCT() runs.
 * W<k> is sqrt(2)*cos(k*pi/16).
 */
static FLOAT W1;
static FLOAT W2;
static FLOAT W3;
static FLOAT W5;
static FLOAT W6;
static FLOAT W7;

/* sums and differences of weight pairs ("m" = minus, "p" = plus) */
static FLOAT W1mW7;
static FLOAT W1pW7;
static FLOAT W3mW5;
static FLOAT W3pW5;
static FLOAT W2mW6;
static FLOAT W2pW6;

static FLOAT S2;         /* sqrt(0.5), i.e. 1/sqrt(2) */
static FLOAT D8 = 1.0/8; /* scale applied to the even inputs of the column pass */

/* the same weights divided by 8, used by the column pass */
static FLOAT W1_8;
static FLOAT W2_8;
static FLOAT W3_8;
static FLOAT W5_8;
static FLOAT W6_8;
static FLOAT W7_8;

static FLOAT W1mW7_8;
static FLOAT W1pW7_8;
static FLOAT W3mW5_8;
static FLOAT W3pW5_8;
static FLOAT W2mW6_8;
static FLOAT W2pW6_8;

//static FLOAT fblock[8*8];

/* file-private clipping table: 2048 entries, filled by Initialize_Fast_IDCT() */
static short iclip[2*1024];
static short *iclp; /* set by Initialize_Fast_IDCT() to the middle entry, iclip+1024 */

/* public entry points */
void Initialize_Fast_IDCT(void);
void Fast_IDCT(short *block);



/*
 * Initialize_Fast_IDCT - compute the cosine weights and build the clip table.
 *
 * The weights and the clip table are zero until this has run, so it has to
 * be called once before the first Fast_IDCT().
 */
void Initialize_Fast_IDCT()
{
  int slot;

  /* base weights: W<k> = sqrt(2)*cos(k*pi/16) */
  W1 = sqrt(2)*cos(PI*(1.0/16));
  W2 = sqrt(2)*cos(PI*(2.0/16));
  W3 = sqrt(2)*cos(PI*(3.0/16));
  W5 = sqrt(2)*cos(PI*(5.0/16));
  W6 = sqrt(2)*cos(PI*(6.0/16));
  W7 = sqrt(2)*cos(PI*(7.0/16));
  S2 = sqrt(0.5); // 1.0/sqrt(2)

  /* pairwise sums and differences used by the butterflies */
  W1mW7 = W1-W7;
  W1pW7 = W1+W7;
  W3mW5 = W3-W5;
  W3pW5 = W3+W5;
  W2mW6 = W2-W6;
  W2pW6 = W2+W6;

  /* copies divided by 8 for the column pass */
  W1_8 = W1/8;
  W2_8 = W2/8;
  W3_8 = W3/8;
  W5_8 = W5/8;
  W6_8 = W6/8;
  W7_8 = W7/8;

  W1mW7_8 = W1mW7/8;
  W1pW7_8 = W1pW7/8;
  W3mW5_8 = W3mW5/8;
  W3pW5_8 = W3pW5/8;
  W2mW6_8 = W2mW6/8;
  W2pW6_8 = W2pW6/8;

  /* clip table: entry (1024+v) holds v saturated to [-256, 255] */
  iclp = iclip+1024;
  for (slot = 0; slot < 1024+1024; slot++)
  {
    int v = slot-1024;

    if (v < -256)
      v = -256;
    else if (v > 255)
      v = 255;

    iclip[slot] = v;
  }
}


#ifndef WIN32

/* private prototypes */
//static void idctrow _ANSI_ARGS_((short *blk));
//static void idctcol _ANSI_ARGS_((short *blk));

/*
 * idctrow - one-dimensional 8-point inverse DCT of a single row.
 *
 * blk:  the 8 input coefficients of the row (read only).
 * fblk: receives the 8 floating-point results of the row.
 *
 * No scaling is applied here; idctcol() folds the 1/8 factor into its
 * inputs and weights.
 */
static void idctrow( short *blk, FLOAT *fblk )
{
  FLOAT x0, x1, x2, x3, x4, x5, x6, x7, x8;

  /*
   * Shortcut: when every AC coefficient is zero the row is flat and equal
   * to the DC term. The test ORs the shorts directly; reading the row
   * through an int pointer would break the aliasing rules and assume
   * 32-bit, 4-byte-aligned ints.
   */
  if( !( blk[1] | blk[2] | blk[3] | blk[4] | blk[5] | blk[6] | blk[7] ))
  {
    fblk[0]=fblk[1]=fblk[2]=fblk[3]=fblk[4]=fblk[5]=fblk[6]=fblk[7]= blk[0];
    return;
  }

  x1 = blk[4];
  x2 = blk[6];
  x3 = blk[2];
  x4 = blk[1];
  x5 = blk[7];
  x6 = blk[5];
  x7 = blk[3];

  x0 = blk[0];

  /* first stage: rotations of the odd coefficients */
  x8 = W7*(x4+x5);
  x4 = x8 + (W1mW7)*x4;
  x5 = x8 - (W1pW7)*x5;
  x8 = W3*(x6+x7);
  x6 = x8 - (W3mW5)*x6;
  x7 = x8 - (W3pW5)*x7;
  
  /* second stage: even part plus first odd butterflies */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6*(x3+x2);
  x2 = x1 - (W2pW6)*x2;
  x3 = x1 + (W2mW6)*x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;
  
  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = S2*(x4+x5);
  x4 = S2*(x4-x5);
  
  /* fourth stage: final butterflies, written out unscaled */
  fblk[0] = (x7+x1);
  fblk[7] = (x7-x1);
  fblk[3] = (x8+x6);
  fblk[4] = (x8-x6);
  fblk[1] = (x3+x2);
  fblk[6] = (x3-x2);
  fblk[2] = (x0+x4);
  fblk[5] = (x0-x4);
}

static void idctcol( short *blk, FLOAT *fblk )
{
  double rc[8];
  FLOAT x0, x1, x2, x3, x4, x5, x6, x7, x8;

	x1 = fblk[8*4]*D8;
	x2 = fblk[8*6];
	x3 = fblk[8*2];
	x4 = fblk[8*1];
	x5 = fblk[8*7];
	x6 = fblk[8*5];
	x7 = fblk[8*3];

  x0 = fblk[8*0]*D8;

  /* first stage */
  x8 = W7_8*(x4+x5);
  x4 = (x8+(W1mW7_8)*x4);
  x5 = (x8-(W1pW7_8)*x5);
  x8 = W3_8*(x6+x7);
  x6 = (x8-(W3mW5_8)*x6);
  x7 = (x8-(W3pW5_8)*x7);
  
  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6_8*(x3+x2);
  x2 = (x1-(W2pW6_8)*x2);
  x3 = (x1+(W2mW6_8)*x3);
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;
  
  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = S2*(x4+x5);
  x4 = S2*(x4-x5);
  
  /* fourth stage */
  rc[0] = (x7+x1)+RC;
	blk[8*0] = iclip[*((int*)(&rc[0]))];
  rc[7] = (x7-x1)+RC;
	blk[8*7] = iclip[*((int*)(&rc[7]))];
  rc[3] = (x8+x6)+RC;
	blk[8*3] = iclip[*((int*)(&rc[3]))];
  rc[4] = (x8-x6)+RC;
	blk[8*4] = iclip[*((int*)(&rc[4]))];
  rc[1] = (x3+x2)+RC;
	blk[8*1] = iclip[*((int*)(&rc[1]))];
  rc[6] = (x3-x2)+RC;
	blk[8*6] = iclip[*((int*)(&rc[6]))];
  rc[2] = (x0+x4)+RC;
	blk[8*2] = iclip[*((int*)(&rc[2]))];
  rc[5] = (x0-x4)+RC;
	blk[8*5] = iclip[*((int*)(&rc[5]))];
}


/*
 * Fast_IDCT - two dimensional inverse discrete cosine transform (portable C).
 *
 * block: 8x8 coefficients stored row by row as 64 shorts; overwritten in
 *        place with the rounded, clipped result.
 *
 * Runs idctrow() over the 8 rows into a floating-point scratch block, then
 * idctcol() over the 8 columns back into block. Initialize_Fast_IDCT() must
 * have been called first, since the weights and clip table are zero before.
 *
 * Written in prototype form to match the declaration above; the old
 * K&R-style parameter list is no longer valid C as of C23.
 */
void Fast_IDCT(short *block)
{
  FLOAT fblock[8*8];
  int i;

  for (i=0; i<8; i++)
    idctrow(block+8*i,fblock+8*i);

  for (i=0; i<8; i++)
    idctcol(block+i,fblock+i);
}



#else // WIN32 - kick in x87 hand-tuned code by Miha Peternel

/*
int all = 0;
int turn = 0;
int diff = 0;
int empty = 0;
int allused = 0;
int used[65] = {0};
int trows[9] = {0};
int tcols[9] = {0};
*/

/*
 * Fast_IDCT (WIN32 build) - in-place 8x8 inverse DCT in x87 inline assembly.
 *
 * block: 64 shorts stored row by row; read as coefficients and overwritten
 *        with the clipped result.
 *
 * The assembly reads the weight statics (W7, W1mW7, ..., the *_8 variants,
 * D8, S2) and the iclip table, so Initialize_Fast_IDCT() must have run first.
 * A row pass writes 64 doubles into a stack scratch area; a column pass reads
 * them back, converts each result to an integer with fistp and maps it
 * through the clip table.
 */
void Fast_IDCT(short *block)
{
/*
  int row[8] = {0};
  int col[8] = {0};
	int dirty = 0;
	int x,y,nrows,ncols;
*/
	// View the 64 shorts as 32 ints so that two coefficients are tested at once
	// (assumes 32-bit int).
	int *b = (int *) block;
	// Early out: if block[0] and block[1] are zero and the last int is either 0
	// or 0x10000 (block[62]==0, block[63]==1 in x86 little-endian layout), and
	// every int in between is zero too, the block is cleared and returned as is.
	// NOTE(review): the block[63]==1 case is presumably the MPEG-2 mismatch
	// control bit on an otherwise empty block - confirm against the caller.
  if( b[0]==0 && (b[31]==0x10000 || b[31]==0) )
	{
	  if( b[ 1]|b[ 2]|b[ 3]|b[ 4]|b[ 5] )
		  goto normal;
	  if( b[ 6]|b[ 7]|b[ 8]|b[ 9]|b[10] )
		  goto normal;
	  if( b[11]|b[12]|b[13]|b[14]|b[15] )
		  goto normal;
	  if( b[16]|b[17]|b[18]|b[19]|b[20] )
		  goto normal;
	  if( b[21]|b[22]|b[23]|b[24]|b[25] )
		  goto normal;
	  if( b[26]|b[27]|b[28]|b[29]|b[30] )
		  goto normal;
		b[31]=0;
		////empty++;
		return;
	}
normal:
// Disabled statistics code (counts of non-zero rows/columns per block).
/*
	for( y = 0; y < 8; y++ )
	for( x = 0; x < 8; x++ )
	  if( block[y*8+x] )
		{
		  row[y] = 1;
			col[x] = 1;
			dirty++;
		}
	allused += dirty;
	used[dirty]++;
	nrows = row[0]+row[1]+row[2]+row[3]+row[4]+row[5]+row[6]+row[7];
	trows[nrows]++;
	ncols = col[0]+col[1]+col[2]+col[3]+col[4]+col[5]+col[6]+col[7];
	tcols[ncols]++;
	if( ncols < nrows )
	  turn += nrows-ncols;

  if( block[63] != 1 )
  {
	  diff++;
	}
	all++;
	if( (all % 10000) == 0 )
	{
	  all *= 1;
	}
	if( (all % 100000) == 0 )
	{
	  all *= 1;
	}
//*/
//  FLOAT tmp[8*8];
//	FLOAT tmp1, tmp2, tmp3;
//	int int0, int1, int2, int3, int4, int5, int6, int7;
// The locals listed above live in a hand-managed stack area addressed from ebx:
//   tmp         64 doubles starting at ebx (row-pass output)
//   tmp1..tmp3  three 8-byte spill slots directly below ebx
//   int0..int7  eight 4-byte slots below those, targets of fistp
#define tmp  ebx
#define tmp1 ebx-1*8
#define tmp2 ebx-2*8
#define tmp3 ebx-3*8
#define int0 ebx-3*8-1*4
#define int1 ebx-3*8-2*4
#define int2 ebx-3*8-3*4
#define int3 ebx-3*8-4*4
#define int4 ebx-3*8-5*4
#define int5 ebx-3*8-6*4
#define int6 ebx-3*8-7*4
#define int7 ebx-3*8-8*4
#define SIZE 8*8*8+3*8+8*4+16 // locals + 16-byte alignment area
	__asm
	{
		// Reserve SIZE bytes below esp; ebx is the base of tmp, rounded down to
		// a 16-byte boundary (the extra 16 bytes in SIZE cover that rounding).
	  lea ebx,[esp-8*8*8]
		sub esp,SIZE
		and ebx,-16 // force 16-byte alignment of locals

// Row pass: esi walks the 8 input rows (16 bytes each), edi the matching
// rows of tmp (64 bytes each), ecx counts the rows.
		mov esi,[block]
		lea edi,[tmp]
		mov ecx,8

		align 16
Lrows:
		// Shortcut test: OR coefficient 1 with the dwords holding coefficients
		// 2..7; a zero result means only the first coefficient can be non-zero.
    movsx eax,word ptr [esi+2]
		or    eax,         [esi+4]
		or    eax,         [esi+8]
		or    eax,         [esi+12]
		jnz L1

		// Flat row: store coefficient 0, converted to double, into all 8 outputs.
		fild word ptr [esi+0*2]
		fst  qword ptr [edi+7*8]
		fst  qword ptr [edi+6*8]
		fst  qword ptr [edi+5*8]
		fst  qword ptr [edi+4*8]
		fst  qword ptr [edi+3*8]
		fst  qword ptr [edi+2*8]
		fst  qword ptr [edi+1*8]
		fstp qword ptr [edi+0*8]
		jmp L2

		align 16
	L1:

		// Full row transform with the unscaled weights (W7, W1mW7, W1pW7, W3,
		// W3mW5, W3pW5, W6, W2mW6, W2pW6, S2); tmp1..tmp3 hold spilled
		// intermediates. The instruction order is hand-scheduled - do not reorder.
		fild word ptr [esi+7*2]
		fld st(0)
		fild word ptr [esi+1*2]
		fadd st(1),st(0)
		fld qword ptr [W7]
		fxch st(1)
		fmul qword ptr [W1mW7]
		fxch st(1)
		fmulp st(2),st(0)
		fadd st(0),st(1)
		fstp qword ptr [tmp1]
		fild word ptr [esi+3*2]
		fld st(0)
		fxch st(3)
		fmul qword ptr [W1pW7]
		fild word ptr [esi+5*2]
		fadd st(4),st(0)
		fmul qword ptr [W3mW5]
		fxch st(1)
		fsubp st(3),st(0)//fsubrp
		fld qword ptr [W3]
		fmulp st(4),st(0)
		fsubr st(0),st(3)
		fstp qword ptr [tmp2]
		fmul qword ptr [W3pW5]
		fsubp st(2),st(0)//fsubrp
		fxch st(1)
		fstp qword ptr [tmp3]
		fild word ptr [esi+0*2]
		fild word ptr [esi+4*2]
		fild word ptr [esi+2*2]
		fld st(0)
		fmul qword ptr [W2mW6]
		fld st(3)
		fild word ptr [esi+6*2]
		fxch st(5)
		fsub st(0),st(4)
		fxch st(3)
		fadd st(0),st(5)
		fxch st(1)
		faddp st(4),st(0)
		fld qword ptr [W6]
		fmulp st(1),st(0)
		fxch st(4)
		fmul qword ptr [W2pW6]
		fld qword ptr [tmp1]
		fsub qword ptr [tmp2]
		fld st(5)
		fxch st(3)
		faddp st(6),st(0)
		fld qword ptr [tmp1]
		fxch st(1)
		fstp qword ptr [tmp1]
		fld st(6)
		fadd qword ptr [tmp3]
		fxch st(1)
		fadd qword ptr [tmp2]
		fxch st(7)
		fsub qword ptr [tmp3]
		fxch st(1)
		fstp qword ptr [tmp2]
		fld st(4)
		fxch st(3)
		fsubrp st(2),st(0)//fsubp
		fxch st(4)
		fsub st(0),st(5)
		fxch st(2)
		faddp st(5),st(0)
		fld st(2)
		fsub st(0),st(1)
		fxch st(5)
		fstp qword ptr [tmp3]
		fld qword ptr [tmp1]
		fld qword ptr [S2]
		fxch st(4)
		faddp st(2),st(0)
		fld st(3)
		fxch st(1)
		fadd st(0),st(5)
		fmulp st(1),st(0)
	//
	// Disabled alternative ordering of the row output stage.
/*
		fld st(1)
		fxch st(5)
		fsubr qword ptr [tmp1]
		fxch st(5)
		fadd st(0),st(1)
		fxch st(4)
		fmulp st(5),st(0)
		fxch st(3)
		fstp qword ptr [edi+1*8]
		fld st(4)
		fxch st(1)
		fsubrp st(3),st(0)//fsubp
		fxch st(4)
		fsub st(0),st(3)
		fxch st(4)
		faddp st(3),st(0)
		fxch st(1)
		fstp qword ptr [edi+6*8]
		fxch st(2)
		fstp qword ptr [edi+5*8]
		fstp qword ptr [edi+2*8]
		fld qword ptr [tmp3]
		fadd st(0),st(2)
		fstp qword ptr [edi+0*8]
		fxch st(1)
		fsubr qword ptr [tmp3]
		fld st(1)
		fxch st(1)
		fstp qword ptr [edi+7*8]
		fadd qword ptr [tmp2]
		fstp qword ptr [edi+3*8]
		fsub qword ptr [tmp2]
		fstp qword ptr [edi+4*8]
*/
	//
		// Active row output stage: the 8 results are stored to [edi+k*8].
		fld qword ptr [tmp3]
		fadd st(0),st(7)
		fxch st(5)
		fsubr qword ptr [tmp1]
		fxch st(5)
		fstp qword ptr [edi+0*8]
		fxch st(6)
		fsubr qword ptr [tmp3]
		fld st(2)
		fxch st(1)
		fstp qword ptr [edi+7*8]
		fadd qword ptr [tmp2]
		fxch st(3)
		fmulp st(4),st(0)
		fxch st(2)
		fstp qword ptr [edi+3*8]
		fld st(1)
		fadd st(0),st(5)
		fxch st(1)
		fsub qword ptr [tmp2]
		fxch st(2)
		fsubrp st(5),st(0)//fsubp
		fstp qword ptr [edi+1*8]
		fld st(2)
		fxch st(1)
		fstp qword ptr [edi+4*8]
		fxch st(2)
		fsub st(0),st(1)
		fxch st(2)
		faddp st(1),st(0)
		fxch st(2)
		fstp qword ptr [edi+6*8]
		fstp qword ptr [edi+5*8]
		fstp qword ptr [edi+2*8]
	L2:
		// next input row (8 shorts) and next tmp row (8 doubles)
	  add esi,8*2
		add edi,8*8
		dec ecx
		jnz Lrows

// Column pass: esi walks the columns of tmp (8 bytes apart), edi the columns
// of block (2 bytes apart), edx points at the middle entry of iclip
// (byte offset 1024*2, i.e. element 1024), ecx counts the columns.
    lea esi,[tmp]
		mov edi,[block]
		lea edx,[iclip+1024*2]
		mov ecx,8

    align 16
Lcols:
// Disabled shortcut for columns whose rows 1..7 are all zero.
/*
    mov eax,[esi+1*8*8]
		or  eax,[esi+2*8*8]
		or  eax,[esi+3*8*8]
		or  eax,[esi+4*8*8]
		or  eax,[esi+5*8*8]
		or  eax,[esi+6*8*8]
		or  eax,[esi+7*8*8]
		jnz L3

		fld qword ptr [esi+0*8*8]
		fmul qword ptr [D8]
		fistp dword ptr [int0]
		mov eax,[int0]
		mov ax,word ptr [edx+2*eax]
		mov [edi+0*8*2],ax
		mov [edi+1*8*2],ax
		mov [edi+2*8*2],ax
		mov [edi+3*8*2],ax
		mov [edi+4*8*2],ax
		mov [edi+5*8*2],ax
		mov [edi+6*8*2],ax
		mov [edi+7*8*2],ax
		jmp L4

		align 16
	L3:
//*/
		// Full column transform with the weights divided by 8 (W7_8, ...) and D8
		// applied to inputs 0 and 4. Each result is converted to a 32-bit integer
		// with fistp into int0..int7, looked up in the clip table as
		// [edx+2*eax], and its low 16 bits stored to the block.
		// NOTE(review): eax is used as a table index without a range check; a
		// value outside [-1024, 1023] would read outside the 2048-entry iclip.
		fld qword ptr [esi+7*8*8]
		fld st(0)
		fld qword ptr [esi+1*8*8]
		fadd st(1),st(0)
		fld qword ptr [W7_8]
		fxch st(1)
		fmul qword ptr [W1mW7_8]
		fxch st(1)
		fmulp st(2),st(0)
		fadd st(0),st(1)
		fstp qword ptr [tmp2]
		fld qword ptr [esi+3*8*8]
		fld st(0)
		fxch st(3)
		fmul qword ptr [W1pW7_8]
		fld qword ptr [esi+5*8*8]
		fadd st(4),st(0)
		fmul qword ptr [W3mW5_8]
		fxch st(1)
		fsubp st(3),st(0)//fsubrp
		fld qword ptr [W3_8]
		fmulp st(4),st(0)
		fsubr st(0),st(3)
		fstp qword ptr [tmp3]
		fld qword ptr [D8]
		fld qword ptr [esi+0*8*8]
		fmul st(0),st(1)
		fxch st(2)
		fmul qword ptr [W3pW5_8]
		fld qword ptr [esi+4*8*8]
		fmulp st(2),st(0)
		fld qword ptr [esi+6*8*8]
		fld st(3)
		fxch st(6)
		fsubrp st(2),st(0)//fsubp
		fld qword ptr [esi+2*8*8]
		fld st(0)
		fxch st(5)
		fsub st(0),st(4)
		fxch st(7)
		faddp st(4),st(0)
		fxch st(4)
		fadd st(0),st(1)
		fld qword ptr [W6_8]
		fxch st(2)
		fmul qword ptr [W2pW6_8]
		fxch st(2)
		fmulp st(1),st(0)
		fxch st(4)
		fmul qword ptr [W2mW6_8]
		fld qword ptr [tmp2]
		fsub qword ptr [tmp3]
		fxch st(2)
		fsubr st(0),st(5)
		fxch st(1)
		faddp st(5),st(0)
		fld qword ptr [tmp2]
		fxch st(2)
		fstp qword ptr [tmp2]
		fld st(5)
		fxch st(2)
		fadd qword ptr [tmp3]
		fxch st(6)
		fsub st(0),st(3)
		fxch st(2)
		faddp st(3),st(0)
		fld st(3)
		fsub st(0),st(5)
		fxch st(3)
		fstp qword ptr [tmp3]
		fxch st(3)
		faddp st(4),st(0)
		fld st(5)
		fld qword ptr [tmp2]
		fxch st(7)
		fsub st(0),st(4)
		fxch st(7)
		fadd st(0),st(2)
		fxch st(1)
		faddp st(4),st(0)
		fld qword ptr [S2]
		fmul st(1),st(0)
		fxch st(1)
		fstp qword ptr [tmp1]
		fld st(4)
		fadd st(0),st(6)
		fxch st(2)
		fsubr qword ptr [tmp2]
		fxch st(5)
		fsubrp st(6),st(0)//fsubp
		fxch st(1)
		fistp dword ptr [int0]
		fxch st(4)
		mov eax,[int0]
		movsx eax,word ptr [edx+2*eax]
		mov [edi+0*8*2],ax
		fistp dword ptr [int7]
		mov eax,[int7]
		fld st(0)
		movsx eax,word ptr [edx+2*eax]
		mov [edi+7*8*2],ax
		fadd qword ptr [tmp3]
		fistp dword ptr [int3]
		mov eax,[int3]
		movsx eax,word ptr [edx+2*eax]
		mov [edi+3*8*2],ax
		fsub qword ptr [tmp3]
		fld st(1)
		fxch st(1)
		fistp dword ptr [int4]
		mov eax,[int4]
		movsx eax,word ptr [edx+2*eax]
		mov [edi+4*8*2],ax
		fadd qword ptr [tmp1]
		fxch st(3)
		fmulp st(2),st(0)
		fxch st(2)
		fistp dword ptr [int1]
		fxch st(1)
		mov eax,[int1]
		movsx eax,word ptr [edx+2*eax]
		mov [edi+1*8*2],ax
		fsub qword ptr [tmp1]
		fld st(2)
		fsub st(0),st(2)
		fxch st(1)
		fistp dword ptr [int6]
		fxch st(2)
		mov eax,[int6]
		faddp st(1),st(0)
		movsx eax,word ptr [edx+2*eax]
		mov [edi+6*8*2],ax
		fistp dword ptr [int2]
		mov eax,[int2]
		movsx eax,word ptr [edx+2*eax]
		mov [edi+2*8*2],ax
		fistp dword ptr [int5]
		mov eax,[int5]
		movsx eax,word ptr [edx+2*eax]
		mov [edi+5*8*2],ax
	L4:
		// next column: one double to the right in tmp, one short in block
	  add esi,8
		add edi,2
		dec ecx
		jnz Lcols

		// release the scratch area reserved at the top of the asm block
		add esp,SIZE
  }
}




#endif // WIN32

