/* 
 *  idctref_miha.c
 *
 *	Copyright (C) Alberto Vigata - December 2000  ultraflask@yahoo.com
 *                 
 *
 *  This file is part of FlasKMPEG, a free MPEG to MPEG/AVI converter
 *	
 *  FlasKMPEG is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *   
 *  FlasKMPEG is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *   
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */


/*************************************************************/
/*                                                           */
/* x87 hand-optimized assembly by Miha Peternel              */
/*                                    27.11. - 11.12.2000    */
/*                                                           */
/* You are free to use this code in your project if:         */
/* - no changes are made to this message                     */
/* - any changes to this code are publicly available         */
/* - your project documentation contains the following text: */
/*   "This software contains fast high-quality IDCT decoder  */
/*    by Miha Peternel."                                     */
/*                                                           */
/*************************************************************/

/*  Perform IEEE 1180 reference (64-bit floating point, separable 8x1
 *  direct matrix multiply) Inverse Discrete Cosine Transform
*/

/* Build switch.  ModelX is only ever tested with #ifdef / #ifndef in this
   file, so its value (123) is not significant.  When it is defined the file
   also declares the clipping table, the magic constants and the extra locals
   (rnd, int0..int7, fpold, fpnew) that the inline assembly in
   Reference_IDCT refers to. */
#define ModelX 123 // enable C-level optimizations by Miha Peternel

/* Here we use math.h to generate constants.  Compiler results may
   vary a little */

#include <math.h>

//#include "config.h"

/* Keep an already-defined PI; otherwise use M_PI when <math.h> provides it,
   and fall back to a literal when it does not. */
#ifndef PI
# ifdef M_PI
#  define PI M_PI
# else
#  define PI 3.14159265358979323846
# endif
#endif

/* global declarations */
/* Fills the cosine matrix c[][] (and, with ModelX, the clip table).
   The IDCT routines read c[][], so call this once before using them. */
void Initialize_Reference_IDCT (void);
/* In-place 8x8 inverse DCT of the 64 coefficients at block; the result is
   clipped to -256..255. */
void Reference_IDCT(short *block);

/* private data */
#ifdef ModelX
/* iclp is set to iclip+1024 by Initialize_Reference_IDCT() so that it can
   be indexed with -1024..1023; each entry holds its index clamped to
   -256..255. */
static short *iclp;
static short iclip[1024+1024]; /* clipping table */
#endif

/* cosine transform matrix for 8x1 IDCT */
/* c[freq][time] = scale(freq)*cos((PI/8)*freq*(time+0.5)), with
   scale(0) = sqrt(1/8) and scale(freq) = 1/2 otherwise; filled in by
   Initialize_Reference_IDCT(). */
static double c[8][8];

/*
 * Build the lookup tables used by the IDCT routines in this file.
 *
 *  - c[u][x] = k(u) * cos((PI/8) * u * (x + 0.5)), where k(0) = sqrt(1/8)
 *    and k(u) = 1/2 for u > 0.
 *  - With ModelX: iclp is pointed at the centre of iclip, and iclp[v] is
 *    set to v clamped to -256..255 for every v in -1024..1023.
 *
 * Takes no arguments and returns nothing; it only writes the file-scope
 * tables.
 */
void Initialize_Reference_IDCT()
{
  int u, x;

  for (u = 0; u < 8; ++u)
  {
    double gain;

    /* the DC row uses a different normalisation factor */
    if (u == 0)
      gain = sqrt(0.125);
    else
      gain = 0.5;

    x = 0;
    while (x < 8)
    {
      c[u][x] = gain*cos((PI/8.0)*u*(x + 0.5));
      x++;
    }
  }

#ifdef ModelX
  {
    int v;

    /* centre the pointer so negative indices are valid */
    iclp = iclip+1024;

    for (v = -1024; v < 1024; ++v)
    {
      if (v < -256)
        iclp[v] = -256;
      else if (v > 255)
        iclp[v] = 255;
      else
        iclp[v] = v;
    }
  }
#endif
}

/* perform IDCT matrix multiply for 8x8 coefficient block */

#ifdef ModelX
/* Constants for the ModelX code paths.  The inline assembly in
   Reference_IDCT loads some of these directly from memory with fixed
   operand sizes, so their types must not be changed casually. */

/* Rounding offset: loaded with "fld qword ptr [HALF]" and added to every
   second-pass result before the values are floored. */
const static double HALF = 0.5;
/* RC, RBt and RCt are not referenced by any live code in this file. */
const static long double RC = 1.0*1024*1024*1024*1024*256*16/2 + 65536.0*65536/2 + 1024.25; // magic + clip center
const static long double RBt = -0.25+1.0/65536/65536; // magic 1
const static long double RCt = +0.5+1.0*1024*1024*1024*1024*256*16/2 + 65536.0*65536/2 + 1024.0; // magic 2 + clip center
/* 64-bit patterns made of four 16-bit lanes (__int64 is a compiler-specific
   type).  CLIP_LO, CLIP_HI, CLIP_TOP and CLIP_BOT are not referenced by any
   live code in this file. */
const static unsigned __int64 CLIP_LO  = 0xFF00FF00FF00FF00;
const static unsigned __int64 CLIP_HI  = 0x00FF00FF00FF00FF;
const static unsigned __int64 CLIP_TOP = 0x7F007F007F007F00;
const static unsigned __int64 CLIP_BOT = 0xFE00FE00FE00FE00;
/* Used by the MMX clip loop of the Full_C build: saturating add/subtract of
   0x7F00 per lane limits each 16-bit value to -256..255. */
const static unsigned __int64 CLIP_MID = 0x7F007F007F007F00;
#endif

/*
 * original_Reference_IDCT - plain C, double-precision 8x8 inverse DCT,
 * computed in place.
 *
 * block: 64 coefficients in row-major order.  On return each element holds
 *        floor(result + 0.5).  No clipping is applied by this routine.
 *
 * Initialize_Reference_IDCT() must have been called beforehand so that the
 * cosine matrix c[][] is populated.
 *
 * Fast path: a block whose only non-zero coefficient is block[63] == 1 is
 * turned into an all-zero block without running the transform.
 */
void original_Reference_IDCT(short *block)
{
  int i, j, k;
  double tmp[64];

  /* The coefficients are inspected one by one (not through an int* alias),
     so the test does not depend on byte order, int width or the alignment
     of block. */
  if (block[63] == 1)
  {
    for (i = 0; i < 63; i++)
      if (block[i] != 0)
        break;

    if (i == 63)
    {
      block[63] = 0;
      return;
    }
  }

  /* Pass 1: tmp[8*i+j] = sum over k of c[k][j]*block[8*i+k].
     The terms are accumulated in ascending k, starting from the k = 0
     product, to keep the floating-point summation order fixed. */
  for (i = 0; i < 8; i++)
  {
    const short *row = block + 8*i;

    for (j = 0; j < 8; j++)
    {
      double acc = c[0][j]*row[0];

      for (k = 1; k < 8; k++)
        acc += c[k][j]*row[k];

      tmp[8*i+j] = acc;
    }
  }

  /* Pass 2: same 1-D transform along the other axis.  The transpose is
     folded into the addressing by swapping the loop order of i and j. */
  for (j = 0; j < 8; j++)
    for (i = 0; i < 8; i++)
    {
      double acc = c[0][i]*tmp[8*0+j];

      for (k = 1; k < 8; k++)
        acc += c[k][i]*tmp[8*k+j];

      /* round to nearest by flooring value + 0.5 */
      block[8*i+j] = (short) floor(acc+0.5);
    }
}


/*
 * Reference_IDCT - in-place 8x8 inverse DCT of block, with the result
 * clipped to -256..255.
 *
 * block: 64 coefficients in row-major order; overwritten with the output.
 *
 * Initialize_Reference_IDCT() must have been called beforehand so that the
 * cosine matrix c[][] is populated.
 *
 * Two implementations are selected at compile time:
 *   Full_C defined   - the two matrix passes are written in C; flooring is
 *                      done with x87 and clipping with MMX inline assembly.
 *   Full_C undefined - everything after the fast-path test is x87 inline
 *                      assembly; clipping uses CMOV.
 * Both rely on MSVC-style 32-bit __asm blocks and on the locals that are
 * declared only when ModelX is defined.
 */
void Reference_IDCT(short *block)
{
  int i, j, k, v;
  double tmp[64];   /* first-pass results */
#ifdef ModelX
	double rnd[64];   /* second-pass results + 0.5, before flooring */
	int int0, int1, int2, int3, int4, int5, int6, int7; /* fistp scratch */
	unsigned short fpold; /* x87 control word on entry */
	unsigned short fpnew; /* control word with rounding set to "down" */
#endif

	/* Fast path: the block is all zero except block[63] == 1.  The block is
	   probed two coefficients at a time through an int alias; on the
	   little-endian 32-bit target of the assembly below, b[31]==0x10000
	   means block[62]==0 and block[63]==1. */
	int *b = (int *) block;
  if( b[31]==0x10000 && b[0]==0 )
	{
	  if( b[ 1]|b[ 2]|b[ 3]|b[ 4]|b[ 5] )
		  goto normal;
	  if( b[ 6]|b[ 7]|b[ 8]|b[ 9]|b[10] )
		  goto normal;
	  if( b[11]|b[12]|b[13]|b[14]|b[15] )
		  goto normal;
	  if( b[16]|b[17]|b[18]|b[19]|b[20] )
		  goto normal;
	  if( b[21]|b[22]|b[23]|b[24]|b[25] )
		  goto normal;
	  if( b[26]|b[27]|b[28]|b[29]|b[30] )
		  goto normal;
		/* Clear the one remaining coefficient so the block comes back all
		   zero.  (Index 63, not 31: block[31] is already known to be 0.) */
		block[63]=0;
		return;
	}
normal:

#ifdef Full_C
  /* NOTE(review): the !ModelX sub-branches below do not look buildable as
     written: partial_product is used in the first loop without a
     declaration, and the assembly after the loops uses rnd/fpold/fpnew,
     which are declared only under ModelX. */
  for (i=0; i<8; i++)
    for (j=0; j<8; j++)
    {
#ifndef ModelX
      partial_product = 0.0;

      for (k=0; k<8; k++)
        partial_product+= c[k][j]*block[8*i+k];

      tmp[8*i+j] = partial_product;
#else // ModelX
      tmp[8*i+j] = c[0][j]*block[8*i+0]+
			             c[1][j]*block[8*i+1]+
			             c[2][j]*block[8*i+2]+
			             c[3][j]*block[8*i+3]+
			             c[4][j]*block[8*i+4]+
			             c[5][j]*block[8*i+5]+
			             c[6][j]*block[8*i+6]+
			             c[7][j]*block[8*i+7];
#endif
    }

  /* Transpose operation is integrated into address mapping by switching 
     loop order of i and j */

  for (j=0; j<8; j++)
    for (i=0; i<8; i++)
    {
#ifndef ModelX
      double partial_product;
      partial_product = 0.0;

      for (k=0; k<8; k++)
        partial_product+= c[k][i]*tmp[8*k+j];

      v = (int) floor(partial_product+0.5);
      block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
#else // ModelX
      double partial_product;

			partial_product = c[0][i]*tmp[8*0+j]+
			                  c[1][i]*tmp[8*1+j]+
			                  c[2][i]*tmp[8*2+j]+
			                  c[3][i]*tmp[8*3+j]+
			                  c[4][i]*tmp[8*4+j]+
			                  c[5][i]*tmp[8*5+j]+
			                  c[6][i]*tmp[8*6+j]+
			                  c[7][i]*tmp[8*7+j];

      /* flooring and clipping are done by the assembly below */
      rnd[8*i+j] = partial_product+0.5;
#endif
    }

	__asm
	{
	  // set x87 to floor mode
		fstcw [fpold]
		mov ax, [fpold]
		and ax, 0xF3FF // clear the rounding-control field (bits 10-11) first
		or ax, 0x0400 // round down - floor
		mov [fpnew], ax
		fldcw [fpnew]
		// floor rnd[] into block[], one row of 8 per iteration
		lea esi, rnd
		mov edi, [block]
		mov ecx, 8
		align 16
	__floor:
		  fld   qword ptr [esi+0*8]
			fistp  word ptr [edi+0*2]
		  fld   qword ptr [esi+1*8]
			fistp  word ptr [edi+1*2]
		  fld   qword ptr [esi+2*8]
			fistp  word ptr [edi+2*2]
		  fld   qword ptr [esi+3*8]
			fistp  word ptr [edi+3*2]
		  fld   qword ptr [esi+4*8]
			fistp  word ptr [edi+4*2]
		  fld   qword ptr [esi+5*8]
			fistp  word ptr [edi+5*2]
		  fld   qword ptr [esi+6*8]
			fistp  word ptr [edi+6*2]
		  fld   qword ptr [esi+7*8]
			add esi, 8*8
			fistp  word ptr [edi+7*2]
			add edi, 8*2
		dec ecx
		jnz __floor
		// set x87 to default mode
		fldcw [fpold]
		// now clip to -256..255 using MMX: saturating add/sub of 0x7F00
		// per 16-bit lane, 8 coefficients per iteration
		mov esi, [block]
		movq mm7,[CLIP_MID];
		mov ecx,8*8*2/8/2
	__clip:
		  movq mm0,[esi+0*8]
		  movq mm1,[esi+1*8]
			paddsw mm0,mm7 // max
			paddsw mm1,mm7 // max
			psubsw mm0,mm7 // back
			psubsw mm1,mm7 // back
			psubsw mm0,mm7 // min
			psubsw mm1,mm7 // min
			paddsw mm0,mm7 // back
			paddsw mm1,mm7 // back
			movq [esi+0*8],mm0
			movq [esi+1*8],mm1
			add esi,2*8
		dec ecx
		jnz __clip
		emms
	};

#else // FullC

	__asm
	{
		// Pass 1: tmp[8*i+j] = sum over k of block[8*i+k]*c[k][j]
		//   esi -> current row of block, eax -> &c[0][j], edi -> tmp[8*i+j]
		mov esi,[block]
		lea eax,[c]
		lea edi,[tmp]
		mov ebx,8
	__col1:
			mov ecx,8
		__row1:
				fild  word ptr [esi+0*2]
				fmul qword ptr [eax+0*8*8]
				fild  word ptr [esi+1*2]
				fmul qword ptr [eax+1*8*8]
				fadd
				fild  word ptr [esi+2*2]
				fmul qword ptr [eax+2*8*8]
				fadd
				fild  word ptr [esi+3*2]
				fmul qword ptr [eax+3*8*8]
				fadd
				fild  word ptr [esi+4*2]
				fmul qword ptr [eax+4*8*8]
				fadd
				fild  word ptr [esi+5*2]
				fmul qword ptr [eax+5*8*8]
				fadd
				fild  word ptr [esi+6*2]
				fmul qword ptr [eax+6*8*8]
				fadd
				fild  word ptr [esi+7*2]
				fmul qword ptr [eax+7*8*8]
				fadd
				add eax,8
				fstp qword ptr [edi]
				add edi,8
			dec ecx
			jnz __row1
			add eax,-8*8 // back to column 0 of c
			add esi,+8*2 // next row of block
		dec ebx
		jnz __col1

		// Pass 2: rnd[8*i+j] = 0.5 + sum over k of tmp[8*k+j]*c[k][i]
		//   esi -> tmp[j], eax -> &c[0][i], edi -> rnd[8*i+j]
		//   0.5 stays on the x87 stack (as st(1) after each sum) for the
		//   whole pass
		lea esi,[tmp]
		lea eax,[c]
		lea edi,[rnd]
		fld qword ptr [HALF]
		mov ebx,8
	__row2:
			mov ecx,8
			align 16
			__col2:
				fld  qword ptr [esi+0*8*8]
				fmul qword ptr [eax+0*8*8]
				fld  qword ptr [esi+1*8*8]
				fmul qword ptr [eax+1*8*8]
				fadd
				fld  qword ptr [esi+2*8*8]
				fmul qword ptr [eax+2*8*8]
				fadd
				fld  qword ptr [esi+3*8*8]
				fmul qword ptr [eax+3*8*8]
				fadd
				fld  qword ptr [esi+4*8*8]
				fmul qword ptr [eax+4*8*8]
				fadd
				fld  qword ptr [esi+5*8*8]
				fmul qword ptr [eax+5*8*8]
				fadd
				fld  qword ptr [esi+6*8*8]
				fmul qword ptr [eax+6*8*8]
				fadd
				fld  qword ptr [esi+7*8*8]
				fmul qword ptr [eax+7*8*8]
				fadd
				fadd st(0),st(1) // + 0.5
				add eax,8
				fstp qword ptr [edi]
				add edi,8*8 // next output row, same column
			dec ecx
			jnz __col2
			add eax,-8*8    // back to column 0 of c
			add esi,+8      // next column of tmp
			add edi,8-8*8*8 // top of the next output column
		dec ebx
		jnz __row2
		ffree st(0) // bye bye 0.5

	  // set x87 to floor mode
		fstcw [fpold]
		movzx eax, [fpold]
		and eax, 0xF3FF // clear the rounding-control field (bits 10-11) first
		or eax, 0x0400 // round down - floor
		mov [fpnew], ax
		fldcw [fpnew]

		// Floor rnd[] and clip to -256..255 into block[], one row of 8 per
		// iteration.  The store of each clipped value is delayed until
		// after the next fld/fistp pair has been issued.
		lea esi, [rnd]
		mov edi, [block]
		mov ebx, -256 // clip min
		mov edx, +255 // clip max
		mov ecx, 8
		align 16
	__floor:
		  fld   qword ptr [esi+0*8]
			fistp dword ptr [int0]
			  mov eax,[int0]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
		  fld   qword ptr [esi+1*8]
			fistp dword ptr [int1]
				mov word ptr [edi+0*2],ax
			  mov eax,[int1]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
		  fld   qword ptr [esi+2*8]
			fistp dword ptr [int2]
				mov word ptr [edi+1*2],ax
			  mov eax,[int2]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
		  fld   qword ptr [esi+3*8]
			fistp dword ptr [int3]
				mov word ptr [edi+2*2],ax
			  mov eax,[int3]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
		  fld   qword ptr [esi+4*8]
			fistp dword ptr [int4]
				mov word ptr [edi+3*2],ax
			  mov eax,[int4]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
		  fld   qword ptr [esi+5*8]
			fistp dword ptr [int5]
				mov word ptr [edi+4*2],ax
			  mov eax,[int5]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
		  fld   qword ptr [esi+6*8]
			fistp dword ptr [int6]
				mov word ptr [edi+5*2],ax
			  mov eax,[int6]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
		  fld   qword ptr [esi+7*8]
			fistp dword ptr [int7]
				mov word ptr [edi+6*2],ax
			  mov eax,[int7]
				cmp   eax,ebx
				cmovl eax,ebx
				cmp   eax,edx
				cmovg eax,edx
				mov word ptr [edi+7*2],ax

			add esi, 8*8
			add edi, 8*2
		dec ecx
		jnz __floor

		// set x87 to default mode
		fldcw [fpold]
	};

#endif // FullC

}
