/* Fast_X87_IDCT.cpp, Inverse Discrete Cosine Transform, double precision           */

/* Copyright (C) 2000, Michael Stembera (m_stembera@yahoo.com). All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  I disclaim
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * I do not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

/* This is a faster reimplementation of the IEEE 1180 reference (64-bit floating point, separable 8x1
 * direct matrix multiply) Inverse Discrete Cosine Transform as originally implemented by
 * the MPEG Software Simulation Group.  I believe the results to be identical to the original
 * implementation except for cases where constructs like
 * y = x1; y += x2; y += x3; differ from
 * y = x1 + x2 + x3;
*/


#include <math.h>

#ifndef PI
# ifdef M_PI
#  define PI M_PI
# else
#  define PI 3.14159265358979323846
# endif
#endif

/* Floating-point type used for all IDCT arithmetic. */
typedef double Real;

/*
 * Cosine transform matrix for the 8x1 IDCT.  Has static storage, so it is
 * all zeros until Initialize_Fast_X87_IDCT() fills it in.
 */
static Real ct[8][8];

/*
 * Sparse 4-element dot products.
 *
 * Kernel mABCD(a, b) sums a[k]*b[k] over exactly those k in 0..3 whose digit
 * in the name is 1 (A selects k = 0, B selects k = 1, C selects k = 2 and
 * D selects k = 3).  m0000 therefore yields zero and m1111 is the full
 * 4-term product.
 *
 * Terms are always added in ascending index order.  Keep it that way:
 * reordering floating-point additions can change the rounded result.
 */
static inline Real m0000(const Real *const a, const Real *const b)
{
	/* Nothing selected; the arguments exist only to match SmartDot. */
	(void)a;
	(void)b;
	return 0.0;
}

static inline Real m0001(const Real *const a, const Real *const b)
{
	return a[3]*b[3];
}

static inline Real m0010(const Real *const a, const Real *const b)
{
	return a[2]*b[2];
}

static inline Real m0011(const Real *const a, const Real *const b)
{
	return a[2]*b[2] + a[3]*b[3];
}

static inline Real m0100(const Real *const a, const Real *const b)
{
	return a[1]*b[1];
}

static inline Real m0101(const Real *const a, const Real *const b)
{
	return a[1]*b[1] + a[3]*b[3];
}

static inline Real m0110(const Real *const a, const Real *const b)
{
	return a[1]*b[1] + a[2]*b[2];
}

static inline Real m0111(const Real *const a, const Real *const b)
{
	return a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}

static inline Real m1000(const Real *const a, const Real *const b)
{
	return a[0]*b[0];
}

static inline Real m1001(const Real *const a, const Real *const b)
{
	return a[0]*b[0] + a[3]*b[3];
}

static inline Real m1010(const Real *const a, const Real *const b)
{
	return a[0]*b[0] + a[2]*b[2];
}

static inline Real m1011(const Real *const a, const Real *const b)
{
	return a[0]*b[0] + a[2]*b[2] + a[3]*b[3];
}

static inline Real m1100(const Real *const a, const Real *const b)
{
	return a[0]*b[0] + a[1]*b[1];
}

static inline Real m1101(const Real *const a, const Real *const b)
{
	return a[0]*b[0] + a[1]*b[1] + a[3]*b[3];
}

static inline Real m1110(const Real *const a, const Real *const b)
{
	return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
}

static inline Real m1111(const Real *const a, const Real *const b)
{
	return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
}

/* Common signature of the sparse dot-product kernels above. */
typedef Real SmartDot(const Real *const lhs, const Real *const rhs);

/*
 * Kernel lookup: fm_tbl[u0][u1][u2][u3] is the kernel that includes term k
 * exactly when uk is 1, i.e. the entry at [A][B][C][D] is mABCD.
 */
SmartDot * const fm_tbl[2][2][2][2] = {
	{
		{ { m0000, m0001 }, { m0010, m0011 } },
		{ { m0100, m0101 }, { m0110, m0111 } }
	},
	{
		{ { m1000, m1001 }, { m1010, m1011 } },
		{ { m1100, m1101 }, { m1110, m1111 } }
	}
};


/*
 * Round r down to the nearest integer (floor) and clamp the result to the
 * range [-256, 255].
 *
 * The range test is done on the floating-point value *before* narrowing to
 * short: converting a floating-point value whose truncated value does not fit
 * in the destination integer type is undefined behaviour, so the cast must
 * only ever see values already known to be in range.
 *
 * A NaN argument returns -256 (an arbitrary but well-defined choice).
 */
inline short f_to_floor256( const Real& r )
{
	// Written as a negated comparison so that NaN also takes this branch
	// instead of reaching the cast below.
	if( !(r > -256.0) )
	{
		return -256;
	}

	if( r >= 255.0 )
	{
		return 255;
	}

	// Here -256 < r < 255, so the truncated value is in [-255, 254].
	const short i = (short)r;

	// The cast truncates toward zero; for negative non-integers that is one
	// above the floor, so step down by one.
	return ( i > r ) ? (short)(i - 1) : i;
}

/* two dimensional inverse discrete cosine transform */
/*
 * Transforms, in place, the 8x8 block of 64 shorts stored row-major at
 * `block`.  Every output sample is rounded with f_to_floor256(x + 0.5), and
 * is therefore limited to [-256, 255].
 *
 * The cosine table ct[][] is read here; it is filled in by
 * Initialize_Fast_X87_IDCT(), which must have been called beforehand.
 *
 * Zero coefficients are skipped: for each half-row (and, in the second pass,
 * each half-column) a kernel from fm_tbl is picked that multiplies only the
 * terms that can be non-zero.
 */
extern "C" void Fast_X87_IDCT( short *block )
{
	int i;

	// check if it's only DC (this happens often)
	//
	// The scan and the fill go through the short pointer itself.  The block
	// is only known to be an array of 64 shorts, so viewing it through a
	// wider integer type would depend on that type's size and alignment
	// (which differ between platforms) and would break the aliasing rules.
	int is_dc = 1;
	for( i = 1; i < 64; i++ )
	{
		if( block[i] != 0 )
		{
			is_dc = 0;
			break;
		}
	}

	if( is_dc )
	{
		// With only a DC term every output sample equals DC/8, rounded.
		const short dc = f_to_floor256( (Real)block[0] * 0.125f + .5f );

		for( i = 0; i < 64; i++ )
		{
			block[i] = dc;
		}

		return;
	}

	// got some AC components so do the matrix multiplies

	Real mtx[64];                        // first-pass result, stored transposed

	Real bpr0[8];                        // current input row converted to Real
	const Real * const bpr4 = &bpr0[4];  // second half of that row
	int used_flags[8];                   // 1 if input row i had any non-zero coefficient

	// First pass: 8x1 IDCT of every input row.  Row i is written to column i
	// of mtx (mp = mtx + i, stride 8).
	for( i = 0; i < 8; i++)
	{
		const short * const bp = block + 8 * i;
		Real * const mp = mtx + i;

		// pick kernels that touch only the non-zero coefficients of each half-row
		SmartDot * const mul_0 = fm_tbl[bp[0]!=0][bp[1]!=0][bp[2]!=0][bp[3]!=0];
		SmartDot * const mul_4 = fm_tbl[bp[4]!=0][bp[5]!=0][bp[6]!=0][bp[7]!=0];

		if( mul_0 != m0000 || mul_4 != m0000 )
		{
			used_flags[i] = 1;

			// convert to Real only once
			bpr0[0] = (Real)bp[0];
			bpr0[1] = (Real)bp[1];
			bpr0[2] = (Real)bp[2];
			bpr0[3] = (Real)bp[3];
			bpr0[4] = (Real)bp[4];
			bpr0[5] = (Real)bp[5];
			bpr0[6] = (Real)bp[6];
			bpr0[7] = (Real)bp[7];

			mp[ 0] = mul_0(bpr0, &ct[0][0]) + mul_4(bpr4, &ct[0][4]);
			mp[ 8] = mul_0(bpr0, &ct[1][0]) + mul_4(bpr4, &ct[1][4]);
			mp[16] = mul_0(bpr0, &ct[2][0]) + mul_4(bpr4, &ct[2][4]);
			mp[24] = mul_0(bpr0, &ct[3][0]) + mul_4(bpr4, &ct[3][4]);
			mp[32] = mul_0(bpr0, &ct[4][0]) + mul_4(bpr4, &ct[4][4]);
			mp[40] = mul_0(bpr0, &ct[5][0]) + mul_4(bpr4, &ct[5][4]);
			mp[48] = mul_0(bpr0, &ct[6][0]) + mul_4(bpr4, &ct[6][4]);
			mp[56] = mul_0(bpr0, &ct[7][0]) + mul_4(bpr4, &ct[7][4]);
		}
		else
		{
			// an all-zero row transforms to an all-zero column of mtx
			used_flags[i] = 0;

			mp[ 0] = mp[ 8] = mp[16] = mp[24] =
			mp[32] = mp[40] = mp[48] = mp[56] = 0.0f;
		}
	}


	// Second pass: entry j of every 8-element group in mtx came from input
	// row j, so groups are zero exactly where used_flags[j] is 0 and the same
	// pair of kernels can be used for all 64 outputs.
	SmartDot * const mul_0 = fm_tbl[used_flags[0]][used_flags[1]][used_flags[2]][used_flags[3]];
	SmartDot * const mul_4 = fm_tbl[used_flags[4]][used_flags[5]][used_flags[6]][used_flags[7]];

	for( i = 0; i < 8; i++)
	{
		short * const bp = block + 8 * i;
		const Real * const ctp  = ct[i];
		const Real * const ctp4 = ctp + 4;

		// + .5f followed by floor rounds to nearest
		bp[0] = f_to_floor256( mul_0( ctp, &mtx[ 0] ) + mul_4( ctp4, &mtx[ 4] ) + .5f );
		bp[1] = f_to_floor256( mul_0( ctp, &mtx[ 8] ) + mul_4( ctp4, &mtx[12] ) + .5f );
		bp[2] = f_to_floor256( mul_0( ctp, &mtx[16] ) + mul_4( ctp4, &mtx[20] ) + .5f );
		bp[3] = f_to_floor256( mul_0( ctp, &mtx[24] ) + mul_4( ctp4, &mtx[28] ) + .5f );
		bp[4] = f_to_floor256( mul_0( ctp, &mtx[32] ) + mul_4( ctp4, &mtx[36] ) + .5f );
		bp[5] = f_to_floor256( mul_0( ctp, &mtx[40] ) + mul_4( ctp4, &mtx[44] ) + .5f );
		bp[6] = f_to_floor256( mul_0( ctp, &mtx[48] ) + mul_4( ctp4, &mtx[52] ) + .5f );
		bp[7] = f_to_floor256( mul_0( ctp, &mtx[56] ) + mul_4( ctp4, &mtx[60] ) + .5f );
	}
}

extern "C" void Initialize_Fast_X87_IDCT( void )
{
  int freq, time;
  Real scale;

  for (freq=0; freq < 8; freq++)
  {
    scale = (freq == 0) ? (Real)sqrt(0.125) : (Real)0.5;
#ifdef __ICL
#pragma novector
#endif
    for (time=0; time<8; time++)
	{
	  ct[time][freq] = (Real)(scale*cos((PI/8.0)*freq*(time + 0.5)));
	}
  }
}
