/********************************************************************/
/*THIS SAMPLE CODE IS PROVIDED "AS IS" WITH NO WARRANTIES			*/
/*WHATSOEVER, INCLUDING ANY WARRANTY OF MERCHANTABILITY, 			*/
/*NONINFRINGEMENT, FITNESS FOR ANY PARTICULAR PURPOSE, OR ANY 		*/
/*WARRANTY OTHERWISE ARISING OUT OF THIS SAMPLE.					*/
/********************************************************************/
/*            Copyright (c) 1999 - 2000 Intel Corporation.			*/
/*                     All rights reserved.							*/
/********************************************************************/

#ifndef __DOT_PRODUCT_H
#define __DOT_PRODUCT_H

//#include "../KernelTemplate/kernel.h"
// Number of times TheKernel() invokes TheCode() per timed run.
static const long CALL_COUNT = 10000;

typedef short int16;
// Number of samples in one 8x8 block.
#define DEFAULT_8X8_BLOCK 64
// number of iterations (blocks generated, transformed and validated)
#define N 1000

#define pi		3.14159265358979323846

// Fixed-point scale applied to the Chen coefficient table: 1 << 15.
#define ONE	((int) 1)
#define LG2_DCT_SCALE 15	
#define DCT_SCALE (ONE << LG2_DCT_SCALE)

#define BITS_INV_ACC	4                                // 4 or 5 for IEEE
// The shift amounts are parenthesized so they expand as a single value next
// to any neighbouring operator (e.g. 2 * SHIFT_INV_ROW).
#define SHIFT_INV_ROW	(16 - BITS_INV_ACC)
#define SHIFT_INV_COL	(1 + BITS_INV_ACC)
const short RND_INV_ROW   = 1024 * (6 - BITS_INV_ACC);        //1 << (SHIFT_INV_ROW-1)
const short RND_INV_COL   = 16 * (BITS_INV_ACC - 3);          // 1 << (SHIFT_INV_COL-1)
const short RND_INV_CORR  = RND_INV_COL - 1;                  // correction -1.0 and round


// Display names passed to SetName() by the kernel classes.
#define M128IVEC_IDCT	"M128_Ivec_IDCT"
#define M128ASM_IDCT	"M128_asm_IDCT"
#define M64ASM_IDCT		"M64_ASM_IDCT"
#define C_IDCT			"C_IDCT"

// Command-line option descriptors; only referenced from the commented-out
// SetParam() in this header. MAX_PRIORITY is defined outside this file.
#define PRIORITY_IN_VEC_COUNT   MAX_PRIORITY
#define SZ_ABBREV_IN_VEC_COUNT  "vec"
#define SZ_BRIEF_IN_VEC_COUNT   "[/vec n]"
#define SZ_VERBOSE_IN_VEC_COUNT "vec        set the number of vectors to perform dot product on\n" \
                                "           n - number of vectors\n"
/*
 * Accumulators for the accuracy statistics gathered while validating an
 * IDCT kernel.  The layout is declared once; the translation unit that
 * defines GBV owns the single `standard` object, every other includer
 * only sees an extern declaration of it.
 */
struct ieee {
	double ppe;			/* pixel peak error */				/* any pixel */
	double pmse[64];	/* pixel mean square error */		/* any pixel */
	double omse;		/* overall mean square error */		/* overall */
	double pme[64];		/* pixel mean error */				/* any pixel */
	double ome;			/* overall mean error */			/* overall */
};

#ifdef GBV
struct ieee standard;
#else
extern struct ieee standard;
#endif

/****************************************************************************/
/*
 * Common base class of the 8x8 IDCT benchmark kernels.
 *
 * Initialize() builds N blocks of test data with a double-precision
 * reference path: random samples -> forward DCT -> clip to 12 bits
 * (dctcoeffshort) -> inverse DCT -> clip (ref).  That data is held in static
 * buffers shared by every kernel object.  Each object additionally owns a
 * `tst` buffer of the same size; Validate() compares `tst` against `ref`,
 * accumulates the error statistics in the global `standard` structure and
 * checks them against fixed limits.  Report() prints the statistics.
 */
class idct_kernel : public Kernel
{

protected:

	double dctcoeffdouble[64*8];	// scratch: forward DCT result of the current block
	double idctcoeffdouble[64*8];	// scratch: inverse DCT result of the current block
	short  idctcoeffshort[64*2];
	short  ins[64*2];				// scratch: random input samples of the current block
	static short* dctcoeffshort;	// shared: N blocks of clipped DCT coefficients
	static short* ref;				// shared: N blocks of reference IDCT output
	short *tst;						// per object: N blocks compared against ref in Validate()

	int ChenPS[64];					// products of the ChenIni() constants scaled by DCT_SCALE
	float ChenPSF[64];				// same products, unscaled
	unsigned int block;

	// Start addresses of the three buffers, captured by Initialize();
	// Validate() rewinds ref/tst/dctcoeffshort to them.
	short *p_init_ref;
	short *p_init_tst;
	short *p_init_dct;

	int error_idx, error_iter;		// position of the peak error found by Validate()

private:
	static bool alreadyInitialized;	// true while the shared buffers hold valid data

	// Pseudo random integer in the range -L .. H.
	// The generator state is unsigned so the multiply/add wraps with defined
	// behaviour; the bits kept by the mask are the same ones the signed
	// 32-bit formulation produced.
	long rand(long L, long H)		/* L and H must be 32 bits */
	{
		static unsigned long randx = 1;
		static double z = (double) 0x7fffffff;

		long i, j;
		double x;					/* double is 64 bits */

		randx = (randx * 1103515245UL) + 12345UL;
		i = (long)(randx & 0x7ffffffeUL);	/* keep 30 bits */
		x = ((double)i)/z;			/* range 0 to 0.99999 ... */
		x *= (L+H+1);				/* range 0 to <L+H+1 */
		j = (int) x;				/* truncate to integer */
		return(j-L);				/* range -L to H */
	}   // end of rand


	// Fill one 64-sample block at op with random values in -L .. H.
	void random(short *op, long L, long H)
	{
		for (int k = 0; k < 64; k++)
			op[k] = (short) rand(L, H);
	}


	// Build the 8x8 tables ChenPS (scaled by DCT_SCALE, rounded) and ChenPSF
	// (unscaled) from the pairwise products of eight trigonometric constants.
	void ChenIni()
	{
		float ChenPS0[8];
		int i,j;
		ChenPS0[0]=cos(pi*1.0/4.0);
		ChenPS0[1]=sin(pi*7.0/16.0);
		ChenPS0[2]=cos(pi*1.0/8.0);
		ChenPS0[3]=sin(pi*5.0/16.0);
		ChenPS0[4]=cos(pi*1.0/4.0);
		ChenPS0[5]=sin(pi*5.0/16.0);
		ChenPS0[6]=cos(pi*1.0/8.0);
		ChenPS0[7]=sin(pi*7.0/16.0);

		for (i=0; i<8; i++)
		for (j=0; j<8; j++)
		{
			ChenPS[8*i+j]=ChenPS0[i]*ChenPS0[j]*DCT_SCALE+0.5;
			ChenPSF[8*i+j]=ChenPS0[i]*ChenPS0[j];
		}

	}


	//used in fct8x8_float: 1/sqrt(2) for index 0, 1 otherwise
	double c(int n)
	{
		if(n==0)
			return(1/sqrt(2.0));
		else
			return(1.0);
	}

	//used in ifct8x8_float: 1/sqrt(2) for index 0, 1 otherwise
	double ci(int n)
	{
		if(n==0)
			return(1/sqrt(2.0));
		else
			return(1.0);
	}

// generate reference data  //
	// Direct (non-separated) forward 8x8 DCT of the 64 samples at x,
	// written as 64 doubles to y.
	void fct8x8_float(const short *x, double *y)
	{
		int i, j, u, v;
		double sum;

		for(v=0; v<8; v++)
			for(u=0; u<8; u++)
			{
				sum = 0;
				for(j=0; j<8; j++)
					for(i=0; i<8; i++)
						sum += ((double)*(x+i+j*8)) * cos(pi*(2*i+1)*u/16) * cos(pi*(2*j+1)*v/16);

				*(y+u+v*8) = 0.25 * c(u) * c(v) * sum;
			}
	}

// iDCT, transform the 12bit DCT results back to the color domain
	// Direct (non-separated) inverse 8x8 DCT of the 64 coefficients at x,
	// written as 64 doubles to y.
	void ifct8x8_float(const short *x, double *y)
	{
		int i, j, u, v;
		double sum;

		for(j=0; j<8; j++)
			for(i=0; i<8; i++)
			{
				sum = 0;
				for(v=0; v<8; v++)
					for(u=0; u<8; u++)
						sum += ci(u) * ci(v) * ((double)*(x+u+v*8)) * cos(pi*(2*i+1)*u/16) * cos(pi*(2*j+1)*v/16);

				*(y+i+j*8) = 0.25 * sum;
			}
	}

	// Worker shared by the two roundclip*_s functions: rounds each of the 64
	// doubles at ip half away from zero and clamps the result to lo .. hi.
	// The clamp is applied before the conversion to short so that values
	// outside the range of short are never converted; NaN is stored as 0.
	void roundclip_d_s(const double *ip, short *op, short lo, short hi)
	{
		for (int k = 0; k < 64; k++)
		{
			const double value = ip[k];
			const double rounded = (value < 0) ? (value - 0.5) : (value + 0.5);
			short clipped;

			if(rounded != rounded)		// NaN
				clipped = 0;
			else if(rounded > hi)
				clipped = hi;
			else if(rounded < lo)
				clipped = lo;
			else
				clipped = (short) rounded;	// truncation completes the rounding

			op[k] = clipped;
		}
	}

// clip the fp result into 12bit integer. 
	// Output range is -2048 .. 2047.
	void roundclip12bitd_s(double *ip, short *op)
	{
		roundclip_d_s(ip, op, -2048, 2047);
	}

// convert the results to 8 bit pixels
	// Output range is -256 .. 255.
	void roundclip8bitd_s(double *ip, short *op)
	{
		roundclip_d_s(ip, op, -256, 255);
	}
public:

    unsigned int in_vector_count;

    //////////////////////////////////////////////////////////////////////////
	
	// Records the block size and puts every pointer and counter into a
	// defined state; no buffers are allocated until Initialize().
	idct_kernel(unsigned int idct_block = DEFAULT_8X8_BLOCK)
    {
        SET_SRC_FILENAME();

		p_init_ref = ref;
		p_init_tst = NULL;
		p_init_dct = dctcoeffshort;
		tst = NULL;

		error_idx		= 0;
		error_iter		= 0;
		in_vector_count	= 0;

		block = idct_block;
		if(block != DEFAULT_8X8_BLOCK)
		{
			//cout << "Currently, the proper structure is not implemented to handle blocks > 8x8 pixels\n";
			//exit(1);
		}
    }

	// Releases this object's tst buffer and the shared reference buffers.
	// The shared pointers are cleared and alreadyInitialized is reset, so a
	// second kernel's destructor does not free them again and a later
	// Initialize() regenerates the data instead of using freed memory.
	~idct_kernel()
	{
		// tst may have been advanced by a derived PostKernel(); p_init_tst
		// always holds the address returned by the allocator.
		if(p_init_tst != NULL) {Aligned::Free(p_init_tst);}
		p_init_tst = NULL;
		tst = NULL;

		if(ref != NULL) {Aligned::Free(ref); ref = NULL;}
		if(dctcoeffshort != NULL) {Aligned::Free(dctcoeffshort); dctcoeffshort = NULL;}
		alreadyInitialized = false;
	}

    //////////////////////////////////////////////////////////////////////////
	// Clears the error statistics, allocates this object's tst buffer and,
	// if no kernel has done so yet, generates the shared reference data.
    void Initialize()
    {
		error_idx	= 0;
		error_iter	= 0;
		// structure for error evaluation 
		standard.ppe=0;
		standard.omse=0;
		standard.ome=0;
		for(int k=0; k<64; k++){
			standard.pmse[k]=0;
			standard.pme[k]=0;
		}

		// Drop the buffer of an earlier Initialize() call so it is not leaked.
		if(p_init_tst != NULL) {Aligned::Free(p_init_tst);}
		// 64 samples * 2 bytes * N blocks, 16-byte aligned
		tst = (short *)Aligned::Alloc(64*2*N,16);

		// ChenPS/ChenPSF are per-object members, so they are filled for every
		// kernel and not only for the one that generates the shared data.
		ChenIni();

		if(!alreadyInitialized)
		{
			ref = (short *)Aligned::Alloc(64*2*N,16);
			dctcoeffshort = (short *)Aligned::Alloc(64*2*N,16);

			for(int j = 0; j <N; j++)
			{	
				short *dctBlock = dctcoeffshort + 64*j;
				short *refBlock = ref + 64*j;

				random(ins, 256, 255); 
				// generate reference data  //
				// DCT
				fct8x8_float(ins, dctcoeffdouble);
				// clip the fp result into 12bit integer. 
				roundclip12bitd_s(dctcoeffdouble, dctBlock);
			
				// iDCT, transform the 12bit DCT results back to the color domain 
				ifct8x8_float(dctBlock, idctcoeffdouble);
				// convert the results to 8 bit pixels
				roundclip8bitd_s(idctcoeffdouble, refBlock);
			}

			alreadyInitialized = true;
		}

		p_init_ref = ref;
		p_init_dct = dctcoeffshort;
		p_init_tst = tst;
    }

    //////////////////////////////////////////////////////////////////////////
	// Compares the N blocks at tst against ref, accumulates the statistics
	// in `standard` and checks them against the limits below.
	// Returns EXIT_SUCCESS when every limit is met, or when test_iterations
	// differs from N (nothing is checked in that case); EXIT_FAILURE when a
	// limit is exceeded or the buffers have not been set up.
    int Validate() // 0 success, 1 failure
	{
		ref				=	p_init_ref;
		tst				=	p_init_tst;
		dctcoeffshort	=	p_init_dct;

		if(test_iterations != N)
			return(EXIT_SUCCESS);

		if(tst == NULL || ref == NULL)
			return((int)EXIT_FAILURE);

		int i;

		for (int iter=0; iter<N; iter++)
		{
			const short *tstBlock = tst + 64*iter;
			const short *refBlock = ref + 64*iter;

			for(int j=0; j<64; j++){
				const double error  = tstBlock[j]-refBlock[j];
				const double error2 = error*error;

				// peak error is tracked by magnitude but stored with its sign
				if(error2>standard.ppe*standard.ppe) {
					standard.ppe=error;
					error_idx=j;			//to find the place of max error
					error_iter=iter;		
					// fprintf(fp,"ppe (peak pixel error) =%14.8f at idx=%d iter=%d\n", standard.ppe,error_idx,error_iter);
				}
				standard.pmse[j]+=error2;
				standard.omse+=error2;
				standard.pme[j]+=error;
				standard.ome+=error;
			}
		}

		//Check if Results are in the acceptable range:
		int result = (int)EXIT_SUCCESS;

		//ppe (peak pixel error, <=1); ppe is signed, so test its magnitude
		if(fabs(standard.ppe) > 1)
			result = (int)EXIT_FAILURE;

		//pmse (average pixel rms^2 error) 8x8:
		//max of pmse(<=0.06)
		double tmpError=0.0;
		for(i=0;i<64;i++){
			if (standard.pmse[i]>tmpError)
				tmpError=standard.pmse[i];
		}
		if((tmpError/N) > 0.06)
			result = (int)EXIT_FAILURE;

		//omse (average rms^2 error, <=0.02)
		if((standard.omse/(64.*N)) > 0.02)
			result = (int)EXIT_FAILURE;

		//pme (average error) 8x8:
		//max of pme (<=0.015)
		tmpError=0;
		for(i=0;i<64;i++){
			if (fabs(standard.pme[i])>tmpError)
				tmpError=fabs(standard.pme[i]);
		}
		if((tmpError/N) > 0.015)
			result = (int)EXIT_FAILURE;

		//ome (average error, <=0.0015); tested by magnitude like pme
		if(fabs(standard.ome/(64.*N)) > 0.0015)
			result = (int)EXIT_FAILURE;

		return(result);
	}


    //////////////////////////////////////////////////////////////////////////
	// Logs the cycle counts and the statistics gathered by Validate();
	// does nothing unless do_report is set.
    void Report()
    {
        if ( do_report )
        {
			int i;

            Log::Report("min cycle/idct = %1d\n",  (int)(min_elapsed_cycles/CALL_COUNT));
            Log::Report("avg cycle/idct = %1d\n",  (int)(avg_elapsed_cycles/CALL_COUNT));
            Log::Report("Block Processed		= %1d Pixels\n",  N);

			Log::Report("ppe (peak pixel error, <=1) =%14.8f at idx=%d iter=%d\n\n", standard.ppe,error_idx,error_iter);
			Log::Report("pmse (average pixel rms^2 error) 8x8:\n");
		
			double tmpError=0.0;
			for(i=0;i<64;i++){
				if (standard.pmse[i]>tmpError)
					tmpError=standard.pmse[i];
			}
			Log::Report("max of pmse(<=0.06) %14.8f:\n\n",tmpError/N);
			Log::Report("omse (average rms^2 error, <=0.02) =%14.8f\n\n", standard.omse/(64.*N));
			Log::Report("pme (average error) 8x8:\n\n");
		
			tmpError=0.0;
			for(i=0;i<64;i++){
				if (fabs(standard.pme[i])>tmpError)
					tmpError=fabs(standard.pme[i]);

			}
			Log::Report("max of pme (<=0.015)%14.8f:\n",tmpError/N);
			Log::Report("ome (average error, <=0.0015) =%14.8f\n\n", standard.ome/(64.*N));
    
//            TKernel<float>::Report();
        }
    }            

    //////////////////////////////////////////////////////////////////////////
 /*    int SetParam(int argc, char *argv[], int curr)
    {
       char *arg = argv[curr] + 1; // strip off leading '/'

        // in vector count
        if ( stricmp(arg, SZ_ABBREV_IN_VEC_COUNT) == 0 )
        {
            if ( (curr+1) >= argc )
            {
                Log::Error("number must follow: %s\n", argv[curr]);
                return Shutdown(EXIT_FAILURE, SHOW_HELP);
            }

            unsigned int c = atoi(argv[curr+1]);
        
            if ( c == 0 )
            {
                Log::Warning("possibly invalid vector count specified: %s %s\n", argv[curr], argv[curr+1]);
            }

            SetInVectorCount(c);

            return 2;
        }

        // no match -- you gotta call the base class' SetParam() function
        else
        {
            return TKernel<float>::SetParam(argc, argv, curr);
        }
    }*/
};


/****************************************************************************
class idct_C : public idct_kernel
{
public:
    
    //////////////////////////////////////////////////////////////////////////
    DotProduct_AOS()
    {
        SetName(DP_FLOAT_AOS_SZ);
    }

    //////////////////////////////////////////////////////////////////////////
    int TheKernel()
    {
        ASSERT( out );
        ASSERT( in );
        ASSERT( (out_count*COORD_COUNT) <= in_count );

        unsigned int j = 0;
        for ( unsigned int i = 0; i < out_count; i++ )
        {
            out[i] = in[j+iX]*x + in[j+iY]*y + in[j+iZ]*z;
            j += COORD_COUNT;
        }

        return EXIT_SUCCESS;
    }
};

/****************************************************************************/
// IDCT kernel registered under the name M64ASM_IDCT.  TheCode() is only
// declared here; its definition lives outside this header.
class idct_M64ASM : public idct_kernel
{
public:

	// Name the kernel and select the default run: N iterations, timed.
	idct_M64ASM()
	{
		SetName(M64ASM_IDCT);
		run_type = RUN_TIMED;
		test_iterations = N;
	}

	// One IDCT pass; implemented elsewhere.
	int TheCode();

	// Runs TheCode() CALL_COUNT times in total (for a better trace) and
	// returns the status of the final call.
	int TheKernel()
	{
		long remaining = CALL_COUNT;
		while (--remaining > 0)
		{
			TheCode();
		}
		return TheCode();
	}

	// A single untimed pass.
	void WarmCache()
	{
		TheCode();
	}

	// Step the output and coefficient pointers to the next 64-sample block,
	// then issue emms.
	void PostKernel()
	{
		dctcoeffshort += 64;
		tst += 64;
		__asm {emms}
	}

	// Rewind all three working pointers to the starts saved by Initialize().
	void Reset()
	{
		dctcoeffshort = p_init_dct;
		tst = p_init_tst;
		ref = p_init_ref;
	}
};

/****************************************************************************/
// IDCT kernel registered under the name M128ASM_IDCT.  TheCode() is only
// declared here; its definition lives outside this header.
class idct_M128ASM : public idct_kernel
{
public:

	// Name the kernel and select the default run: N iterations, timed.
	idct_M128ASM()
	{
		SetName(M128ASM_IDCT);
		run_type = RUN_TIMED;
		test_iterations = N;
	}

	// One IDCT pass; implemented elsewhere.
	int TheCode();

	// Runs TheCode() CALL_COUNT times in total (for a better trace) and
	// returns the status of the final call.
	int TheKernel()
	{
		long remaining = CALL_COUNT;
		while (--remaining > 0)
		{
			TheCode();
		}
		return TheCode();
	}

	// A single untimed pass.
	void WarmCache()
	{
		TheCode();
	}

	// Step the output and coefficient pointers to the next 64-sample block,
	// then issue emms.
	void PostKernel()
	{
		dctcoeffshort += 64;
		tst += 64;
		__asm {emms}
	}

	// Rewind all three working pointers to the starts saved by Initialize().
	void Reset()
	{
		dctcoeffshort = p_init_dct;
		tst = p_init_tst;
		ref = p_init_ref;
	}
};

/****************************************************************************/
// IDCT kernel registered under the name M128IVEC_IDCT.  TheCode() is only
// declared here; its definition lives outside this header.
class idct_M128IVEC : public idct_kernel
{
public:

	// Name the kernel and select the default run: N iterations, timed.
	idct_M128IVEC()
	{
		SetName(M128IVEC_IDCT);
		run_type = RUN_TIMED;
		test_iterations = N;
	}

	// One IDCT pass; implemented elsewhere.
	int TheCode();

	// Runs TheCode() CALL_COUNT times in total (for a better trace) and
	// returns the status of the final call.
	int TheKernel()
	{
		long remaining = CALL_COUNT;
		while (--remaining > 0)
		{
			TheCode();
		}
		return TheCode();
	}

	// A single untimed pass.
	void WarmCache()
	{
		TheCode();
	}

	// Step the output and coefficient pointers to the next 64-sample block,
	// then issue emms.
	void PostKernel()
	{
		dctcoeffshort += 64;
		tst += 64;
		__asm {emms}
	}

	// Rewind all three working pointers to the starts saved by Initialize().
	void Reset()
	{
		dctcoeffshort = p_init_dct;
		tst = p_init_tst;
		ref = p_init_ref;
	}
};

/****************************************************************************
class DotProduct_SOA : public DotProduct
{
public:

    //////////////////////////////////////////////////////////////////////////
    DotProduct_SOA()
    {
        SetName(DP_FLOAT_SOA_SZ);
        in_temp = NULL;
    }

    //////////////////////////////////////////////////////////////////////////
    ~DotProduct_SOA()
    {
        if ( in_temp )
        {
            delete [] in_temp;
        }
    }

    //////////////////////////////////////////////////////////////////////////
    void SetInCount(unsigned int count)
    {
        in_temp = new float[count];
        DotProduct::SetInCount(count);
    }

    //////////////////////////////////////////////////////////////////////////
    void Functional()
    {
        ASSERT( in );
        ASSERT( functional );
        ASSERT( (out_count*COORD_COUNT) <= in_count );

        ASSERT( in_vector_count >= out_count );

        float *in_x = &in[in_vector_count*iX],
              *in_y = &in[in_vector_count*iY],
              *in_z = &in[in_vector_count*iZ];

        for ( unsigned int i = 0; i < out_count; i++ )
        {
            functional[i] = in_x[i] * x + in_y[i] * y + in_z[i] * z;
        }
    }        

    //////////////////////////////////////////////////////////////////////////
    void Initialize()
    {
        DotProduct::Initialize();
        transpose_in();
    }

    //////////////////////////////////////////////////////////////////////////
    int TheKernel()
    {
        ASSERT( in );
        ASSERT( out );
        ASSERT( (out_count*COORD_COUNT) <= in_count );

        ASSERT( in_vector_count >= out_count );

        float *in_x = &in[in_vector_count*iX],
              *in_y = &in[in_vector_count*iY],
              *in_z = &in[in_vector_count*iZ];

        for ( unsigned int i = 0; i < out_count; i++ )
        {
            out[i] = in_x[i] * x + in_y[i] * y + in_z[i] * z;
        }
    
        return EXIT_SUCCESS;
    }

    //////////////////////////////////////////////////////////////////////////
    void Reset()
    {
        transpose_in();
        DotProduct::Reset();
    }

private:

    float *in_temp;

    //////////////////////////////////////////////////////////////////////////
    void transpose_in()
    {
        ASSERT( in );
        ASSERT( in_temp );
    
        memcpy(in_temp, in, in_count*sizeof(float));

        for ( unsigned int i = 0; i < in_vector_count; i++ )
        {
            in[in_vector_count*iX+i] = in_temp[COORD_COUNT*i+iX];
            in[in_vector_count*iY+i] = in_temp[COORD_COUNT*i+iY];
            in[in_vector_count*iZ+i] = in_temp[COORD_COUNT*i+iZ];
        }
    }
};
*/
#endif __DOT_PRODUCT_H