/* Reference_IDCT.c, Inverse Discrete Cosine Transform, double precision           */

/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

/*  Perform IEEE 1180 reference (64-bit floating point, separable 8x1
 *  direct matrix multiply) Inverse Discrete Cosine Transform
*/


/* Here we use math.h to generate constants.  Compiler results may
   vary a little */

#include <math.h>
#include <emmintrin.h>


/* Defined with an empty expansion; nothing in the code reviewed here refers
   to it.  NOTE(review): purpose unknown - confirm before removing. */
#define GLOBAL_LAYER // SSE2


/* PI: an already existing definition is kept; otherwise use M_PI when the
   math header provides it, and a literal constant as the last resort. */
#ifndef PI
# ifdef M_PI
#  define PI M_PI
# else
#  define PI 3.14159265358979323846
# endif
#endif



/* private data */

/* Cosine transform matrix for the 8x1 IDCT, double precision.
   Filled by Initialize_Reference_IDCT_SSE2() as
     cc[time][freq] = scale(freq) * cos((PI/8) * freq * (time + 0.5))
   with scale = sqrt(0.125) for freq 0 and 0.5 otherwise.
   Declared 16-byte aligned; Reference_IDCT_SSE2() reads it with
   _mm_load_pd. */
static _MM_ALIGN16 double cc[8][8];

/*
 * Fill the double-precision coefficient table cc[][] used by
 * Reference_IDCT_SSE2().
 *
 * Entry cc[x][u] (x = sample position, u = frequency index) receives
 *     amplitude(u) * cos((PI/8) * u * (x + 0.5))
 * where amplitude is sqrt(0.125) for u == 0 and 0.5 for every other u.
 * Must be called once before the first call to Reference_IDCT_SSE2().
 */
extern "C" void Initialize_Reference_IDCT_SSE2()
{
  int x, u;

  for (x = 0; x < 8; x++)
  {
    double *row = cc[x];

    for (u = 0; u < 8; u++)
    {
      double amplitude;

      /* the DC basis function carries a different normalisation */
      if (u == 0)
        amplitude = sqrt(0.125);
      else
        amplitude = 0.5;

      row[u] = amplitude*cos((PI/8.0)*u*(x + 0.5));
    }
  }
}

/*
 * Reference_IDCT_SSE2 - in-place 8x8 inverse DCT, double-precision SSE2.
 *
 * block: 64 coefficients in row-major order; overwritten with the result.
 *        Rows are fetched with _mm_load_si128, so block must be 16-byte
 *        aligned.
 *
 * Initialize_Reference_IDCT_SSE2() must have filled cc[][] beforehand.
 *
 * The transform is separable and done in two passes:
 *   pass 1:  tmp[8*j+i]   = sum_k cc[j][k] * block[8*i+k]
 *   pass 2:  block[8*i+j] = clamp(round(sum_k cc[i][k] * tmp[8*j+k]),
 *                                 -256, 255)
 * Pass 1 stores its result transposed so that pass 2 can read contiguous
 * pairs of doubles.
 *
 * Rounding is performed by _mm_cvtpd_epi32, i.e. it follows the current
 * MXCSR rounding mode and is not the floor(x + 0.5) of the scalar
 * reference code.
 *
 * tmp is a static scratch buffer, so this function is not reentrant.
 */
extern "C" void Reference_IDCT_SSE2(short *block)
{
  int i, j;
  __m128i packed, sign_mask, lo32, hi32;
  __m128d in01, in23, in45, in67;
  __m128d lo_acc, hi_acc, row_a, row_b, sums;
  static _MM_ALIGN16 double tmp[64];
  const __m128i lower_bound = _mm_set1_epi16(-256);
  const __m128i upper_bound = _mm_set1_epi16(255);

  /* ---- pass 1: transform every input row ---- */
  for (i = 0; i < 8; i++)
  {
    packed = _mm_load_si128((const __m128i *)&block[8*i]);

    /* sign-extend the eight shorts to 32 bits: interleave each value with
       a word that is all ones when the value is negative */
    sign_mask = _mm_cmpgt_epi16(_mm_setzero_si128(), packed);
    lo32 = _mm_unpacklo_epi16(packed, sign_mask);   /* elements 0..3 */
    hi32 = _mm_unpackhi_epi16(packed, sign_mask);   /* elements 4..7 */

    /* _mm_cvtepi32_pd converts the two low lanes only, hence the extra
       unpackhi to bring the upper pair down */
    in01 = _mm_cvtepi32_pd(lo32);
    in23 = _mm_cvtepi32_pd(_mm_unpackhi_epi64(lo32, lo32));
    in45 = _mm_cvtepi32_pd(hi32);
    in67 = _mm_cvtepi32_pd(_mm_unpackhi_epi64(hi32, hi32));

    /* two output samples (j and j+1) per iteration */
    for (j = 0; j < 8; j += 2)
    {
      lo_acc = _mm_add_pd(_mm_mul_pd(in01, _mm_load_pd(&cc[j][0])),
                          _mm_mul_pd(in23, _mm_load_pd(&cc[j][2])));
      hi_acc = _mm_add_pd(_mm_mul_pd(in45, _mm_load_pd(&cc[j][4])),
                          _mm_mul_pd(in67, _mm_load_pd(&cc[j][6])));
      row_a = _mm_add_pd(lo_acc, hi_acc);

      lo_acc = _mm_add_pd(_mm_mul_pd(in01, _mm_load_pd(&cc[j+1][0])),
                          _mm_mul_pd(in23, _mm_load_pd(&cc[j+1][2])));
      hi_acc = _mm_add_pd(_mm_mul_pd(in45, _mm_load_pd(&cc[j+1][4])),
                          _mm_mul_pd(in67, _mm_load_pd(&cc[j+1][6])));
      row_b = _mm_add_pd(lo_acc, hi_acc);

      /* horizontal add of both accumulators at once:
         sums = (row_a[0]+row_a[1], row_b[0]+row_b[1]) */
      sums = _mm_add_pd(_mm_unpacklo_pd(row_a, row_b),
                        _mm_unpackhi_pd(row_a, row_b));

      /* transposed store */
      _mm_storel_pd(&tmp[8*j+i], sums);
      _mm_storeh_pd(&tmp[8*(j+1)+i], sums);
    }
  }

  /* ---- pass 2: transform the columns; the transpose is folded into the
     addressing by swapping the roles of i and j ---- */
  for (j = 0; j < 8; j++)
  {
    /* the intermediate column does not depend on i: load it once */
    in01 = _mm_load_pd(&tmp[8*j+0]);
    in23 = _mm_load_pd(&tmp[8*j+2]);
    in45 = _mm_load_pd(&tmp[8*j+4]);
    in67 = _mm_load_pd(&tmp[8*j+6]);

    for (i = 0; i < 8; i += 2)
    {
      /* products are accumulated strictly left to right (k = 0, 2, 4, 6) */
      row_a = _mm_mul_pd(in01, _mm_load_pd(&cc[i][0]));
      row_b = _mm_mul_pd(in01, _mm_load_pd(&cc[i+1][0]));
      row_a = _mm_add_pd(row_a, _mm_mul_pd(in23, _mm_load_pd(&cc[i][2])));
      row_b = _mm_add_pd(row_b, _mm_mul_pd(in23, _mm_load_pd(&cc[i+1][2])));
      row_a = _mm_add_pd(row_a, _mm_mul_pd(in45, _mm_load_pd(&cc[i][4])));
      row_b = _mm_add_pd(row_b, _mm_mul_pd(in45, _mm_load_pd(&cc[i+1][4])));
      row_a = _mm_add_pd(row_a, _mm_mul_pd(in67, _mm_load_pd(&cc[i][6])));
      row_b = _mm_add_pd(row_b, _mm_mul_pd(in67, _mm_load_pd(&cc[i+1][6])));

      sums = _mm_add_pd(_mm_unpacklo_pd(row_a, row_b),
                        _mm_unpackhi_pd(row_a, row_b));

      /* round to 32-bit integers, then saturate to 16 bits BEFORE the
         clamp: comparing only the low 16 bits of a 32-bit result would
         wrap for sums outside the short range and clamp them to the wrong
         bound */
      packed = _mm_cvtpd_epi32(sums);
      packed = _mm_packs_epi32(packed, packed);
      packed = _mm_max_epi16(packed, lower_bound);
      packed = _mm_min_epi16(packed, upper_bound);

      block[8*i+j] = (short)_mm_extract_epi16(packed, 0);
      block[8*(i+1)+j] = (short)_mm_extract_epi16(packed, 1);
    }
  }
}

/* private data */

/* Cosine transform matrix for the 8x1 IDCT, single precision.
   Filled by Initialize_ReferenceLo_IDCT_SSE2() as
     c[time][freq] = (float)(scale(freq) * cos((PI/8) * freq * (time + 0.5)))
   with scale = sqrt(0.125) for freq 0 and 0.5 otherwise.
   Declared 16-byte aligned; the single-precision IDCT routines read it
   with _mm_load_ps. */
static _MM_ALIGN16 float c[8][8];

/*
 * Fill the single-precision coefficient table c[][] used by the
 * ReferenceLo IDCT routines.
 *
 * Every entry is evaluated in double precision and narrowed to float on
 * store:
 *     c[x][u] = (float)(amplitude(u) * cos((PI/8) * u * (x + 0.5)))
 * where amplitude is sqrt(0.125) for u == 0 and 0.5 for every other u.
 * Must be called once before the first single-precision IDCT call.
 */
extern "C" void Initialize_ReferenceLo_IDCT_SSE2()
{
  int x, u;

  for (x = 0; x < 8; x++)
  {
    float *row = c[x];

    for (u = 0; u < 8; u++)
    {
      double amplitude;

      /* the DC basis function carries a different normalisation */
      if (u == 0)
        amplitude = sqrt(0.125);
      else
        amplitude = 0.5;

      row[u] = (float)(amplitude*cos((PI/8.0)*u*(x + 0.5)));
    }
  }
}

/* perform IDCT matrix multiply for 8x8 coefficient block */

extern "C" void ReferenceLo_IDCT_SSE2(short *block)
{
  int i, j;
  __m128 partial_product, partial_product1, partial_product2, partial_product3;
  __m128 partial_product0;
  __m128i clock, clock1, sign;
  //const __m128i zero = _mm_setzero_si128();
  __m128 dlock, dlock1;
  static _MM_ALIGN16 float tmp[64];
  static _MM_ALIGN16 short s255[] = { 255, 0, 255, 0, 255, 0, 255, 0 };
  static _MM_ALIGN16 short sm256[] = { -256, 0, -256, 0, -256, 0, -256, 0 };

  for (i=0; i<8; i++)
  {
	//clock = _mm_loadl_epi64((__m128i*)&block[8*i+0]);
	//clock1 = _mm_loadl_epi64((__m128i*)&block[8*i+4]);
	//clock = _mm_unpacklo_epi64(clock, clock1);
	clock = _mm_load_si128((__m128i*)&block[8*i]);
	
	sign = _mm_cmpgt_epi16(_mm_setzero_si128(), clock);
	clock1 = _mm_unpackhi_epi16(clock, sign);
	clock = _mm_unpacklo_epi16(clock, sign);
	
	dlock1 = _mm_cvtepi32_ps(clock1);
	dlock = _mm_cvtepi32_ps(clock);

//	for (j=0; j<8; j += 4)
//    {
      //for (k=0; k<8; k += 4)
	  //{
		//partial_product+= c[j][k]*block[8*i+k];
#define j 0
#define j4 4
		partial_product = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j][0])));

		partial_product1 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j+1][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j+1][0])));

		partial_product2 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j+2][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j+2][0])));

		partial_product3 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j+3][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j+3][0])));
      //}

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

		partial_product0 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4][0])));

		partial_product1 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4+1][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4+1][0])));

		partial_product2 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4+2][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4+2][0])));

		partial_product3 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4+3][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4+3][0])));
      // 2nd part

	  partial_product0 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product0, partial_product2),
		  _mm_unpackhi_ps(partial_product0, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product0 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product0, partial_product1),
		  _mm_unpackhi_ps(partial_product0, partial_product1));
/*
      _mm_store_ss(&tmp[8*j+i], partial_product);
	  partial_product2 = _mm_shuffle_ps(partial_product, partial_product, _MM_SHUFFLE(3,3,2,1));
	  partial_product = _mm_movehl_ps(partial_product, partial_product);
      
	  _mm_store_ss(&tmp[8*j4+i], partial_product0);
	  partial_product3 = _mm_shuffle_ps(partial_product0, partial_product0, _MM_SHUFFLE(3,3,2,1));
	  partial_product0 = _mm_movehl_ps(partial_product0, partial_product0);
      
	  _mm_store_ss(&tmp[8*(j+1)+i], partial_product2);
	  partial_product2 = _mm_movehl_ps(partial_product2, partial_product2);
      
	  _mm_store_ss(&tmp[8*(j4+1)+i], partial_product3);
	  partial_product3 = _mm_movehl_ps(partial_product3, partial_product3);

      _mm_store_ss(&tmp[8*(j+2)+i], partial_product);
      _mm_store_ss(&tmp[8*(j4+2)+i], partial_product0);

      _mm_store_ss(&tmp[8*(j+3)+i], partial_product2);
      _mm_store_ss(&tmp[8*(j4+3)+i], partial_product3);
*/
      _mm_store_ss(&tmp[8*j+i], partial_product);
	  partial_product = _mm_shuffle_ps(partial_product, partial_product, _MM_SHUFFLE(3,3,2,1));
      _mm_store_ss(&tmp[8*j4+i], partial_product0);
	  partial_product0 = _mm_shuffle_ps(partial_product0, partial_product0, _MM_SHUFFLE(3,3,2,1));
      _mm_store_ss(&tmp[8*(j+1)+i], partial_product);
	  partial_product = _mm_shuffle_ps(partial_product, partial_product, _MM_SHUFFLE(3,3,2,1));
      _mm_store_ss(&tmp[8*(j4+1)+i], partial_product0);
	  partial_product0 = _mm_shuffle_ps(partial_product0, partial_product0, _MM_SHUFFLE(3,3,2,1));
      _mm_store_ss(&tmp[8*(j+2)+i], partial_product);
	  partial_product = _mm_shuffle_ps(partial_product, partial_product, _MM_SHUFFLE(3,3,2,1));
      _mm_store_ss(&tmp[8*(j4+2)+i], partial_product0);
	  partial_product0 = _mm_shuffle_ps(partial_product0, partial_product0, _MM_SHUFFLE(3,3,2,1));
      _mm_store_ss(&tmp[8*(j+3)+i], partial_product);
      _mm_store_ss(&tmp[8*(j4+3)+i], partial_product0);
#undef j
#undef j4
//    }
  }

  /* Transpose operation is integrated into address mapping by switching 
     loop order of i and j */

  for (j=0; j<8; j++)
  {
    //for (i=0; i<8; i += 4)
    //{
      //partial_product = _mm_setzero_pd();

      //for (k=0; k<8; k += 4)
	  //{
        //partial_product+= c[i][k]*tmp[8*j+k];
#define i 0
#define i4 4
#define k 0
		dlock = _mm_load_ps(&tmp[8*j+k]);
		partial_product =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i][k]));
		partial_product1 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i+1][k]));
		partial_product2 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i+2][k]));
		partial_product3 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i+3][k]));
#undef k
#define k 4
		dlock1 = _mm_load_ps(&tmp[8*j+k]);
		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i][k])));
		partial_product1 = _mm_add_ps(partial_product1,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i+1][k])));
		partial_product2 = _mm_add_ps(partial_product2,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i+2][k])));
		partial_product3 = _mm_add_ps(partial_product3,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i+3][k])));
#undef k
	  //}

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

      //v = (int) floor(partial_product+0.5);
	  clock = _mm_cvtps_epi32(partial_product);

#define k 0
		partial_product =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4][k]));
		partial_product1 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4+1][k]));
		partial_product2 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4+2][k]));
		partial_product3 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4+3][k]));
#undef k
#define k 4
		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4][k])));
		partial_product1 = _mm_add_ps(partial_product1,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4+1][k])));
		partial_product2 = _mm_add_ps(partial_product2,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4+2][k])));
		partial_product3 = _mm_add_ps(partial_product3,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4+3][k])));
#undef k

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

      //v = (int) floor(partial_product0+0.5);
	  clock1 = _mm_cvtps_epi32(partial_product);

      //block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
	  sign = _mm_load_si128((__m128*)sm256);
	  clock = _mm_max_epi16(clock, sign);
	  clock1 = _mm_max_epi16(clock1, sign);
	  sign = _mm_load_si128((__m128*)s255);
	  clock = _mm_min_epi16(clock, sign);
	  clock1 = _mm_min_epi16(clock1, sign);
	  
	  //v = (v < -256)? -256 : v;
	  //v = (v > 255)? 255 : v;
	  block[8*i+j] = _mm_extract_epi16(clock, 0);
	  block[8*(i+1)+j] = _mm_extract_epi16(clock, 2);
	  block[8*(i+2)+j] = _mm_extract_epi16(clock, 4);
	  block[8*(i+3)+j] = _mm_extract_epi16(clock, 6);
	  block[8*i4+j] = _mm_extract_epi16(clock1, 0);
	  block[8*(i4+1)+j] = _mm_extract_epi16(clock1, 2);
	  block[8*(i4+2)+j] = _mm_extract_epi16(clock1, 4);
	  block[8*(i4+3)+j] = _mm_extract_epi16(clock1, 6);
#undef i
#undef i4
    //}
  }
}

void ReferenceLo_IDCT_SSE2_WORK7(short *block)
{
  int i, j, k;
  __m128 partial_product, partial_product1, partial_product2, partial_product3;
  __m128 partial_product0;
  __m128i clock, clock1, sign;
  //const __m128i zero = _mm_setzero_si128();
  __m128 dlock, dlock1, dlock2, dlock3;
  static _MM_ALIGN16 float tmp[64];
  static _MM_ALIGN16 short s255[] = { 255, 0, 255, 0, 255, 0, 255, 0 };
  static _MM_ALIGN16 short sm256[] = { -256, 0, -256, 0, -256, 0, -256, 0 };

  for (i=0; i<8; i++)
  {
	//clock = _mm_loadl_epi64((__m128i*)&block[8*i+0]);
	//clock1 = _mm_loadl_epi64((__m128i*)&block[8*i+4]);
	//clock = _mm_unpacklo_epi64(clock, clock1);
	clock = _mm_load_si128((__m128i*)&block[8*i]);
	
	sign = _mm_cmpgt_epi16(_mm_setzero_si128(), clock);
	clock1 = _mm_unpackhi_epi16(clock, sign);
	clock = _mm_unpacklo_epi16(clock, sign);
	
	dlock1 = _mm_cvtepi32_ps(clock1);
	dlock = _mm_cvtepi32_ps(clock);

//	for (j=0; j<8; j += 4)
//    {
      //for (k=0; k<8; k += 4)
	  //{
		//partial_product+= c[j][k]*block[8*i+k];
#define j 0
#define j4 4
		partial_product = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j][0])));

		partial_product1 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j+1][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j+1][0])));

		partial_product2 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j+2][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j+2][0])));

		partial_product3 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j+3][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j+3][0])));
      //}

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

		partial_product0 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4][0])));

		partial_product1 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4+1][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4+1][0])));

		partial_product2 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4+2][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4+2][0])));

		partial_product3 = _mm_add_ps(
			_mm_mul_ps(dlock1, _mm_load_ps(&c[j4+3][4])),
			_mm_mul_ps(dlock, _mm_load_ps(&c[j4+3][0])));
      // 2nd part

	  partial_product0 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product0, partial_product2),
		  _mm_unpackhi_ps(partial_product0, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product0 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product0, partial_product1),
		  _mm_unpackhi_ps(partial_product0, partial_product1));
#undef j
#undef j4
	  j = i & 4;
	  k = (i & ~4)/* << 1*/; k += k;
	  _mm_storel_pi((__m64*)&tmp[8*j+k], partial_product);
	  _mm_storeh_pi((__m64*)&tmp[8*(j+1)+k], partial_product);
	  _mm_storel_pi((__m64*)&tmp[8*(j+2)+k], partial_product0);
	  _mm_storeh_pi((__m64*)&tmp[8*(j+3)+k], partial_product0);
//    }
  }

  /* Transpose operation is integrated into address mapping by switching 
     loop order of i and j */

  for (j=0; j<8; /*j++*/)
  {
		i = j >> 1;
		dlock = _mm_load_ps(&tmp[8*i]);
		partial_product2 = _mm_load_ps(&tmp[8*i + 4]);

		dlock1 = _mm_load_ps(&tmp[8*(i+4)]);
		partial_product3 = _mm_load_ps(&tmp[8*(i+4) + 4]);

		dlock2 = _mm_shuffle_ps(dlock, partial_product2, _MM_SHUFFLE(3,1,3,1));
		dlock = _mm_shuffle_ps(dlock, partial_product2, _MM_SHUFFLE(2,0,2,0));

		dlock3 = _mm_shuffle_ps(dlock1, partial_product3, _MM_SHUFFLE(3,1,3,1));
		dlock1 = _mm_shuffle_ps(dlock1, partial_product3, _MM_SHUFFLE(2,0,2,0));

	//for (i=0; i<8; i += 4)
    //{
      //partial_product = _mm_setzero_pd();

      //for (k=0; k<8; k += 4)
	  //{
        //partial_product+= c[i][k]*tmp[8*j+k];

#define i 0
#define i4 4
#define k 0
		partial_product =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i][k]));
		partial_product1 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i+1][k]));
		partial_product2 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i+2][k]));
		partial_product3 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i+3][k]));
#undef k
#define k 4
		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i][k])));
		partial_product1 = _mm_add_ps(partial_product1,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i+1][k])));
		partial_product2 = _mm_add_ps(partial_product2,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i+2][k])));
		partial_product3 = _mm_add_ps(partial_product3,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i+3][k])));
#undef k
	  //}

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

      //v = (int) floor(partial_product+0.5);
	  clock = _mm_cvtps_epi32(partial_product);

#define k 0
		partial_product =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4][k]));
		partial_product1 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4+1][k]));
		partial_product2 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4+2][k]));
		partial_product3 =
			_mm_mul_ps(dlock, _mm_load_ps(&c[i4+3][k]));
#undef k
#define k 4
		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4][k])));
		partial_product1 = _mm_add_ps(partial_product1,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4+1][k])));
		partial_product2 = _mm_add_ps(partial_product2,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4+2][k])));
		partial_product3 = _mm_add_ps(partial_product3,
			_mm_mul_ps(dlock1, _mm_load_ps(&c[i4+3][k])));
#undef k

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

      //v = (int) floor(partial_product0+0.5);
	  clock1 = _mm_cvtps_epi32(partial_product);

      //block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
	  sign = _mm_load_si128((__m128*)sm256);
	  clock = _mm_max_epi16(clock, sign);
	  clock1 = _mm_max_epi16(clock1, sign);
	  sign = _mm_load_si128((__m128*)s255);
	  clock = _mm_min_epi16(clock, sign);
	  clock1 = _mm_min_epi16(clock1, sign);
	  
	  //v = (v < -256)? -256 : v;
	  //v = (v > 255)? 255 : v;
	  block[8*i+j] = _mm_extract_epi16(clock, 0);
	  block[8*(i+1)+j] = _mm_extract_epi16(clock, 2);
	  block[8*(i+2)+j] = _mm_extract_epi16(clock, 4);
	  block[8*(i+3)+j] = _mm_extract_epi16(clock, 6);
	  block[8*i4+j] = _mm_extract_epi16(clock1, 0);
	  block[8*(i4+1)+j] = _mm_extract_epi16(clock1, 2);
	  block[8*(i4+2)+j] = _mm_extract_epi16(clock1, 4);
	  block[8*(i4+3)+j] = _mm_extract_epi16(clock1, 6);
#undef i
#undef i4
    //}

	j++;
	//for (i=0; i<8; i += 4)
    //{
      //partial_product = _mm_setzero_pd();

      //for (k=0; k<8; k += 4)
	  //{
        //partial_product+= c[i][k]*tmp[8*j+k];

#define i 0
#define i4 4
#define k 0
		partial_product =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i][k]));
		partial_product1 =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i+1][k]));
		partial_product2 =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i+2][k]));
		partial_product3 =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i+3][k]));
#undef k
#define k 4
		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i][k])));
		partial_product1 = _mm_add_ps(partial_product1,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i+1][k])));
		partial_product2 = _mm_add_ps(partial_product2,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i+2][k])));
		partial_product3 = _mm_add_ps(partial_product3,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i+3][k])));
#undef k
	  //}

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

      //v = (int) floor(partial_product+0.5);
	  clock = _mm_cvtps_epi32(partial_product);

#define k 0
		partial_product =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i4][k]));
		partial_product1 =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i4+1][k]));
		partial_product2 =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i4+2][k]));
		partial_product3 =
			_mm_mul_ps(dlock2, _mm_load_ps(&c[i4+3][k]));
#undef k
#define k 4
		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i4][k])));
		partial_product1 = _mm_add_ps(partial_product1,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i4+1][k])));
		partial_product2 = _mm_add_ps(partial_product2,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i4+2][k])));
		partial_product3 = _mm_add_ps(partial_product3,
			_mm_mul_ps(dlock3, _mm_load_ps(&c[i4+3][k])));
#undef k

	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product2),
		  _mm_unpackhi_ps(partial_product, partial_product2));
	  partial_product1 = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product1, partial_product3),
		  _mm_unpackhi_ps(partial_product1, partial_product3));
	  partial_product = _mm_add_ps(
		  _mm_unpacklo_ps(partial_product, partial_product1),
		  _mm_unpackhi_ps(partial_product, partial_product1));

      //v = (int) floor(partial_product0+0.5);
	  clock1 = _mm_cvtps_epi32(partial_product);

      //block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
	  sign = _mm_load_si128((__m128*)sm256);
	  clock = _mm_max_epi16(clock, sign);
	  clock1 = _mm_max_epi16(clock1, sign);
	  sign = _mm_load_si128((__m128*)s255);
	  clock = _mm_min_epi16(clock, sign);
	  clock1 = _mm_min_epi16(clock1, sign);
	  
	  //v = (v < -256)? -256 : v;
	  //v = (v > 255)? 255 : v;
	  block[8*i+j] = _mm_extract_epi16(clock, 0);
	  block[8*(i+1)+j] = _mm_extract_epi16(clock, 2);
	  block[8*(i+2)+j] = _mm_extract_epi16(clock, 4);
	  block[8*(i+3)+j] = _mm_extract_epi16(clock, 6);
	  block[8*i4+j] = _mm_extract_epi16(clock1, 0);
	  block[8*(i4+1)+j] = _mm_extract_epi16(clock1, 2);
	  block[8*(i4+2)+j] = _mm_extract_epi16(clock1, 4);
	  block[8*(i4+3)+j] = _mm_extract_epi16(clock1, 6);
#undef i
#undef i4
    //}

	j++;
  }
}
