/* Reference_IDCT.c, Inverse Discrete Fourier Transform, double precision          */

/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

/*  Perform IEEE 1180 reference (64-bit floating point, separable 8x1
 *  direct matrix multiply) Inverse Discrete Cosine Transform
*/


/* Here we use math.h to generate constants.  Compiler results may
   vary a little */

#include <math.h>
#include <emmintrin.h>
#include <memory.h>

#ifndef PI
# ifdef M_PI
#  define PI M_PI
# else
#  define PI 3.14159265358979323846
# endif
#endif



#if 0
/* private data */

/* cosine transform matrix for 8x1 IDCT */
static _MM_ALIGN16 float c[8][8];

/* initialize DCT coefficient matrix */

void Initialize_Reference_IDCT_SSE()
{
  int freq, time;
  double scale;

  for (freq=0; freq < 8; freq++)
  {
    scale = (freq == 0) ? sqrt(0.125) : 0.5;
    for (time=0; time<8; time++)
      c[time][freq] = (float)(scale*cos((PI/8.0)*freq*(time + 0.5)));
  }
}

/* perform IDCT matrix multiply for 8x8 coefficient block */

void Reference_IDCT_SSE(block)
short *block;
{
  int i, j, k, v;
  __m128 partial_product;
  __m128i clock, sign;
  //const __m128i zero = _mm_setzero_si128();
  __m128 dlock;
  static _MM_ALIGN16 float tmp[64];

  for (i=0; i<8; i++)
    for (j=0; j<8; j++)
    {
      //partial_product1 = partial_product = _mm_setzero_pd();

      //for (k=0; k<8; k += 4)
	  {
		//partial_product+= c[j][k]*block[8*i+k];

		k = 0;
		clock = _mm_loadl_epi64((__m128i*)&block[8*i+k]);
		
		sign = _mm_cmpgt_epi16(_mm_setzero_si128(), clock);
		clock = _mm_unpacklo_epi16(clock, sign);
		
		dlock = _mm_cvtepi32_ps(clock);

		partial_product =
			_mm_mul_ps(dlock, _mm_load_ps(&c[j][k]));

		k = 4;
		clock = _mm_loadl_epi64((__m128i*)&block[8*i+k]);
		
		sign = _mm_cmpgt_epi16(_mm_setzero_si128(), clock);
		clock = _mm_unpacklo_epi16(clock, sign);
		
		dlock = _mm_cvtepi32_ps(clock);

		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(dlock, _mm_load_ps(&c[j][k])));
	  }

	  partial_product = _mm_add_ps(partial_product,
		  _mm_movehl_ps(partial_product, partial_product));
	  partial_product = _mm_add_ss(partial_product,
		  _mm_shuffle_ps(partial_product, partial_product, _MM_SHUFFLE(3,2,1,1)));

      _mm_store_ss(&tmp[8*j+i], partial_product);
    }

  /* Transpose operation is integrated into address mapping by switching 
     loop order of i and j */

  for (j=0; j<8; j++)
    for (i=0; i<8; i++)
    {
      //partial_product = _mm_setzero_pd();

      //for (k=0; k<8; k += 2)
	  {
        //partial_product+= c[i][k]*tmp[8*j+k];
		k = 0;
		partial_product =
			_mm_mul_ps(_mm_load_ps(&c[i][k]), _mm_load_ps(&tmp[8*j+k]));

		k = 4;
		partial_product = _mm_add_ps(partial_product,
			_mm_mul_ps(_mm_load_ps(&c[i][k]), _mm_load_ps(&tmp[8*j+k])));
	  }

	  partial_product = _mm_add_ps(partial_product,
		  _mm_movehl_ps(partial_product, partial_product));
	  partial_product = _mm_add_ss(partial_product,
		  _mm_shuffle_ps(partial_product, partial_product, _MM_SHUFFLE(3,2,1,1)));

      //v = (int) floor(partial_product+0.5);
	  v = _mm_cvtss_si32(partial_product);
	  
      //block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
	  v = (v < -256)? -256 : v;
	  v = (v > 255)? 255 : v;
	  block[8*i+j] = v;
	}
}
#endif

/********************************************************************/
/*THIS SAMPLE CODE IS PROVIDED "AS IS" WITH NO WARRANTIES			*/
/*WHATSOEVER, INCLUDING ANY WARRANTY OF MERCHANTABILITY, 			*/
/*NONINFRINGEMENT, FITNESS FOR ANY PARTICULAR PURPOSE, OR ANY 		*/
/*WARRANTY OTHERWISE ARISING OUT OF THIS SAMPLE.					*/
/********************************************************************/
/*            Copyright (c) 1999 - 2000 Intel Corporation.			*/
/*                     All rights reserved.							*/
/********************************************************************/

//=============================================================================
//  Name:      M128asm_dct_2d_16s_8x8_inv
//  Purpose:   Inverse Cosine Transform 8x8. 2D buffer of short int data

#include <dvec.h>
#define BITS_INV_ACC	4                                // 4 or 5 for IEEE
#define SHIFT_INV_ROW	16 - BITS_INV_ACC
#define SHIFT_INV_COL	1 + BITS_INV_ACC
const short RND_INV_ROW   = 1024 * (6 - BITS_INV_ACC);        //1 << (SHIFT_INV_ROW-1)
const short RND_INV_COL   = 16 * (BITS_INV_ACC - 3);          // 1 << (SHIFT_INV_COL-1)
const short RND_INV_CORR  = RND_INV_COL - 1;                  // correction -1.0 and round



Is16vec8 ivec_one_corr(1,	1,	1,	1,	1,	1,	1,	1);
Is16vec8 ivec_round_inv_row(0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW);
//__declspec(align(16)) short M128_one_corr[8] = {1,	1,	1,	1,	1,	1,	1,	1};
//__declspec(align(16)) short M128_round_inv_row[8] = {RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0, RND_INV_ROW, 0};

Is16vec8 ivec_round_inv_col(RND_INV_COL,  RND_INV_COL,  RND_INV_COL,  RND_INV_COL,	RND_INV_COL,  RND_INV_COL,  RND_INV_COL,  RND_INV_COL);
Is16vec8 ivec_round_inv_corr(RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR);
//__declspec(align(16)) short M128_round_inv_col[8] = {RND_INV_COL,  RND_INV_COL,  RND_INV_COL,  RND_INV_COL,	RND_INV_COL,  RND_INV_COL,  RND_INV_COL,  RND_INV_COL};
//__declspec(align(16)) short M128_round_inv_corr[8]= {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};
//round_frw_row  dword                 RND_FRW_ROW,                RND_FRW_ROW

Is16vec8 ivec_tg_1_16(13036,  13036,  13036,  13036,	13036,  13036,  13036,  13036);
Is16vec8 ivec_tg_2_16(27146,  27146,  27146,  27146,	27146,  27146,  27146,  27146);
Is16vec8 ivec_tg_3_16(-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746);
Is16vec8 ivec_cos_4_16(-19195, -19195, -19195, -19195,-19195, -19195, -19195, -19195);

//__declspec(align(16)) short  M128_tg_1_16[8] = {13036,  13036,  13036,  13036,	13036,  13036,  13036,  13036};	//  tg * (2<<16) + 0.5
//__declspec(align(16)) short  M128_tg_2_16[8] = {27146,  27146,  27146,  27146,	27146,  27146,  27146,  27146};	//  tg * (2<<16) + 0.5
//__declspec(align(16)) short  M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746};	//  tg * (2<<16) + 0.5
//__declspec(align(16)) short  M128_cos_4_16[8] = {-19195, -19195, -19195, -19195,-19195, -19195, -19195, -19195};//  cos * (2<<16) + 0.5
//ocos_4_16      sword   23170,  23170,  23170,  23170   ; cos * (2<<15) + 0.5


//-----------------------------------------------------------------------------

// Table for rows 0,4 - constants are multiplied on cos_4_16


//movq ->	w13 w12 w09 w08 w05 w04 w01 w00
//			w15 w14 w11 w10 w07 w06 w03 w02
//			w29 w28 w25 w24 w21 w20 w17 w16
//			w31 w30 w27 w26 w23 w22 w19 w18

__declspec(align(16)) short M128tab_i_04[] = {16384,	21407,	16384,	8867,	//movq ->	w05 w04 w01 w00
					16384,	-8867,  16384, -21407,		//			w13 w12 w09 w08
					16384,	8867,	-16384,	-21407,		//			w07 w06 w03 w02
					-16384,	21407,  16384,  -8867,		//			w15 w14 w11 w10
					22725,	19266,  19266,  -4520,		//			w21 w20 w17 w16
					12873,	-22725,	4520,	-12873,		//			w29 w28 w25 w24
					12873,	4520,	-22725, -12873,		//			w23 w22 w19 w18
					4520,	19266,  19266,	-22725};	//			w31 w30 w27 w26

// Table for rows 1,7 - constants are multiplied on cos_1_16

__declspec(align(16)) short M128tab_i_17[] = {22725,  29692,  22725,  12299,	//movq ->	w05 w04 w01 w00
					22725,	-12299, 22725,	-29692,		//			w13 w12 w09 w08
					22725,  12299,	-22725, -29692,		//			w07 w06 w03 w02
					-22725,	29692,  22725,	-12299,		//			w15 w14 w11 w10
					31521,  26722,  26722,  -6270,		//			w21 w20 w17 w16
					17855,	-31521, 6270,	-17855,		//			w29 w28 w25 w24
					17855,  6270,	-31521, -17855,		//			w23 w22 w19 w18
					6270,	26722,  26722,	-31521};	//			w31 w30 w27 w26

// Table for rows 2,6 - constants are multiplied on cos_2_16

__declspec(align(16)) short M128tab_i_26[] = {21407,  27969,  21407,  11585,	//movq ->	w05 w04 w01 w00
					21407,	-11585, 21407,	-27969,		//			w13 w12 w09 w08
					21407,  11585,	-21407, -27969,		//			w07 w06 w03 w02
					-21407, 27969,  21407,	-11585,		//			w15 w14 w11 w10
					29692,  25172,  25172,  -5906,		//			w21 w20 w17 w16
					16819,	-29692, 5906,	-16819,		//			w29 w28 w25 w24
					16819,  5906,	-29692, -16819,		//			w23 w22 w19 w18
					5906,	25172,  25172,	-29692};	//			w31 w30 w27 w26

// Table for rows 3,5 - constants are multiplied on cos_3_16

__declspec(align(16)) short M128tab_i_35[] = {19266,  25172,  19266,  10426,	//movq ->	w05 w04 w01 w00
					19266,	-10426, 19266,	-25172,		//			w13 w12 w09 w08
					19266,  10426,	-19266, -25172,		//			w07 w06 w03 w02
					-19266, 25172,  19266,	-10426,		//			w15 w14 w11 w10
					26722,  22654,  22654,  -5315,		//			w21 w20 w17 w16
					15137,	-26722, 5315,	-15137,		//			w29 w28 w25 w24
					15137,  5315,	-26722, -15137,		//			w23 w22 w19 w18
					5315,	22654,  22654,	-26722};	//			w31 w30 w27 w26

Is16vec8 *ivec_tab_i_04 = (Is16vec8*)M128tab_i_04;
Is16vec8 *ivec_tab_i_17 = (Is16vec8*)M128tab_i_17;
Is16vec8 *ivec_tab_i_26 = (Is16vec8*)M128tab_i_26;
Is16vec8 *ivec_tab_i_35 = (Is16vec8*)M128tab_i_35;

//-----------------------------------------------------------------------------

/*
;=============================================================================
;=============================================================================
;=============================================================================
;
; Inverse DCT
;
;-----------------------------------------------------------------------------
;
; This implementation calculates iDCT-2D by a row-column method.
; On the first stage the iDCT-1D is calculated for each row with use
; direct algorithm, on the second stage the calculation is executed
; at once for four columns with use of scaled iDCT-1D algorithm.
; Base R&Y algorithm for iDCT-1D is modified for second stage.
;
;=============================================================================

;-----------------------------------------------------------------------------
;
; The first stage - inverse DCTs of rows
;
;-----------------------------------------------------------------------------
; The 8-point inverse DCT direct algorithm
;-----------------------------------------------------------------------------
;
; static const short w[32] = {
;    FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
;    FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
;    FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
;    FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
;    FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
;    FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
;    FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
;    FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
;
; #define DCT_8_INV_ROW(x, y)
; {
;    int a0, a1, a2, a3, b0, b1, b2, b3;
;
;    a0   = x[0] * w[ 0] + x[2] * w[ 1] + x[4] * w[ 2] + x[6] * w[ 3];
;    a1   = x[0] * w[ 4] + x[2] * w[ 5] + x[4] * w[ 6] + x[6] * w[ 7];
;    a2   = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
;    a3   = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
;    b0   = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
;    b1   = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
;    b2   = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
;    b3   = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
;
;    y[0] = SHIFT_ROUND ( a0 + b0 );
;    y[1] = SHIFT_ROUND ( a1 + b1 );
;    y[2] = SHIFT_ROUND ( a2 + b2 );
;    y[3] = SHIFT_ROUND ( a3 + b3 );
;    y[4] = SHIFT_ROUND ( a3 - b3 );
;    y[5] = SHIFT_ROUND ( a2 - b2 );
;    y[6] = SHIFT_ROUND ( a1 - b1 );
;    y[7] = SHIFT_ROUND ( a0 - b0 );
; }
;
;-----------------------------------------------------------------------------
;
; In this implementation the outputs of the iDCT-1D are multiplied
;    for rows 0,4 - on cos_4_16,
;    for rows 1,7 - on cos_1_16,
;    for rows 2,6 - on cos_2_16,
;    for rows 3,5 - on cos_3_16
; and are shifted to the left for rise of accuracy
;
; For used constants
;    FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
;
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
;
; The second stage - inverse DCTs of columns
;
; The inputs are multiplied
;    for rows 0,4 - on cos_4_16,
;    for rows 1,7 - on cos_1_16,
;    for rows 2,6 - on cos_2_16,
;    for rows 3,5 - on cos_3_16
; and are shifted to the left for rise of accuracy
;
;-----------------------------------------------------------------------------
;
; The 8-point scaled inverse DCT algorithm (26a8m)
;
;-----------------------------------------------------------------------------
;
; #define DCT_8_INV_COL(x, y)
; {
;    short t0, t1, t2, t3, t4, t5, t6, t7;
;    short tp03, tm03, tp12, tm12, tp65, tm65;
;    short tp465, tm465, tp765, tm765;
;
;    tp765 = x[1]           + x[7] * tg_1_16;
;    tp465 = x[1] * tg_1_16 - x[7];
;    tm765 = x[5] * tg_3_16 + x[3];
;    tm465 = x[5]           - x[3] * tg_3_16;
;
;    t7    = tp765 + tm765;
;    tp65  = tp765 - tm765;
;    t4    = tp465 + tm465;
;    tm65  = tp465 - tm465;
;
;    t6    = ( tp65 + tm65 ) * cos_4_16;
;    t5    = ( tp65 - tm65 ) * cos_4_16;
;
;    tp03  = x[0] + x[4];
;    tp12  = x[0] - x[4];
;
;    tm03  = x[2]           + x[6] * tg_2_16;
;    tm12  = x[2] * tg_2_16 - x[6];
;
;    t0    = tp03 + tm03;
;    t3    = tp03 - tm03;
;    t1    = tp12 + tm12;
;    t2    = tp12 - tm12;
;
;    y[0]  = SHIFT_ROUND ( t0 + t7 );
;    y[7]  = SHIFT_ROUND ( t0 - t7 );
;    y[1]  = SHIFT_ROUND ( t1 + t6 );
;    y[6]  = SHIFT_ROUND ( t1 - t6 );
;    y[2]  = SHIFT_ROUND ( t2 + t5 );
;    y[5]  = SHIFT_ROUND ( t2 - t5 );
;    y[3]  = SHIFT_ROUND ( t3 + t4 );
;    y[4]  = SHIFT_ROUND ( t3 - t4 );
; }
;
;-----------------------------------------------------------------------------
*/
//emm7 = round_inv_row
#define DCT_8_INV_ROWX2															\
		row = (Is16vec8)_mm_shufflelo_epi16(row,0xd8);							\
		row20 = (Is16vec8)_mm_shuffle_epi32(row,0x00);							\
		row20 = (Is16vec8)_mm_madd_epi16(row20,table1[0]);						\
		row31 = (Is16vec8)_mm_shuffle_epi32(row,0x55);							\
		row = (Is16vec8)_mm_shufflehi_epi16(row,0xd8);							\
		row31 = (Is16vec8)_mm_madd_epi16(row31,table1[2]);						\
		/*-----------------------------------------------------------------*/	\
		row64 = (Is16vec8)_mm_shuffle_epi32(row,0xaa);							\
		row75 = (Is16vec8)_mm_shuffle_epi32(row,0xff);							\
		/*-----------------------------------------------------------------*/	\
		row64 = (Is16vec8)_mm_madd_epi16(row64,table1[1]);						\
		r2ow = (Is16vec8)_mm_shufflehi_epi16(r2ow,0xd8);						\
		row20 = (Is32vec4)row20 + (Is32vec4)ivec_round_inv_row;					\
		r2ow = (Is16vec8)_mm_shufflelo_epi16(r2ow,0xd8);						\
		row75 = (Is16vec8)_mm_madd_epi16(row75,table1[3]);						\
		r2ow20 = (Is16vec8)_mm_shuffle_epi32(r2ow,0x00);						\
		r2ow64 = (Is16vec8)_mm_shuffle_epi32(r2ow,0xaa);						\
		r2ow20 = (Is16vec8)_mm_madd_epi16(r2ow20,table2[0]);					\
		/*-----------------------------------------------------------------*/	\
		a = (Is32vec4)row20 + (Is32vec4)row64;									\
		r2ow31 = (Is16vec8)_mm_shuffle_epi32(r2ow,0x55);						\
		r2ow64 = (Is16vec8)_mm_madd_epi16(r2ow64,table2[1]);					\
		b = (Is32vec4)row31 + (Is32vec4)row75;									\
		r2ow75 = (Is16vec8)_mm_shuffle_epi32(r2ow,0xff);						\
		Yhigh = ((Is32vec4)((Is32vec4)a-(Is32vec4)b)) >> SHIFT_INV_ROW;			\
		r2ow31 = (Is16vec8)_mm_madd_epi16(r2ow31,table2[2]);					\
		Ylow =  ((Is32vec4)((Is32vec4)a+(Is32vec4)b)) >> SHIFT_INV_ROW;			\
		r2ow20 = (Is32vec4)r2ow20 + (Is32vec4)ivec_round_inv_row;				\
		r2ow75 = (Is16vec8)_mm_madd_epi16(r2ow75,table2[3]);					\
		a2 = (Is32vec4)r2ow20 + (Is32vec4)r2ow64;								\
		Yhigh = (Is16vec8)_mm_shuffle_epi32(Yhigh, 0x1b);						\
		b2 = (Is32vec4)r2ow31 + (Is32vec4)r2ow75;								\
		Y2high = ((Is32vec4)((Is32vec4)a2-(Is32vec4)b2)) >> SHIFT_INV_ROW;		\
		Y2low =  ((Is32vec4)((Is32vec4)a2+(Is32vec4)b2)) >> SHIFT_INV_ROW;		\
		Y2high = (Is16vec8)_mm_shuffle_epi32(Y2high, 0x1b);						\






Is16vec8 tm765;
Is16vec8 tm465;
Is16vec8 tp765;
Is16vec8 tp465;
Is16vec8 tm65;
Is16vec8 tp65;
Is16vec8 tmp;
Is16vec8 tm03;
Is16vec8 tp03;
Is16vec8 tm12;
Is16vec8 tp12;
Is16vec8 t0;
Is16vec8 t1;
Is16vec8 t2;
Is16vec8 t3;
Is16vec8 t4;
Is16vec8 t5;
Is16vec8 t6;
Is16vec8 t7;
 


void Initialize_Reference_IDCT_SSE()
{
}

/* perform IDCT matrix multiply for 8x8 coefficient block */

void Reference_IDCT_SSE(short* block)
//assumes src and destination are aligned on a 16-byte boundary																					
//int idct_M128IVEC::TheCode()
{
	static _MM_ALIGN16 short dest[64];
//	assert(((src & 0xf) == 0)&&((dst & 0xf) == 0))	//aligned on 16-byte boundary
		Is16vec8* src = (Is16vec8*)block;
		Is16vec8* dst = (Is16vec8*)dest;
		Is16vec8 row20, row64, row31, row75, r2ow20, r2ow64, r2ow31, r2ow75;
		Is16vec8 a, b, Ylow, Yhigh, a2, b2, Y2low, Y2high;
		
		Is16vec8 row = src[2], r2ow = src[3];
		Is16vec8 *table1 = ivec_tab_i_26, *table2 = ivec_tab_i_35;
		//
		DCT_8_INV_ROWX2;	//Row 3, tab_i_26	//Row 4, tab_i_35
		//
		dst[2] = (Is16vec8)_mm_packs_epi32(Ylow, Yhigh);
		dst[3] = (Is16vec8)_mm_packs_epi32(Y2low, Y2high);
		
		row = src[4];			r2ow = src[5];
		table1 = ivec_tab_i_04;	//table2 = ivec_tab_i_35;
		//
		DCT_8_INV_ROWX2;	//Row 5, tab_i_04	//Row 6, tab_i_35
		//
		dst[4] = (Is16vec8)_mm_packs_epi32(Ylow, Yhigh);
		dst[5] = (Is16vec8)_mm_packs_epi32(Y2low, Y2high);

		row = src[0];			r2ow = src[1];
		/*table1 = ivec_tab_i_04;*/	table2 = ivec_tab_i_17;
		//
		DCT_8_INV_ROWX2;	//Row 1, tab_i_04	//Row 2, tab_i_17
		//
		dst[0] = (Is16vec8)_mm_packs_epi32(Ylow, Yhigh);						
		dst[1] = (Is16vec8)_mm_packs_epi32(Y2low, Y2high);

		row = src[6];			r2ow = src[7];
		table1 = ivec_tab_i_26;	//table2 = ivec_tab_i_17;
		//
		DCT_8_INV_ROWX2;	//Row 7, tab_i_26	//Row 8, tab_i_17
		//
		dst[6] = (Is16vec8)_mm_packs_epi32(Ylow, Yhigh);
		Y2high = (Is16vec8)_mm_packs_epi32(Y2low, Y2high);	//store in Y2high so compiler can keep value in SIMD register
		

		//
		tm765 = mul_high(ivec_tg_3_16,dst[5]) + dst[5] + dst[3];	
		tm465 = dst[5] - (mul_high(ivec_tg_3_16,dst[3]) + dst[3]);	
		tp765 = mul_high(Y2high,ivec_tg_1_16) + dst[1];				
		tp465 = mul_high(dst[1],ivec_tg_1_16) - Y2high;				
		/*-------------------------------------------------------*/	
		tm65 = tp465 - tm465 + ivec_one_corr;						
		tp65 = tp765 - tm765;										
		t4 = tp465 + tm465;											
		t7 = tp765 + tm765 + ivec_one_corr;							
		/*-------------------------------------------------------*/	
		tmp = tp65 + tm65;											
		t6 = mul_high(tmp,ivec_cos_4_16) + tmp;						
		t6 |= ivec_one_corr;										
		tmp = tp65 - tm65;											
		t5 = mul_high(tmp,ivec_cos_4_16) + tmp;						
		t5 |= ivec_one_corr;										
		/*-------------------------------------------------------*/	
		tm03 = mul_high(dst[6],ivec_tg_2_16) + dst[2];				
		tm12 = mul_high(dst[2],ivec_tg_2_16) - dst[6];				
		tp03 = dst[0] + dst[4];										
		tp12 = dst[0] - dst[4];										
		/*-------------------------------------------------------*/	
		t0 = tp03 + tm03 + ivec_round_inv_col;						
		t3 = tp03 - tm03 + ivec_round_inv_corr;						
		t1 = tp12 + tm12 + ivec_round_inv_col;						
		t2 = tp12 - tm12 + ivec_round_inv_corr;						
		/*-------------------------------------------------------*/	
		dst[0] = (t0+t7) >> SHIFT_INV_COL;							
		dst[1] = (t1+t6) >> SHIFT_INV_COL;							
		dst[2] = (t2+t5) >> SHIFT_INV_COL;							
		dst[3] = (t3+t4) >> SHIFT_INV_COL;							
		dst[4] = (t3-t4) >> SHIFT_INV_COL;							
		dst[5] = (t2-t5) >> SHIFT_INV_COL;							
		dst[6] = (t1-t6) >> SHIFT_INV_COL;							
		dst[7] = (t0-t7) >> SHIFT_INV_COL;								

		//__asm	emms
		//return(EXIT_SUCCESS);
		memcpy(block, dest, sizeof(dest));
}
