/* 
 *  FrameSource.cpp
 *
 *	Copyright (C) Alberto Vigata - July 2000 - ultraflask@yahoo.com
 *
 *  This file is part of FlasKMPEG, a free MPEG to MPEG/AVI converter
 *	
 *  FlasKMPEG is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *   
 *  FlasKMPEG is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *   
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */

#include <emmintrin.h>
#include "FrameSource.h"
#include "flmemcpy.h"
#include "debug.h"

// SSE2 global constants.
// Each table holds the same 64-bit pattern twice so it can be loaded as one
// 16-byte-aligned 128-bit operand. The patterns are identical to the MMX
// constants further down. In the part of the file these definitions are
// followed by, the xmm* tables are referenced by From444toRGB32_SSE2.
//
// xmmmask_NNNN: the 16-bit value NNNN replicated into every word. Note the
// numeric suffix is decimal for 0016 (0x0010) and 0128 (0x0080), while the
// 0040 table holds 0x0040 (64 decimal).
static _MM_ALIGN16 const __int64 xmmmask_0001[2] = {  0x0001000100010001,  0x0001000100010001 };
static _MM_ALIGN16 const __int64 xmmmask_0002[2] = {  0x0002000200020002,  0x0002000200020002 };
static _MM_ALIGN16 const __int64 xmmmask_0003[2] = {  0x0003000300030003,  0x0003000300030003 };
static _MM_ALIGN16 const __int64 xmmmask_0004[2] = {  0x0004000400040004,  0x0004000400040004 };
static _MM_ALIGN16 const __int64 xmmmask_0005[2] = {  0x0005000500050005,  0x0005000500050005 };
static _MM_ALIGN16 const __int64 xmmmask_0007[2] = {  0x0007000700070007,  0x0007000700070007 };
static _MM_ALIGN16 const __int64 xmmmask_0016[2] = {  0x0010001000100010,  0x0010001000100010 };
static _MM_ALIGN16 const __int64 xmmmask_0040[2] = {  0x0040004000400040,  0x0040004000400040 };
static _MM_ALIGN16 const __int64 xmmmask_0128[2] = {  0x0080008000800080,  0x0080008000800080 };
// YUV -> RGB coefficients used with pmaddwd followed by a shift right of 13,
// i.e. fixed point with 13 fractional bits (value / 8192):
//   cbu     : 0x408D = 16525 (~2.017), paired with a zero word
//   cgu_cgv : high word 0xF377 = -3209 (~-0.392), low word 0xE5FC = -6660 (~-0.813)
//   crv     : 0x3313 = 13075 (~1.596), paired with a zero word
//   Scale   : low word 0x2543 = 9539 (~1.164), high word 0x1000 = 4096 (half of 8192)
static _MM_ALIGN16 const __int64 xmmmask_cbu[2] = {  0x0000408D0000408D,  0x0000408D0000408D };
static _MM_ALIGN16 const __int64 xmmmask_cgu_cgv[2] = {  0xF377E5FCF377E5FC,  0xF377E5FCF377E5FC };
static _MM_ALIGN16 const __int64 xmmmask_crv[2] = {  0x0000331300003313,  0x0000331300003313 };
static _MM_ALIGN16 const __int64 xYUVRGB_Scale[2] = {  0x1000254310002543,  0x1000254310002543 };

// MMX global constants.
// 64-bit versions of the tables above (same word patterns), loaded with movq
// or used directly as memory operands by the MMX routines in this file.
static _MM_ALIGN16 const __int64 mmmask_0001 = 0x0001000100010001;
static _MM_ALIGN16 const __int64 mmmask_0002 = 0x0002000200020002;
static _MM_ALIGN16 const __int64 mmmask_0003 = 0x0003000300030003;
static _MM_ALIGN16 const __int64 mmmask_0004 = 0x0004000400040004;
static _MM_ALIGN16 const __int64 mmmask_0005 = 0x0005000500050005;
static _MM_ALIGN16 const __int64 mmmask_0007 = 0x0007000700070007;
static _MM_ALIGN16 const __int64 mmmask_0016 = 0x0010001000100010;
static _MM_ALIGN16 const __int64 mmmask_0040 = 0x0040004000400040;
static _MM_ALIGN16 const __int64 mmmask_0128 = 0x0080008000800080;
// Fixed-point (13 fractional bits) YUV -> RGB coefficients; see the notes on
// the xmm* variants for the decoded values.
static _MM_ALIGN16 const __int64 mmmask_cbu = 0x0000408D0000408D;
static _MM_ALIGN16 const __int64 mmmask_cgu_cgv = 0xF377E5FCF377E5FC;
static _MM_ALIGN16 const __int64 mmmask_crv = 0x0000331300003313;
static _MM_ALIGN16 const __int64 YUVRGB_Scale = 0x1000254310002543;


// Vertically upsamples one chroma plane by a factor of two (4:2:0 -> 4:2:2)
// with MMX.
//
//   src        - source plane; rows are width/2 bytes, height/2 rows are read
//   dst        - destination plane; rows are width/2 bytes, height rows are written
//   width      - luma width; each chroma row (width/2 bytes) is processed four
//                bytes per iteration
//   height     - luma height
//   frame_type - non-zero selects the frame-based filter (labels ending in 'p',
//                presumably "progressive"); zero selects the field-based filter
//                (labels ending in 'i', presumably "interlaced"), which only
//                combines rows that are two source rows apart
//
// Frame-based filter, for source row k producing output rows 2k and 2k+1:
//   out[2k]   = (3*src[k] + src[k-1] + 2) >> 2
//   out[2k+1] = (3*src[k] + src[k+1] + 2) >> 2
// The first output row is a copy of src[0] and the last output row is a copy
// of the last source row.
//
// Field-based filter, for source rows 2k / 2k+1 producing output rows 4k..4k+3:
//   out[4k]   = (7*src[2k]   +   src[2k-2] + 4) >> 3
//   out[4k+1] = (5*src[2k+1] + 3*src[2k-1] + 4) >> 3
//   out[4k+2] = (3*src[2k+2] + 5*src[2k]   + 4) >> 3
//   out[4k+3] = (  src[2k+3] + 7*src[2k+1] + 4) >> 3
// with special handling of the first and last row pair (see the comments in
// the code).
//
// Every loop is bottom-tested, so each one runs at least once regardless of
// width/height; the caller is assumed to pass dimensions large enough for
// that (TODO confirm: width/2 a multiple of 4, height >= 6 for the
// frame-based path and height >= 12 for the field-based path).
void From420to422(unsigned char *src, unsigned char *dst, int width, int height, int frame_type)
{

    int hwidth    = width / 2;       // bytes per chroma row
    int dwidth    = width * 2;       // four chroma rows
    int hheightd2 = height / 2 - 2;  // frame-based: source rows between first and last
    int qheightd2 = height / 4 - 2;  // field-based: row pairs between first and last
    
    if (frame_type)
    {
      __asm
      {
          mov         eax, [src]              // eax -> current source row (row 0)
          mov         ebx, [dst]              // ebx -> even output row
          mov         ecx, ebx
          add         ecx, [hwidth]           // ecx -> odd output row
          mov         esi, 0x00               // esi = byte offset inside the row
          movq        mm3, [mmmask_0003]      // weight 3 in every word
          pxor        mm0, mm0                // zero, used for byte <-> word conversion
          movq        mm4, [mmmask_0002]      // rounding term for the >> 2
          
          mov         edx, eax
          add         edx, [hwidth]           // edx -> source row 1

          // First source row: out[0] = src[0], out[1] = (3*src[0] + src[1] + 2) >> 2
convyuv422topp:
          movd        mm1, [eax+esi]
          movd        mm2, [edx+esi]
          movd        [ebx+esi], mm1
          punpcklbw   mm1, mm0
          pmullw      mm1, mm3
          paddusw     mm1, mm4
          punpcklbw   mm2, mm0
          paddusw     mm2, mm1
          psrlw       mm2, 0x02
          packuswb    mm2, mm0
          
          add         esi, 0x04
          cmp         esi, [hwidth]
          movd        [ecx+esi-4], mm2        // movd leaves the flags from cmp intact
          jl          convyuv422topp
          
          add         eax, [hwidth]           // next source row
          add         ebx, [width]            // two output rows further
          add         ecx, [width]
          mov         esi, 0x00
          
          mov         edi, [hheightd2]        // number of middle source rows

          // Middle source rows.
convyuv422p:
          movd        mm1, [eax+esi]          // src[k]
          
          punpcklbw   mm1, mm0
          mov         edx, eax
          
          pmullw      mm1, mm3                // 3 * src[k]
          sub         edx, [hwidth]
          
          movd        mm5, [edx+esi]          // src[k-1], for the even output row

          // The odd output row blends with the row BELOW. Previously this
          // load re-read src[k-1], which made both output rows identical.
          mov         edx, eax
          add         edx, [hwidth]
          movd        mm2, [edx+esi]          // src[k+1], for the odd output row
          
          punpcklbw   mm5, mm0
          punpcklbw   mm2, mm0
          paddusw     mm5, mm1
          paddusw     mm2, mm1
          paddusw     mm5, mm4
          paddusw     mm2, mm4
          psrlw       mm5, 0x02
          psrlw       mm2, 0x02
          packuswb    mm5, mm0
          packuswb    mm2, mm0
          
          add         esi, 0x04
          cmp         esi, [hwidth]
          movd        [ebx+esi-4], mm5
          movd        [ecx+esi-4], mm2
          
          jl          convyuv422p
          
          add         eax, [hwidth]
          add         ebx, [width]
          add         ecx, [width]
          mov         esi, 0x00
          sub         edi, 0x01
          cmp         edi, 0x00
          jg          convyuv422p
          
          mov         edx, eax
          sub         edx, [hwidth]           // edx -> second-to-last source row

          // Last source row: even row blends with the row above, odd row is a copy.
convyuv422bottomp:
          movd        mm1, [eax+esi]
          movd        mm5, [edx+esi]
          punpcklbw   mm5, mm0
          movd        [ecx+esi], mm1
          
          punpcklbw   mm1, mm0
          pmullw      mm1, mm3
          paddusw     mm5, mm1
          paddusw     mm5, mm4
          psrlw       mm5, 0x02
          packuswb    mm5, mm0
          
          add         esi, 0x04
          cmp         esi, [hwidth]
          movd        [ebx+esi-4], mm5
          jl          convyuv422bottomp
          
          emms
      }
    }
    else
    {
      __asm
      {
          mov         eax, [src]              // eax -> source row 2k
          mov         ecx, [dst]              // ecx -> output row 4k
          mov         esi, 0x00               // esi = byte offset inside the row
          pxor        mm0, mm0                // zero, used for byte <-> word conversion
          movq        mm3, [mmmask_0003]      // weight 3
          movq        mm4, [mmmask_0004]      // rounding term for the >> 3
          movq        mm5, [mmmask_0005]      // weight 5
          
          // First row pair (source rows 0 and 1):
          //   out[0] = src[0]
          //   out[1] = (5*src[1] + 3*src[0] + 4) >> 3
          //   out[2] = (3*src[2] + 5*src[0] + 4) >> 3
          //   out[3] = (  src[3] + 7*src[1] + 4) >> 3
          // NOTE(review): out[1] blends src[1] with src[0], the only place
          // apart from the mirrored case at the bottom where rows one apart
          // are mixed -- confirm this edge treatment is intended.
convyuv422topi:
          movd        mm1, [eax+esi]          // src[0]
          mov         ebx, eax
          add         ebx, [hwidth]
          movd        mm2, [ebx+esi]          // src[1]
          movd        [ecx+esi], mm1          // out[0]
          punpcklbw   mm1, mm0
          movq        mm6, mm1                // keep src[0] for out[2]
          pmullw      mm1, mm3
          
          punpcklbw   mm2, mm0
          movq        mm7, mm2                // keep src[1] for out[3]
          pmullw      mm2, mm5
          paddusw     mm2, mm1
          paddusw     mm2, mm4
          psrlw       mm2, 0x03
          packuswb    mm2, mm0
          
          mov         edx, ecx
          add         edx, [hwidth]
          pmullw      mm6, mm5
          movd        [edx+esi], mm2          // out[1]
          
          add         ebx, [hwidth]
          movd        mm2, [ebx+esi]          // src[2]
          punpcklbw   mm2, mm0
          pmullw      mm2, mm3
          paddusw     mm2, mm6
          paddusw     mm2, mm4
          psrlw       mm2, 0x03
          packuswb    mm2, mm0
          
          add         edx, [hwidth]
          add         ebx, [hwidth]
          pmullw      mm7, [mmmask_0007]
          movd        [edx+esi], mm2          // out[2]
          
          movd        mm2, [ebx+esi]          // src[3]
          punpcklbw   mm2, mm0
          paddusw     mm2, mm7
          paddusw     mm2, mm4
          psrlw       mm2, 0x03
          packuswb    mm2, mm0
          
          add         edx, [hwidth]
          add         esi, 0x04
          cmp         esi, [hwidth]
          movd        [edx+esi-4], mm2        // out[3]
          
          jl          convyuv422topi
          
          add         eax, [width]            // two source rows further
          add         ecx, [dwidth]           // four output rows further
          mov         esi, 0x00
          
          mov         edi, [qheightd2]        // number of middle row pairs

          // Middle row pairs; formulas as given in the function header.
convyuv422i:
          movd        mm1, [eax+esi]          // src[2k]
          punpcklbw   mm1, mm0
          movq        mm6, mm1                // keep src[2k] for out[4k+2]
          mov         ebx, eax
          sub         ebx, [width]
          movd        mm3, [ebx+esi]          // src[2k-2]; mm3 is reloaded with 3 below
          pmullw      mm1, [mmmask_0007]
          punpcklbw   mm3, mm0
          paddusw     mm3, mm1
          paddusw     mm3, mm4
          psrlw       mm3, 0x03
          packuswb    mm3, mm0
          
          add         ebx, [hwidth]
          movq        mm1, [ebx+esi]          // src[2k-1]; only the low 4 bytes are used
          add         ebx, [width]
          movd        [ecx+esi], mm3          // out[4k]
          
          movq        mm3, [mmmask_0003]      // restore weight 3
          movd        mm2, [ebx+esi]          // src[2k+1]
          
          punpcklbw   mm1, mm0
          pmullw      mm1, mm3
          punpcklbw   mm2, mm0
          movq        mm7, mm2                // keep src[2k+1] for out[4k+3]
          pmullw      mm2, mm5
          paddusw     mm2, mm1
          paddusw     mm2, mm4
          psrlw       mm2, 0x03
          packuswb    mm2, mm0
          
          pmullw      mm6, mm5
          mov         edx, ecx
          add         edx, [hwidth]
          movd        [edx+esi], mm2          // out[4k+1]
          
          add         ebx, [hwidth]
          movd        mm2, [ebx+esi]          // src[2k+2]
          punpcklbw   mm2, mm0
          pmullw      mm2, mm3
          paddusw     mm2, mm6
          paddusw     mm2, mm4
          psrlw       mm2, 0x03
          packuswb    mm2, mm0
          
          pmullw      mm7, [mmmask_0007]
          add         edx, [hwidth]
          add         ebx, [hwidth]
          movd        [edx+esi], mm2          // out[4k+2]
          
          movd        mm2, [ebx+esi]          // src[2k+3]
          punpcklbw   mm2, mm0
          paddusw     mm2, mm7
          paddusw     mm2, mm4
          psrlw       mm2, 0x03
          packuswb    mm2, mm0
          
          add         edx, [hwidth]
          add         esi, 0x04
          cmp         esi, [hwidth]
          movd        [edx+esi-4], mm2        // out[4k+3]
          
          jl          convyuv422i
          add         eax, [width]
          add         ecx, [dwidth]
          mov         esi, 0x00
          sub         edi, 0x01
          cmp         edi, 0x00
          jg          convyuv422i
          
          // Last row pair (source rows 2k and 2k+1 are the final two rows):
          //   out[4k]   = (7*src[2k]   +   src[2k-2] + 4) >> 3
          //   out[4k+1] = (5*src[2k+1] + 3*src[2k-1] + 4) >> 3
          //   out[4k+2] = src[2k]
          //   out[4k+3] = (  src[2k]   + 7*src[2k+1] + 4) >> 3
convyuv422bottomi:
          movd        mm1, [eax+esi]          // src[2k]
          movq        mm6, mm1                // packed copy, stored as out[4k+2]
          punpcklbw   mm1, mm0
          mov         ebx, eax
          sub         ebx, [width]
          movd        mm3, [ebx+esi]          // src[2k-2]
          punpcklbw   mm3, mm0
          pmullw      mm1, [mmmask_0007]
          paddusw     mm3, mm1
          paddusw     mm3, mm4
          psrlw       mm3, 0x03
          packuswb    mm3, mm0
          
          add         ebx, [hwidth]
          movq        mm1, [ebx+esi]          // src[2k-1]; only the low 4 bytes are used
          punpcklbw   mm1, mm0
          movd        [ecx+esi], mm3          // out[4k]
          
          pmullw      mm1, [mmmask_0003]
          add         ebx, [width]
          movd        mm2, [ebx+esi]          // src[2k+1]
          punpcklbw   mm2, mm0
          movq        mm7, mm2
          pmullw      mm2, mm5
          paddusw     mm2, mm1
          paddusw     mm2, mm4
          psrlw       mm2, 0x03
          packuswb    mm2, mm0
          
          mov         edx, ecx
          add         edx, [hwidth]
          pmullw      mm7, [mmmask_0007]
          movd        [edx+esi], mm2          // out[4k+1]
          
          add         edx, [hwidth]
          movd        [edx+esi], mm6          // out[4k+2]
          
          punpcklbw   mm6, mm0
          paddusw     mm6, mm7
          paddusw     mm6, mm4
          psrlw       mm6, 0x03
          packuswb    mm6, mm0
          
          add         edx, [hwidth]
          add         esi, 0x04
          cmp         esi, [hwidth]
          movd        [edx+esi-4], mm6        // out[4k+3]
          
          jl          convyuv422bottomi
          
          emms
      }
    }
  }

// Horizontally upsamples one chroma plane by a factor of two (4:2:2 -> 4:4:4)
// with MMX.
//
//   src    - source plane, rows of width/2 bytes
//   dst    - destination plane, rows of width bytes
//   width  - output row length in bytes
//   height - number of rows to process
//
// For every source sample s[i] two bytes are written: s[i] itself followed by
// (s[i] + s[i+1] + 1) >> 1. The last eight samples of each row have no right
// neighbour inside the row and are simply written twice.
//
// The inner loop is bottom-tested and always reads the following 8 source
// bytes, so it assumes width/2 is a multiple of 8 and at least 16 (TODO
// confirm with callers).
void From422to444(unsigned char *src, unsigned char *dst, int width, int height)
{
    int hwidthd8 = width / 2 - 8;   // loop bound: everything except the last 8 samples
    int hwidth    = width / 2;      // source row length in bytes

    __asm
    {
      mov			eax, [src]
        mov			ebx, [dst]
        mov			edi, [height]
        
        // mm1 = rounding term (1 per word), mm0 = zero for unpacking
        movq		mm1, [mmmask_0001]
        pxor		mm0, mm0
        
        // Start of a row: preload the first 8 samples into mm7.
convyuv444init:
      movq		mm7, [eax]
        mov			esi, 0x00
        
        // mm2 = current 8 samples, mm7 = the 8 samples after them
convyuv444:
      movq		mm2, mm7
        movq		mm7, [eax+esi+8]
        movq		mm3, mm2
        movq		mm4, mm7
        
        // mm3 = right neighbours: current samples shifted down by one byte,
        // with the first byte of the next group moved into the top byte
        psrlq		mm3, 8
        psllq		mm4, 56
        por			mm3, mm4
        
        movq		mm4, mm2
        movq		mm5, mm3
        
        // low four samples and their neighbours as words
        punpcklbw	mm4, mm0
        punpcklbw	mm5, mm0
        
        // word = sample | (average << 8), i.e. bytes "sample, average"
        movq		mm6, mm4
        paddusw		mm4, mm1
        paddusw		mm4, mm5
        psrlw		mm4, 1
        psllq		mm4, 8
        por			mm4, mm6
        
        // same for the high four samples
        punpckhbw	mm2, mm0
        punpckhbw	mm3, mm0
        
        movq		mm6, mm2
        paddusw		mm2, mm1
        paddusw		mm2, mm3
        
        movq		[ebx+esi*2], mm4
        
        psrlw		mm2, 1
        psllq		mm2, 8
        por			mm2, mm6
        
        add			esi, 0x08
        cmp			esi, [hwidthd8]
        // movq does not modify the flags set by cmp
        movq		[ebx+esi*2-8], mm2
        jl			convyuv444
        
        // Row tail: mm7 holds the last 8 samples; write each one twice.
        movq		mm2, mm7
        punpcklbw	mm2, mm0
        movq		mm3, mm2
        
        psllq		mm2, 8
        por			mm2, mm3
        
        movq		[ebx+esi*2], mm2
        
        punpckhbw	mm7, mm0
        movq		mm6, mm7
        
        psllq		mm6, 8
        por			mm6, mm7
        
        movq		[ebx+esi*2+8], mm6
        
        // advance to the next source / destination row
        add			eax, [hwidth]		
        add			ebx, [width]
        sub			edi, 0x01
        cmp			edi, 0x00
        jg			convyuv444init
        
        emms
    }
  }

// Interleaves planar 4:2:2 data into packed bytes in the order Y U Y V
// (YUY2), processing every second row starting with the first row of the
// buffers (rows 0, 2, 4, ...).
//
//   py         - luma plane, rows of width bytes
//   pu, pv     - chroma planes, rows of width/2 bytes
//   dst        - packed output, rows of width*2 bytes
//   width      - luma width; 8 luma / 4 chroma samples are consumed per iteration
//   height     - total row count; the row counter is decremented by 2 per
//                processed row, so height/2 rows are converted
//
// Rows in between are not touched by this function.
static inline void From422toYUY2odd(unsigned char *py, unsigned char *pu, unsigned char *pv, unsigned char *dst, int width, int height)
{
    int dwidth    = width * 2;   // two luma rows
    int qwidth    = width * 4;   // two packed output rows
    int hwidth    = width / 2;   // chroma samples per row
    __asm
    {
      mov			eax, [py]
        mov			ebx, [pu]
        mov			ecx, [pv]
        mov			edx, [dst]
        // esi counts chroma samples within the row, edi counts remaining rows
        mov			esi, 0x00
        mov			edi, [height]
        
yuy2conv:
      // mm2 = U0 V0 U1 V1 U2 V2 U3 V3
      movd		mm2, [ebx+esi]
        movd		mm3, [ecx+esi]
        punpcklbw	mm2, mm3
        // mm1 = Y0 U0 Y1 V0 Y2 U1 Y3 V1, mm4 = Y4 U2 Y5 V2 Y6 U3 Y7 V3
        movq		mm1, [eax+esi*2]
        movq		mm4, mm1
        punpcklbw	mm1, mm2
        punpckhbw	mm4, mm2
        
        add			esi, 0x04
        cmp			esi, [hwidth]
        // the stores do not disturb the flags set by cmp
        movq		[edx+esi*4-16], mm1
        movq		[edx+esi*4-8], mm4
        jl			yuy2conv
        
        // skip one row in every buffer (advance by two rows in total)
        add			eax, [dwidth]
        add			ebx, [width]
        add			ecx, [width]
        add			edx, [qwidth]
        sub			edi, 0x02
        mov			esi, 0x00
        cmp			edi, 0x00
        jg			yuy2conv
        
        emms
    }    
  }
  
// Interleaves planar 4:2:2 data into packed bytes in the order Y U Y V
// (YUY2), processing every second row starting with the second row of the
// buffers (rows 1, 3, 5, ...). It is the counterpart of From422toYUY2odd: all
// four pointers are first advanced by one row, then the same loop is run.
//
//   py         - luma plane, rows of width bytes
//   pu, pv     - chroma planes, rows of width/2 bytes
//   dst        - packed output, rows of width*2 bytes
//   width      - luma width; 8 luma / 4 chroma samples are consumed per iteration
//   height     - total row count; the row counter is decremented by 2 per
//                processed row, so height/2 rows are converted
static inline void From422toYUY2even(unsigned char *py, unsigned char *pu, unsigned char *pv, unsigned char *dst, int width, int height)
{
    int hwidth    = width / 2;   // chroma samples per row
    int dwidth    = width * 2;   // one packed output row, or two luma rows
    int qwidth    = width * 4;   // two packed output rows
    // start on the second row of every buffer
    py += width; pu += hwidth; pv += hwidth; dst += dwidth;
    
    __asm
    {
      mov			eax, [py]
        mov			ebx, [pu]
        mov			ecx, [pv]
        mov			edx, [dst]
        // esi counts chroma samples within the row, edi counts remaining rows
        mov			esi, 0x00
        mov			edi, [height]
        
yuy2conv:
      // mm2 = U0 V0 U1 V1 U2 V2 U3 V3
      movd		mm2, [ebx+esi]
        movd		mm3, [ecx+esi]
        punpcklbw	mm2, mm3
        // mm1 = Y0 U0 Y1 V0 Y2 U1 Y3 V1, mm4 = Y4 U2 Y5 V2 Y6 U3 Y7 V3
        movq		mm1, [eax+esi*2]
        movq		mm4, mm1
        punpcklbw	mm1, mm2
        punpckhbw	mm4, mm2
        
        add			esi, 0x04
        cmp			esi, [hwidth]
        // the stores do not disturb the flags set by cmp
        movq		[edx+esi*4-16], mm1
        movq		[edx+esi*4-8], mm4
        jl			yuy2conv
        
        // skip one row in every buffer (advance by two rows in total)
        add			eax, [dwidth]
        add			ebx, [width]
        add			ecx, [width]
        add			edx, [qwidth]
        sub			edi, 0x02
        mov			esi, 0x00
        cmp			edi, 0x00
        jg			yuy2conv
        
        emms
    }
  }

#if 1
//-----------------------------------------------------------------------------
// From RGB32
//-----------------------------------------------------------------------------
// Fixed-point (scaled by 256) colour weights applied to a packed 32-bit pixel.
// The three 8-bit channels are taken from bits 16-23, 8-15 and 0-7; bits 24-31
// are ignored. The argument must be a SIGNED int so that the negative weights
// and the division behave as intended.
//
// NOTE(review): assuming the pixel layout is 0x00RRGGBB, U(x) carries the
// weights usually associated with Cr and V(x) those of Cb. FromRGB32toYV12
// writes U(x) to the first chroma plane, so confirm the layout before
// renaming anything.
#define Y(x) (  (66*(((x)>>16)&0xFF)  +  129*(((x)&0xFF00)>>8)  +  25*((x)&0xFF))/256 + 16 )
#define U(x) ( (112*(((x)>>16)&0xFF)  -   94*(((x)&0xFF00)>>8)  -  18*((x)&0xFF))/256 + 128 )
#define V(x) ( (-38*(((x)>>16)&0xFF)  -   75*(((x)&0xFF00)>>8)  + 112*((x)&0xFF))/256 + 128 )

// Converts a packed 32-bit image to three consecutive planes in dst:
//   w*h bytes of Y(), then (w/2)*(h/2) bytes of U(), then (w/2)*(h/2) bytes of V().
//
//   src - w*h packed pixels, rows stored top to bottom without padding
//   dst - output buffer of at least w*h + 2*(w/2)*(h/2) bytes
//   w,h - image size in pixels
//
// The image is walked in 2x2 pixel cells. Each pixel produces one luma sample;
// each cell produces one sample per chroma plane, the average of the four
// per-pixel chroma values. For odd w or h the last column / row is left
// unconverted.
static inline void FromRGB32toYV12(Pixel32* src, Pixel8 *dst, int w, int h)
{
  int row, col;
  int a, b, c, d;            // the four pixels of the current 2x2 cell
  const int hw = w>>1;       // cells per row
  const int hh = h>>1;       // cell rows

  Pixel8 *UPtr = dst + w * h;       // first chroma plane
  Pixel8 *VPtr = UPtr + hw * hh;    // second chroma plane

  for (row = 0; row < hh; row++)
  {
    // Row pointers are derived from the row index so that an odd width
    // cannot make them drift out of step with the image rows.
    Pixel32 *srcTop = src + (2 * row) * w;
    Pixel32 *srcBot = srcTop + w;
    Pixel8  *YPtr1  = dst + (2 * row) * w;
    Pixel8  *YPtr2  = YPtr1 + w;

    for (col = 0; col < hw; col++)
    {
      a = srcTop[0];
      b = srcTop[1];
      c = srcBot[0];
      d = srcBot[1];
      srcTop += 2;
      srcBot += 2;

      *YPtr1++ = Y(a);
      *YPtr1++ = Y(b);
      *YPtr2++ = Y(c);
      *YPtr2++ = Y(d);

      // chroma: average of the four neighbours in the cell
      *UPtr++ = (U(a) + U(b) + U(c) + U(d))>>2;
      *VPtr++ = (V(a) + V(b) + V(c) + V(d))>>2;
    }
  }
}
#endif 

// Packs three separate 4:2:0 planes into one contiguous buffer in the order
// luma, pv plane, pu plane.
//
//   py      - luma plane, width*height bytes
//   pu, pv  - chroma planes, (width/2)*(height/2) bytes each
//   dst     - output buffer; receives width*height + 2*(width/2)*(height/2) bytes
static inline void From420toYV12(unsigned char *py, unsigned char *pu, unsigned char *pv, unsigned char *dst, int width, int height)
{
  int chromaWidth  = width  >> 1;
  int chromaHeight = height >> 1;
  int lumaBytes    = height * width;
  int chromaBytes  = chromaHeight * chromaWidth;

  // Luma is copied in one go.
  flmemcpy( dst, py, lumaBytes );

  // Chroma is copied row by row: the pv row first, then the matching pu row
  // one chroma plane further into the output.
  int row = 0;
  while ( row < chromaHeight )
  {
    int rowOffset = row * chromaWidth;
    int outOffset = lumaBytes + rowOffset;

    flmemcpy( dst + outOffset, pv + rowOffset, chromaWidth );
    flmemcpy( dst + outOffset + chromaBytes, pu + rowOffset, chromaWidth );

    ++row;
  }
}


// Copies every second row of three separate 4:2:0 planes into a contiguous
// buffer laid out as luma, pv plane, pu plane. Rows of the other parity are
// left untouched in dst.
//
//   top     - non-zero: copy rows 0, 2, 4, ...; zero: copy rows 1, 3, 5, ...
//             (the same starting row is used for the luma and chroma planes)
//   py      - luma plane, width*height bytes
//   pu, pv  - chroma planes, (width/2)*(height/2) bytes each
//   dst     - output buffer of width*height + 2*(width/2)*(height/2) bytes
static inline void FromYV12toYV12int(int top, unsigned char *py, unsigned char *pu, unsigned char *pv, unsigned char *dst, int width, int height)
{
  int hwidth    = width >>1;
  int hlfheight = height >>1;
  int size      = height * width;        // bytes in the luma plane
  int hsize     = hlfheight * hwidth;    // bytes in one chroma plane
  int start_row = top ? 0 : 1;
  int j;   // declared once here: the second loop must not depend on a
           // variable declared in the first loop's for-header

  // luma rows of the selected parity
  for (j = start_row; j < height; j += 2)
    flmemcpy( dst + j*width, py + j*width, width );

  // chroma rows of the selected parity
  for (j = start_row; j < hlfheight; j += 2)
  {
    int src_addr  = j * hwidth;
    int dest_addr = size + src_addr;

    flmemcpy( dst + dest_addr, pv + src_addr, hwidth );

    dest_addr += hsize;

    flmemcpy( dst + dest_addr, pu + src_addr, hwidth );
  }
}


// Converts every second row of planar 4:4:4 YUV, starting with the SECOND row
// of the planes (rows 1, 3, 5, ...), to 32-bit pixels with MMX.
//
//   py, pu, pv - input planes, rows of width bytes each
//   dst        - output image, rows of width*4 bytes; written bottom-up: input
//                row 1 goes to output row height-2, and each further input row
//                goes two output rows higher
//   width      - pixels per row; four pixels are converted per iteration
//   height     - total row count; the row counter drops by 2 per converted row
//
// Each pixel is stored as the bytes B, G, R, 0 (lowest address first), with
//   B = (Y' + 0x408D*(u-128)) >> 13
//   R = (Y' + 0x3313*(v-128)) >> 13
//   G = (Y' + 0xE5FC*(v-128) + 0xF377*(u-128)) >> 13     (both weights negative)
//   Y' = 0x2543*(y-16) + 0x1000
// Values above 255 are saturated by packuswb.
//
// NOTE(review): the shifts are logical (psrld), so a negative sum leaves
// non-zero bits in the upper word of its dword; packuswb turns the lower word
// into 0 but the upper word appears to survive as a small value that is later
// OR-ed into the neighbouring pixel -- worth verifying with out-of-range input.
static inline void From444toRGB32odd(unsigned char *py, unsigned char *pu, unsigned char *pv, unsigned char *dst,int width, int height)
{
	// last-but-one output row; second input row
	dst += width * (height-2) * 4;
	py += width; pu += width; pv += width;
  // after writing a row (4*width bytes) step back 12*width: net two rows up
  int nwidth = width * 12;
  int dwidth    = width * 2;

	__asm
	{
		mov			eax, [py]
		mov			ebx, [pu]
		mov			ecx, [pv]
		mov			edx, [dst]
		mov			edi, [height]
		mov			esi, 0x00
		pxor		mm0, mm0

convRGB24:
		movd		mm1, [eax+esi]       ;mm1: [00][00][00][00][ y][ y][ y][ y]
		movd		mm3, [ebx+esi]       ;mm3: [00][00][00][00][ u][ u][ u][ u]
    punpcklbw	mm1, mm0           ;mm1: [00][ y][00][ y][00][ y][00][ y]
    punpcklbw	mm3, mm0           ;mm3: [00][ u][00][ u][00][ u][00][ u]
    movd		mm5, [ecx+esi]       ;mm5: [00][00][00][00][ v][ v][ v][ v]
		punpcklbw	mm5, mm0           ;mm5: [00][ v][00][ v][00][ v][00][ v]
    // the constant loaded below is +128 per word; it is subtracted from u and v
    movq		mm7, [mmmask_0128]   ;mm7: [  -128][  -128][  -128][  -128]
    psubw		mm3, mm7             ;mm3: [ u-128][ u-128][ u-128][ u-128]
    psubw		mm5, mm7             ;mm5: [ v-128][ v-128][ v-128][ v-128]

    // luma term: each (y-16) is paired with the constant 1 so that pmaddwd
    // yields (y-16)*0x2543 + 1*0x1000 per pixel; mm1 = pixels 0-1, mm2 = pixels 2-3
    psubw		mm1, [mmmask_0016]   ;mm1: [ y-16 ][ y-16 ][ y-16 ][ y-16 ]
    movq		mm2, mm1             ;mm2: [ y-16 ][ y-16 ][ y-16 ][ y-16 ]
    movq		mm7, [mmmask_0001]   ;mm7: [0001][0001][0001][0001]
    punpcklwd	mm1, mm7           ;mm1: [0001][ y-16 ][0001][ y-16 ]
    punpckhwd	mm2, mm7           ;mm2: [0001][ y-16 ][0001][ y-16 ]
    movq		mm7, [YUVRGB_Scale]  ;mm7: [1000][2000][1000][2000]
    pmaddwd		mm1, mm7           ;mm1: [         y][         y]
		pmaddwd		mm2, mm7           ;mm2: [         y][         y]

    // blue: luma term + 0x408D*(u-128), scaled down by 13 bits
    movq		mm4, mm3             ;mm4: [ u-128][ u-128][ u-128][ u-128]
    punpcklwd	mm3, mm0           ;mm3: [ 0000 ][ u-128][ 0000 ][ u-128]
    punpckhwd	mm4, mm0           ;mm4: [ 0000 ][ u-128][ 0000 ][ u-128]
    movq		mm7, [mmmask_cbu]    ;mm7: [0000][408D][0000][408D]
    pmaddwd		mm3, mm7           ;mm3: [ mult u       ][ mult u       ]
		pmaddwd		mm4, mm7           ;mm4: [ mult u       ][ mult u       ]
    paddd		mm3, mm1             ;mm3: [ mult u +y    ][ mult u +y    ]
    paddd		mm4, mm2             ;mm4: [ mult u +y    ][ mult u +y    ]
		psrld		mm3, 13
		psrld		mm4, 13
    packuswb	mm3, mm0           ;mm3: [00][00][00][00][00][ b][00][ b]
    packuswb	mm4, mm0           ;mm4: [00][00][00][00][00][ b][00][ b]

    // red: luma term + 0x3313*(v-128), scaled down by 13 bits
    movq		mm6, mm5             ;mm6: [ v-128][ v-128][ v-128][ v-128]
		punpcklwd	mm5, mm0
		punpckhwd	mm6, mm0
		movq		mm7, [mmmask_crv]
		pmaddwd		mm5, mm7
		pmaddwd		mm6, mm7
		paddd		mm5, mm1
		paddd		mm6, mm2
		psrld		mm5, 13
		psrld		mm6, 13
		packuswb	mm5, mm0         ; mm5: [00][00][00][00][00][ r][00][ r]
		packuswb	mm6, mm0         ; mm6: [00][00][00][00][00][ r][00][ r]

		// Combine blue and red. psrlq shifts RIGHT, so the copy becomes
		// [00][00][00][00][ r][ b][00][00] and the OR leaves the low four
		// bytes as [ r1][ b1][ r0][ b0]; only those four bytes are used below.
		punpcklbw	mm3, mm5         ; mm3: [00][00][ r][ b][ 0][ 0][ r][ b]
		punpcklbw	mm4, mm6         ; mm4: [00][00][ r][ b][ 0][ 0][ r][ b]
    movq		mm5, mm3           ; mm5: [00][00][ r][ b][ 0][ 0][ r][ b]
    movq		mm6, mm4           ; mm6: [00][00][ r][ b][ 0][ 0][ r][ b]
    psrlq		mm5, 16            ; mm5: [ r][ b][ 0][ 0][ r][ b][ 0][ 0]
		psrlq		mm6, 16            ; mm6: [ r][ b][ 0][ 0][ r][ b][ 0][ 0]
		por			mm3, mm5           ; mm3: [ r][ b][ r][ b][ r][ b][ r][ b]
		por			mm4, mm6           ; mm4: [ r][ b][ r][ b][ r][ b][ r][ b]

    // green: u and v are loaded again and interleaved so that one pmaddwd
    // computes 0xE5FC*(v-128) + 0xF377*(u-128) per pixel
    movd		mm5, [ebx+esi]     ; mm5: [00][00][00][00][ u][ u][ u][ u]
		movd		mm6, [ecx+esi]     ; mm6: [00][00][00][00][ v][ v][ v][ v]
    punpcklbw	mm5, mm0         ; mm5: [00][ u][00][ u][00][ u][00][ u]
		punpcklbw	mm6, mm0         ; mm6: [00][ v][00][ v][00][ v][00][ v]
    movq		mm7, [mmmask_0128] ; mm7: [00  80][00  80][00  80][00  80]
    psubw		mm5, mm7           ; mm5: [u - 80][u - 80][u - 80][u - 80]
    psubw		mm6, mm7           ; mm6: [v - 80][v - 80][v - 80][v - 80]

    movq		mm7, mm6           ; mm7: [v - 80][v - 80][v - 80][v - 80]
    punpcklwd	mm6, mm5         ; mm6: [u - 80][v - 80][u - 80][v - 80]
    punpckhwd	mm7, mm5		     ; mm7: [u - 80][v - 80][u - 80][v - 80]
    movq		mm5, [mmmask_cgu_cgv] ;mm5: [F377][E5FC][F377][E5FC]
    pmaddwd		mm6, mm5         ; mm6: [ madd uv      ][  madd uv     ]
		pmaddwd		mm7, mm5         ; mm7: [ madd uv      ][  madd uv     ]
    paddd		mm6, mm1           ; mm6: [ madd uv+y    ][ madd uv+y    ]
		paddd		mm7, mm2           ; mm6: [ madd uv+y    ][ madd uv+y    ]

		psrld		mm6, 13            ; mm6: [0000][   g][0000][   g]
		psrld		mm7, 13            ; mm7: [0000][   g][0000][   g]
    packuswb	mm6, mm0         ; mm6: [00][00][00][00][ g][ 0][ g]
		packuswb	mm7, mm0         ; mm7: [00][00][00][00][ g][ 0][ g]

    // interleave green between blue and red: two finished pixels per register
    punpcklbw	mm3, mm6         ; mm3: [ 0][ r][ g][ b][ 0][ r][ g][ b]
		punpcklbw	mm4, mm7         ; mm4: [ 0][ r][ g][ b][ 0][ r][ g][ b]

    // store four pixels (16 bytes)
    movq [edx],   mm3
    movq [edx+8], mm4

		add			edx, 0x10
		add			esi, 0x04
		cmp			esi, [width]

		jl			convRGB24

		// end of row: skip one input row, move the output two rows up
		add			eax, [dwidth]
		add			ebx, [dwidth]
		add			ecx, [dwidth]
		sub			edx, [nwidth]
		mov			esi, 0x00
		sub			edi, 0x02
		cmp			edi, 0x00
		jg			convRGB24

		emms
	}
}

// Converts every second row of planar 4:4:4 YUV, starting with the FIRST row
// of the planes (rows 0, 2, 4, ...), to 32-bit pixels with MMX. The instruction
// sequence is the same as in From444toRGB32odd; only the starting rows differ.
//
//   py, pu, pv - input planes, rows of width bytes each
//   dst        - output image, rows of width*4 bytes; written bottom-up: input
//                row 0 goes to output row height-1, and each further input row
//                goes two output rows higher
//   width      - pixels per row; four pixels are converted per iteration
//   height     - total row count; the row counter drops by 2 per converted row
//
// Each pixel is stored as the bytes B, G, R, 0 (lowest address first), with
//   B = (Y' + 0x408D*(u-128)) >> 13
//   R = (Y' + 0x3313*(v-128)) >> 13
//   G = (Y' + 0xE5FC*(v-128) + 0xF377*(u-128)) >> 13     (both weights negative)
//   Y' = 0x2543*(y-16) + 0x1000
// Values above 255 are saturated by packuswb.
static inline void From444toRGB32even(unsigned char *py, unsigned char *pu, unsigned char *pv, unsigned char *dst, int width, int height)
{
	// last output row
	dst += width * (height-1) * 4;
  // after writing a row (4*width bytes) step back 12*width: net two rows up
  int nwidth  = width * 12;
  int dwidth  = width * 2;
	__asm
	{
		mov			eax, [py]
		mov			ebx, [pu]
		mov			ecx, [pv]
		mov			edx, [dst]
		mov			edi, [height]
		mov			esi, 0x00
		pxor		mm0, mm0

convRGB24:
		// load 4 samples of y, u, v and widen them to words; centre u and v on 0
		movd		mm1, [eax+esi]
		movd		mm3, [ebx+esi]
		punpcklbw	mm1, mm0
		punpcklbw	mm3, mm0
		movd		mm5, [ecx+esi]
		punpcklbw	mm5, mm0
		movq		mm7, [mmmask_0128]
		psubw		mm3, mm7
		psubw		mm5, mm7

		// luma term per pixel: (y-16)*0x2543 + 0x1000; mm1 = pixels 0-1, mm2 = pixels 2-3
		psubw		mm1, [mmmask_0016]
		movq		mm2, mm1
		movq		mm7, [mmmask_0001]
		punpcklwd	mm1, mm7
		punpckhwd	mm2, mm7
		movq		mm7, [YUVRGB_Scale]
		pmaddwd		mm1, mm7
		pmaddwd		mm2, mm7

		// blue: luma term + 0x408D*(u-128), scaled down by 13 bits
		movq		mm4, mm3
		punpcklwd	mm3, mm0
		punpckhwd	mm4, mm0
		movq		mm7, [mmmask_cbu]
		pmaddwd		mm3, mm7
		pmaddwd		mm4, mm7
		paddd		mm3, mm1
		paddd		mm4, mm2
		psrld		mm3, 13
		psrld		mm4, 13
		packuswb	mm3, mm0
		packuswb	mm4, mm0

		// red: luma term + 0x3313*(v-128), scaled down by 13 bits
		movq		mm6, mm5
		punpcklwd	mm5, mm0
		punpckhwd	mm6, mm0
		movq		mm7, [mmmask_crv]
		pmaddwd		mm5, mm7
		pmaddwd		mm6, mm7
		paddd		mm5, mm1
		paddd		mm6, mm2

		psrld		mm5, 13
		psrld		mm6, 13
		packuswb	mm5, mm0
		packuswb	mm6, mm0

		// combine blue and red so the low four bytes hold b0, r0, b1, r1
		punpcklbw	mm3, mm5
		punpcklbw	mm4, mm6
		movq		mm5, mm3
		movq		mm6, mm4
		psrlq		mm5, 16
		psrlq		mm6, 16
		por			mm3, mm5
		por			mm4, mm6

		// green: reload u and v, centre them on 0
		movd		mm5, [ebx+esi]
		movd		mm6, [ecx+esi]
		punpcklbw	mm5, mm0
		punpcklbw	mm6, mm0
		movq		mm7, [mmmask_0128]
		psubw		mm5, mm7
		psubw		mm6, mm7

		// interleave v and u so one pmaddwd gives 0xE5FC*(v-128) + 0xF377*(u-128)
		movq		mm7, mm6
		punpcklwd	mm6, mm5
		punpckhwd	mm7, mm5		
		movq		mm5, [mmmask_cgu_cgv]
		pmaddwd		mm6, mm5
		pmaddwd		mm7, mm5
		paddd		mm6, mm1
		paddd		mm7, mm2

		psrld		mm6, 13
		psrld		mm7, 13
		packuswb	mm6, mm0
		packuswb	mm7, mm0

		// interleave green between blue and red: bytes b, g, r, 0 per pixel
		punpcklbw	mm3, mm6
		punpcklbw	mm4, mm7

    // store four pixels (16 bytes)
    movq [edx],   mm3
    movq [edx+8], mm4

		add			edx, 0x10
		add			esi, 0x04
		cmp			esi, [width]

		jl			convRGB24

		// end of row: skip one input row, move the output two rows up
		add			eax, [dwidth]
		add			ebx, [dwidth]
		add			ecx, [dwidth]
		sub			edx, [nwidth]
		mov			esi, 0x00
		sub			edi, 0x02
		cmp			edi, 0x00
		jg			convRGB24

		emms
	}
}

#if 0
// SSE2 full frame version
//
// Currently compiled out by the surrounding #if 0.
//
// Converts ALL rows of planar 4:4:4 YUV to 32-bit pixels (bytes B, G, R, 0),
// eight pixels per iteration, using the same fixed-point arithmetic as the MMX
// routines From444toRGB32odd / From444toRGB32even.
//
//   py, pu, pv - input planes, rows of width bytes each
//   dst        - output image, rows of width*4 bytes, written bottom-up
//                starting at row height-1
//   width      - pixels per row
//   height     - number of rows; the parameter itself is used as the row counter
//
// Points to check before re-enabling this code:
//  - NOTE(review): u-128 and v-128 are spilled to memory at an address derived
//    from esp-32 rounded down to 16 bytes, i.e. BELOW the stack pointer.
//  - Output is written with movntdq, which requires 16-byte aligned addresses.
//  - Each iteration writes 32 bytes; when width is not a multiple of 8 the last
//    iteration of a row writes 16 bytes beyond the row and the output pointer
//    is corrected afterwards through wcorr0x10.
//  - No emms is needed because only XMM registers are used.
static inline void From444toRGB32_SSE2(unsigned char *py, unsigned char *pu, unsigned char *pv, unsigned char *dst,int width, int height)
{
	// last output row
	dst += width * (height-1) * 4;
  // after writing a row (4*width bytes) step back 8*width: net one row up
  int nwidth  = width * 2 * 4;

  // extra 16 bytes to step back when the last iteration overshoots the row
  int wcorr0x10 = (width & 0x7)? 0x10 : 0x00;

	__asm
	{
		mov			eax, [py]
		mov			ebx, [pu]
		mov			ecx, [pv]
		mov			edx, [dst]
//		mov			edi, [height]
		xor			esi, esi
		pxor		xmm0, xmm0

		// edi -> 32 bytes of 16-byte aligned scratch space for u-128 and v-128
		lea			edi, [esp - 32]
		and			edi, ~0xf

convRGB24:
		movq		xmm1, [eax+esi]       ;xmm1: [00][00][00][00][00][00][00][00][ y][ y][ y][ y][ y][ y][ y][ y]
		movq		xmm3, [ebx+esi]       ;xmm3: [00][00][00][00][00][00][00][00][ u][ u][ u][ u][ u][ u][ u][ u]
    punpcklbw	xmm1, xmm0           ;xmm1: [00][ y][00][ y][00][ y][00][ y][00][ y][00][ y][00][ y][00][ y]
    punpcklbw	xmm3, xmm0           ;xmm3: [00][ u][00][ u][00][ u][00][ u][00][ u][00][ u][00][ u][00][ u]
    movq		xmm5, [ecx+esi]       ;xmm5: [00][00][00][00][00][00][00][00][ v][ v][ v][ v][ v][ v][ v][ v]
		punpcklbw	xmm5, xmm0           ;xmm5: [00][ v][00][ v][00][ v][00][ v][00][ v][00][ v][00][ v][00][ v]
    movdqa		xmm7, [xmmmask_0128]   ;xmm7: [  +128][  +128][  +128][  +128][  +128][  +128][  +128][  +128]
    psubw		xmm3, xmm7             ;xmm3: [ u-128][ u-128][ u-128][ u-128][ u-128][ u-128][ u-128][ u-128]
    psubw		xmm5, xmm7             ;xmm5: [ v-128][ v-128][ v-128][ v-128][ v-128][ v-128][ v-128][ v-128]
	// keep u-128 and v-128 for the green computation further down
	movdqa		[edi], xmm3
	movdqa		[edi + 16], xmm5

    // Luma term. The register-to-register movq copies only the low four words
    // (upper half zeroed): the copy feeds punpcklwd, the original register
    // feeds punpckhwd. The same pattern is used for u and v below.
    // xmm2 = pixels 0-3, xmm1 = pixels 4-7.
    psubw		xmm1, [xmmmask_0016]   ;xmm1: [ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ]
    movq		xmm2, xmm1             ;xmm2: [ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ][ y-16 ]
    movdqa		xmm7, [xmmmask_0001]   ;xmm7: [0001][0001][0001][0001][0001][0001][0001][0001]
    punpcklwd	xmm2, xmm7           ;xmm1: [0001][ y-16 ][0001][ y-16 ][0001][ y-16 ][0001][ y-16 ]
    punpckhwd	xmm1, xmm7           ;xmm2: [0001][ y-16 ][0001][ y-16 ][0001][ y-16 ][0001][ y-16 ]
    movdqa		xmm7, [xYUVRGB_Scale]  ;xmm7: [1000][2000][1000][2000][1000][2000][1000][2000]
    pmaddwd		xmm2, xmm7           ;xmm2: [         y][         y][         y][         y]
		pmaddwd		xmm1, xmm7           ;xmm1: [         y][         y][         y][         y]

    // blue
    movq		xmm4, xmm3             ;xmm4: [ u-128][ u-128][ u-128][ u-128][ u-128][ u-128][ u-128][ u-128]
    punpckhwd	xmm3, xmm0           ;xmm3: [ 0000 ][ u-128][ 0000 ][ u-128][ 0000 ][ u-128][ 0000 ][ u-128]
    punpcklwd	xmm4, xmm0           ;xmm4: [ 0000 ][ u-128][ 0000 ][ u-128][ 0000 ][ u-128][ 0000 ][ u-128]
    movdqa		xmm7, [xmmmask_cbu]    ;xmm7: [0000][408D][0000][408D][0000][408D][0000][408D]
    pmaddwd		xmm4, xmm7           ;xmm4: [ mult u       ][ mult u       ][ mult u       ][ mult u       ]
		pmaddwd		xmm3, xmm7           ;xmm3: [ mult u       ][ mult u       ][ mult u       ][ mult u       ]
    paddd		xmm4, xmm2             ;xmm4: [ mult u +y    ][ mult u +y    ][ mult u +y    ][ mult u +y    ]
    paddd		xmm3, xmm1             ;xmm3: [ mult u +y    ][ mult u +y    ][ mult u +y    ][ mult u +y    ]
		psrld		xmm4, 13
		psrld		xmm3, 13
    packuswb	xmm4, xmm3           ;xmm4: [00][ b][00][ b][00][ b][00][ b][00][ b][00][ b][00][ b][00][ b]

    // red
    movq		xmm6, xmm5             ;xmm6: [ v-128][ v-128][ v-128][ v-128][ v-128][ v-128][ v-128][ v-128]
		punpckhwd	xmm5, xmm0
		punpcklwd	xmm6, xmm0
		movdqa		xmm7, [xmmmask_crv]
		pmaddwd		xmm6, xmm7
		pmaddwd		xmm5, xmm7
		paddd		xmm6, xmm2
		paddd		xmm5, xmm1
		psrld		xmm6, 13
		psrld		xmm5, 13
		packuswb	xmm6, xmm5         ; xmm6: [00][ r][00][ r][00][ r][00][ r][00][ r][00][ r][00][ r][00][ r]

	// green: reload the spilled u-128 / v-128 and interleave them for pmaddwd
	movdqa		xmm7, [edi + 16] // v...
	movdqa		xmm3, [edi] // u...
	movq		xmm5, xmm7
    punpckhwd	xmm7, xmm3		     ; xmm7: [u - 80][v - 80][u - 80][v - 80][u - 80][v - 80][u - 80][v - 80]
    punpcklwd	xmm5, xmm3         ; xmm5: [u - 80][v - 80][u - 80][v - 80][u - 80][v - 80][u - 80][v - 80]

    movdqa		xmm3, [xmmmask_cgu_cgv] ;xmm6: [F377][E5FC][F377][E5FC][F377][E5FC][F377][E5FC]
    pmaddwd		xmm5, xmm3         ; xmm5: [ madd uv      ][  madd uv     ][ madd uv      ][  madd uv     ]
		pmaddwd		xmm7, xmm3         ; xmm7: [ madd uv      ][  madd uv     ][ madd uv      ][  madd uv     ]
    paddd		xmm5, xmm2           ; xmm5: [ madd uv+y    ][ madd uv+y    ][ madd uv+y    ][ madd uv+y    ]
		paddd		xmm7, xmm1           ; xmm5: [ madd uv+y    ][ madd uv+y    ][ madd uv+y    ][ madd uv+y    ]

		psrld		xmm5, 13            ; xmm5: [0000][   g][0000][   g][0000][   g][0000][   g]
		psrld		xmm7, 13            ; xmm7: [0000][   g][0000][   g][0000][   g][0000][   g]
    packuswb	xmm5, xmm7         ; xmm5: [00][ g][00][ g][00][ g][00][ g][00][ g][00][ g][00][ g][00][ g]

	// xmm6: [00][ r][00][ r][00][ r][00][ r][00][ r][00][ r][00][ r][00][ r]
	// xmm5: [00][ g][00][ g][00][ g][00][ g][00][ g][00][ g][00][ g][00][ g]
	// xmm4: [00][ b][00][ b][00][ b][00][ b][00][ b][00][ b][00][ b][00][ b]

	// merge the three channels: xmm3 = pixels 0-3, xmm4 = pixels 4-7
	movq		xmm3, xmm4
	pxor		xmm7, xmm7
	punpckhbw	xmm4, xmm5 // [00][00][ g][ b][00][00][ g][ b][00][00][ g][ b][00][00][ g][ b]
	pxor		xmm1, xmm1
	punpckhwd	xmm7, xmm6 // [00][ r][00][00][00][ r][00][00][00][ r][00][00][00][ r][00][00]
	punpcklbw	xmm3, xmm5 // [00][00][ g][ b][00][00][ g][ b][00][00][ g][ b][00][00][ g][ b]
	punpcklwd	xmm1, xmm6 // [00][ r][00][00][00][ r][00][00][00][ r][00][00][00][ r][00][00]
	por			xmm4, xmm7
	por			xmm3, xmm1

	// xmm4: [ 0][ r][ g][ b][ 0][ r][ g][ b][ 0][ r][ g][ b][ 0][ r][ g][ b]
	// xmm3: [ 0][ r][ g][ b][ 0][ r][ g][ b][ 0][ r][ g][ b][ 0][ r][ g][ b]

    // non-temporal stores of eight pixels (32 bytes)
    movntdq [edx],   xmm3
    movntdq [edx+16], xmm4

		add			edx, 0x10*2
		add			esi, 0x04*2
		cmp			esi, [width]

		jb			convRGB24

		// undo the overshoot of the last iteration, if any
		sub			edx, wcorr0x10

		// esi doubles as a temporary for the row counter kept in 'height'
		mov			esi, height
		add			eax, [width]
		add			ebx, [width]
		add			ecx, [width]
		sub			edx, [nwidth]

		// The jg below consumes the flags produced by this sub; the two mov
		// instructions in between leave the flags alone, an xor would not.
		sub			esi, 0x01 //0x02
		//cmp			esi, 0x00
		mov			height, esi
		mov			esi, 0 // do not xor here!!
		jg			convRGB24

//		emms
	}
}

#endif

// Horizontal low-pass filter with 2:1 subsampling (4:4:4 -> 4:2:2) of one plane.
//
//   src              - input plane, rows of width samples
//   dst              - output plane, rows of width/2 samples
//   width, height    - size of the input plane
//   halfsample_shift - non-zero: 12-tap filter whose output is positioned half
//                      a sample to the right of the even input sample (for the
//                      MPEG-1 sampling grid); zero: 11-tap filter centred on
//                      the even input sample
//   clp              - lookup table indexed with the filter result; the result
//                      can be negative or exceed 255, so the table presumably
//                      clips to 0..255 and must be addressable over that whole
//                      range (TODO confirm with the table's definition)
//
// Taps that would fall outside the row are clamped to the first / last sample
// of the row.
static void conv444to422(ui8 *src, ui8 *dst, int width, int height, int halfsample_shift, ui8 *clp )
{
  int i, j;
  const int last = width - 1;   // right-most valid column, used for clamping

  if (halfsample_shift)
  {
    for (j=0; j<height; j++)
    {
      for (i=0; i<width; i+=2)
      {
        const int im5 = (i<5) ? 0 : i-5;
        const int im4 = (i<4) ? 0 : i-4;
        const int im3 = (i<3) ? 0 : i-3;
        const int im2 = (i<2) ? 0 : i-2;
        const int im1 = (i<1) ? 0 : i-1;
        const int ip1 = (i<width-1) ? i+1 : last;
        const int ip2 = (i<width-2) ? i+2 : last;
        const int ip3 = (i<width-3) ? i+3 : last;
        const int ip4 = (i<width-4) ? i+4 : last;
        const int ip5 = (i<width-5) ? i+5 : last;
        // Must be guarded by width-6: with the former width-5 guard,
        // i == width-6 produced ip6 == width, one sample past the row.
        const int ip6 = (i<width-6) ? i+6 : last;

        /* FIR filter with 0.5 sample interval phase shift */
        dst[i>>1] = clp[(int)(228*(src[i]+src[ip1])
                         +70*(src[im1]+src[ip2])
                         -37*(src[im2]+src[ip3])
                         -21*(src[im3]+src[ip4])
                         +11*(src[im4]+src[ip5])
                         + 5*(src[im5]+src[ip6])+256)>>9];
      }
      src+= width;
      dst+= width>>1;
    }
  }
  else
  {
    for (j=0; j<height; j++)
    {
      for (i=0; i<width; i+=2)
      {
        const int im5 = (i<5) ? 0 : i-5;
        const int im3 = (i<3) ? 0 : i-3;
        const int im1 = (i<1) ? 0 : i-1;
        const int ip1 = (i<width-1) ? i+1 : last;
        const int ip3 = (i<width-3) ? i+3 : last;
        const int ip5 = (i<width-5) ? i+5 : last;

        /* FIR filter coefficients (*512): 22 0 -52 0 159 256 159 0 -52 0 22 */
        dst[i>>1] = clp[(int)(  22*(src[im5]+src[ip5])-52*(src[im3]+src[ip3])
                         +159*(src[im1]+src[ip1])+256*src[i]+256)>>9];
      }
      src+= width;
      dst+= width>>1;
    }
  }
}

/* Vertical filter and 2:1 vertical subsampling of a single chroma plane
 * (4:2:2 -> 4:2:0).
 *
 *   src         input chroma plane, width>>1 samples per row, 'height' rows
 *   dst         output chroma plane, width>>1 samples per row, height>>1 rows
 *   width       width of the full-resolution picture; the plane processed
 *               here is width>>1 samples wide
 *   height      number of input rows; an odd value is rounded down to even
 *   clp         clipping table indexed with the filtered value. The filters
 *               have negative taps, so the index can be negative or larger
 *               than 255; the table must cover that range.
 *   prog_frame  non-zero: the plane is filtered as one progressive frame;
 *               zero: even and odd rows are filtered as two separate fields
 *
 * Taps outside the plane are clamped to the first/last row (frame mode) or
 * to the first/last row of the same field (field mode).
 */
static void conv422to420(ui8 *src, ui8 *dst, int width, int height, ui8 *clp, int prog_frame )
{
  int w, i, j, jm6, jm5, jm4, jm3, jm2, jm1;
  int jp1, jp2, jp3, jp4, jp5, jp6;

  w = width>>1;
  height = (height>>1)<<1;

  if (prog_frame)
  {
    /* intra frame */
    for (i=0; i<w; i++)
    {
      for (j=0; j<height; j+=2)
      {
        jm5 = (j<5) ? 0 : j-5;
        jm4 = (j<4) ? 0 : j-4;
        jm3 = (j<3) ? 0 : j-3;
        jm2 = (j<2) ? 0 : j-2;
        jm1 = (j<1) ? 0 : j-1;
        jp1 = (j<height-1) ? j+1 : height-1;
        jp2 = (j<height-2) ? j+2 : height-1;
        jp3 = (j<height-3) ? j+3 : height-1;
        jp4 = (j<height-4) ? j+4 : height-1;
        jp5 = (j<height-5) ? j+5 : height-1;
        jp6 = (j<height-6) ? j+6 : height-1;

        /* FIR filter with 0.5 sample interval phase shift */
        dst[w*(j>>1)] = clp[(int)(228*(src[w*j]+src[w*jp1])
                             +70*(src[w*jm1]+src[w*jp2])
                             -37*(src[w*jm2]+src[w*jp3])
                             -21*(src[w*jm3]+src[w*jp4])
                             +11*(src[w*jm4]+src[w*jp5])
                             + 5*(src[w*jm5]+src[w*jp6])+256)>>9];
      }
      /* next column */
      src++;
      dst++;
    }
  }
  else
  {
    /* intra field */
    for (i=0; i<w; i++)
    {
      /* each iteration consumes four input rows (two per field) and
         produces two output rows (one per field) */
      for (j=0; j<height; j+=4)
      {
        /* top field: even input rows, clamped to rows 0 and height-2 */
        jm5 = (j<10) ? 0 : j-10;
        jm4 = (j<8) ? 0 : j-8;
        jm3 = (j<6) ? 0 : j-6;
        jm2 = (j<4) ? 0 : j-4;
        jm1 = (j<2) ? 0 : j-2;
        jp1 = (j<height-2) ? j+2 : height-2;
        jp2 = (j<height-4) ? j+4 : height-2;
        jp3 = (j<height-6) ? j+6 : height-2;
        jp4 = (j<height-8) ? j+8 : height-2;
        jp5 = (j<height-10) ? j+10 : height-2;
        jp6 = (j<height-12) ? j+12 : height-2;

        /* FIR filter with 0.25 sample interval phase shift */
        dst[w*(j>>1)] = clp[(int)(8*src[w*jm5]
                            +5*src[w*jm4]
                           -30*src[w*jm3]
                           -18*src[w*jm2]
                          +113*src[w*jm1]
                          +242*src[w*j]
                          +192*src[w*jp1]
                           +35*src[w*jp2]
                           -38*src[w*jp3]
                           -10*src[w*jp4]
                           +11*src[w*jp5]
                            +2*src[w*jp6]+256)>>9];

        /* When height is not a multiple of 4 the last iteration has no
           bottom-field output row: (j>>1)+1 would equal height>>1, which
           is one row past the end of dst. Skip it instead of writing
           out of bounds. */
        if (j+2 < height)
        {
          /* bottom field: odd input rows, clamped to rows 1 and height-1 */
          jm6 = (j<9) ? 1 : j-9;
          jm5 = (j<7) ? 1 : j-7;
          jm4 = (j<5) ? 1 : j-5;
          jm3 = (j<3) ? 1 : j-3;
          jm2 = (j<1) ? 1 : j-1;
          jm1 = (j<height-1) ? j+1 : height-1;
          jp1 = (j<height-3) ? j+3 : height-1;
          jp2 = (j<height-5) ? j+5 : height-1;
          jp3 = (j<height-7) ? j+7 : height-1;
          jp4 = (j<height-9) ? j+9 : height-1;
          jp5 = (j<height-11) ? j+11 : height-1;
          jp6 = (j<height-13) ? j+13 : height-1;

          /* FIR filter with 0.25 sample interval phase shift */
          dst[w*((j>>1)+1)] = clp[(int)(8*src[w*jp6]
                                  +5*src[w*jp5]
                                 -30*src[w*jp4]
                                 -18*src[w*jp3]
                                +113*src[w*jp2]
                                +242*src[w*jp1]
                                +192*src[w*jm1]
                                 +35*src[w*jm2]
                                 -38*src[w*jm3]
                                 -10*src[w*jm4]
                                 +11*src[w*jm5]
                                  +2*src[w*jm6]+256)>>9];
        }
      }
      /* next column */
      src++;
      dst++;
    }
  }
}

// Alpha-blend a planar overlay into a planar 4:2:0 destination picture.
//
//   dst, dstwidth, dstheight  destination buffer: a dstwidth*dstheight luma
//                             plane followed by two chroma planes of
//                             (dstwidth*dstheight)>>2 bytes each
//   src                       overlay samples, read sequentially: srcwidth*
//                             srcheight luma samples, then two chroma planes
//                             of (srcwidth>>1)*(srcheight>>1) samples
//   alphaluma                 one alpha value per overlay luma sample
//   alphachroma               one alpha value per overlay chroma sample; the
//                             same plane is used for both chroma planes
//   x, y                      position of the overlay inside the destination,
//                             in luma samples
//   srcwidth, srcheight       overlay size in luma samples
//
// Every sample is blended as (src*alpha + dst*(255-alpha) + 128) >> 8.
// No bounds checking is done: the caller must make sure the overlay fits
// inside the destination. An overlay with a zero dimension (or a chroma
// plane that collapses to zero size) simply blends nothing for that plane.
void OverlayYV12( ui8 *dst, int dstwidth, int dstheight, ui8 *src, 
                  ui8 *alphaluma, ui8 *alphachroma,  
                  ui32 x, ui32 y, ui32 srcwidth, ui32 srcheight )
{
  ui32 row, col, i;
  ui8 *d;
  ui8 *a;
  int oa, da; /* overlay alpha, dest alpha */

  int lumasize = dstwidth*dstheight;
  int crsize = lumasize>>2;

  /* Y plane */
  d = dst + y*dstwidth + x;
  a = alphaluma;
  for( row=0; row<srcheight; row++ )
  {
    for( col=0; col<srcwidth; col++ )
    {
      oa = *a++;
      da = 0xFF - oa;
      /* read and write through d first, advance afterwards: doing both in
         one expression is unsequenced before C++17 */
      *d = static_cast<ui8>( ((*src++)*oa + (*d)*da + 128)>>8 );
      d++;
    }
    /* skip the part of the destination row not covered by the overlay */
    d += dstwidth - srcwidth;
  }

  /* V U planes */
  const ui32 hw = srcwidth>>1;
  const ui32 hh = srcheight>>1;
  for( i=0; i<2; i++ )
  {
    a = alphachroma;   /* both chroma planes share the same alpha plane */
    d = dst + lumasize + i*crsize + (y>>1)*(dstwidth>>1) + (x>>1);
    for( row=0; row<hh; row++ )
    {
      for( col=0; col<hw; col++ )
      {
        oa = *a++;
        da = 0xFF - oa;
        *d = static_cast<ui8>( ((*src++)*oa + (*d)*da + 128)>>8 );
        d++;
      }
      d += (dstwidth>>1) - hw;
    }
  }
}

// Blend pOverlay into this frame with its top-left corner at (nX, nY).
// Nothing happens when pOverlay is NULL, when the overlay does not fit
// inside this frame, or for any format pair other than a FRAME_YV12
// destination with a FRAME_YV12A overlay.
void CFrame::Overlay( ui32 nX, ui32 nY, CFrame *pOverlay )
{
  if( !pOverlay )
    return;

  ui8 *pAlpha422, *pAlpha420;
  TYUVImage planes;

  // The overlay has to lie completely inside this frame
  int ovlWidth  = pOverlay->GetWidth();
  int ovlHeight = pOverlay->GetHeight();
  if( (nX+ovlWidth) > m_nWidth || (nY+ovlHeight) > m_nHeight )
  {
    DBG_STR((str, "CFrame::Overlay - Overlay is too big"));
    return;
  }

  // Only YV12 <- YV12A is implemented
  if( m_nFormat != FRAME_YV12 )
    return;
  if( pOverlay->GetFormat() != FRAME_YV12A )
    return;

  // The chroma planes of the overlay need an alpha plane of half width and
  // half height. The helper frames are used purely as memory providers.
  // The U plane of a 4:2:2 frame holds the horizontally halved alpha.
  m_pUpsample422->Set( ovlWidth, ovlHeight, FRAME_YUV422 );
  m_pUpsample422->GetYuvInfo( &planes );
  pAlpha422 = planes.u;

  // The U plane of a 4:2:0 frame holds the alpha halved in both directions.
  m_pOverlayTemp->Set( ovlWidth, ovlHeight, FRAME_YV12 );
  m_pOverlayTemp->GetYuvInfo( &planes );
  pAlpha420 = planes.u;

  // From here on 'planes' describes the overlay; planes.a is its alpha plane
  pOverlay->GetYuvInfo( &planes );

  // Full resolution alpha -> half width -> half width and half height
  conv444to422( planes.a, pAlpha422, ovlWidth, ovlHeight, 0, m_pClp );
  conv422to420( pAlpha422, pAlpha420, ovlWidth, ovlHeight, m_pClp, 1 );

  // Blend: full resolution alpha for luma, subsampled alpha for chroma
  OverlayYV12( GetBuffer(), m_nWidth, m_nHeight, pOverlay->GetBuffer(),
               planes.a, pAlpha420,
               nX, nY, ovlWidth, ovlHeight );
}

// Update one field of this frame from pFrame.
// bTopField selects which field is handed to the conversion routine.
// Both frames must have the same dimensions, otherwise nothing is done.
// Only YV12 -> YV12 is implemented; YV12 -> RGB32 just logs a message and
// every other combination is silently ignored.
void CFrame::SetField( CFrame *pFrame, bool bTopField )
{
  // Bail out unless both resolutions match
  const bool bSameSize = (m_nWidth  == pFrame->GetWidth()) &&
                         (m_nHeight == pFrame->GetHeight());
  if( !bSameSize )
    return;

  const int nSrcFormat = pFrame->GetFormat();

  if( m_nFormat == FRAME_RGB32 )
  {
    if( nSrcFormat == FRAME_YV12 )
    {
      DBG_STR((str, "SetField - From YV12 to RGB32 not supported"));
    }
  }
  else if( m_nFormat == FRAME_YV12 )
  {
    if( nSrcFormat == FRAME_YV12 )
    {
      // Pass the source planes and the field selector to the converter
      TYUVImage srcPlanes;
      pFrame->GetYuvInfo( &srcPlanes );
      FromYV12toYV12int( bTopField, srcPlanes.y, srcPlanes.u, srcPlanes.v,
                         (unsigned char *)m_pData, m_nWidth, m_nHeight);
    }
  }
}

// Set the contents of this frame from pFrame, converting the pixel format
// when necessary. Both frames must have the same dimensions, otherwise the
// call does nothing. The frame flags of pFrame are copied whenever the
// dimensions match, even if the format pair is not supported.
//
// Supported conversions (destination <- source):
//   FRAME_RGB32  <- FRAME_RGB32, FRAME_YV12
//   FRAME_YV12   <- FRAME_YV12,  FRAME_RGB32
//   FRAME_YV12A  <- FRAME_YUV444A (even dimensions only)
//   FRAME_YUY2   <- FRAME_YV12
//   FRAME_YUV444 <- FRAME_YV12
// Any other combination leaves the pixel data untouched.
void CFrame::SetFrame( CFrame *pFrame )
{
  // Resolutions must match
  if( (m_nWidth  != pFrame->GetWidth()) ||
    (  m_nHeight != pFrame->GetHeight()) )
    return;
  
  int nInputFormat = pFrame->GetFormat();

  m_nFrameFlags= pFrame->GetFlags();

  switch( m_nFormat )
  {
    case FRAME_RGB32:      
      switch( nInputFormat )
      {
        case FRAME_RGB32:
          // Same format: plain copy
          flmemcpy( m_pData, pFrame->GetBuffer(), GetBufferSize() );
          break;
        case FRAME_YV12:
        {
          // We are going to need temporary frames for the upconversion
          // Set them. This will allocate necessary memory if not done before.
          m_pUpsample422->Set( m_nWidth, m_nHeight, FRAME_YUV422 );
          m_pUpsample444->Set( m_nWidth, m_nHeight, FRAME_YUV444 );

          TYUVImage in, y422, y444;
          pFrame->GetYuvInfo(&in);
          m_pUpsample422->GetYuvInfo(&y422);
          m_pUpsample444->GetYuvInfo(&y444);

          // Chroma: 4:2:0 -> 4:2:2 -> 4:4:4
          From420to422(in.u   , y422.u , m_nWidth, m_nHeight, pFrame->IsProgressive() );
          From420to422(in.v   , y422.v , m_nWidth, m_nHeight, pFrame->IsProgressive() );
          From422to444(y422.u, y444.u, m_nWidth, m_nHeight);
          From422to444(y422.v, y444.v, m_nWidth, m_nHeight);

          // Colour conversion, done in an odd and an even pass
          From444toRGB32odd(in.y,  y444.u ,y444.v,
                            (unsigned char *)m_pData, m_nWidth, m_nHeight);
          From444toRGB32even(in.y, y444.u, y444.v, 
                            (unsigned char *)m_pData, m_nWidth, m_nHeight);
          break;
        }
      }
      break;
          
    case FRAME_YV12:
      switch( nInputFormat )
      {
        case FRAME_YV12:
          // Same format: plain copy
          flmemcpy( m_pData, pFrame->GetBuffer(), GetBufferSize() );
          break;
        case FRAME_RGB32:
          FromRGB32toYV12((Pixel32 *)pFrame->GetBuffer(), (Pixel8 *)GetBuffer(), m_nWidth, m_nHeight);
          break;
      }      
      break;

    case FRAME_YV12A:
      switch( nInputFormat )
      {
        case FRAME_YUV444A:
        {
          TYUVImage y444, y422, y420;
          if( pFrame->GetWidth()%2 || pFrame->GetHeight()%2 )
          {
            DBG_STR((str, "CFrame - Dimensions are not multiple of 2\n"));
            return;
          }

          m_pUpsample422->Set( m_nWidth, m_nHeight, FRAME_YUV422 );
          // Without a temporary buffer the down conversion below would
          // write through invalid plane pointers, so give up here.
          if( !m_pUpsample422->GetBuffer() )
          {
            DBG_STR((str, "CFrame::SetFrame - temporary 4:2:2 buffer is not available\n"));
            return;
          }
          
          // Grab pointers 
          pFrame->GetYuvInfo(&y444);
          m_pUpsample422->GetYuvInfo(&y422);
          GetYuvInfo( &y420 );

          // Down convert chroma horizontally: 4:4:4 -> 4:2:2
          conv444to422( y444.u, y422.u, m_nWidth, m_nHeight, 0, m_pClp );
          conv444to422( y444.v, y422.v, m_nWidth, m_nHeight, 0, m_pClp );

          // Luma is copied as is, chroma is down converted vertically:
          // 4:2:2 -> 4:2:0 (always treated as a progressive frame)
          flmemcpy( y420.y, y444.y, m_nWidth*m_nHeight );
          conv422to420( y422.u, y420.u, m_nWidth, m_nHeight, m_pClp, 1 );
          conv422to420( y422.v, y420.v, m_nWidth, m_nHeight, m_pClp, 1 );

          // copy alpha plane (kept at full resolution)
          flmemcpy( y420.a, y444.a, m_nWidth*m_nHeight );

          break;
        }
      }
      break;

    case FRAME_YUY2:
      switch( nInputFormat )
      {
        case FRAME_YV12:
        {
          TYUVImage yuv422, yuv420;
          // We are going to need temporary frames for the upconversion
          // Set them. This will allocate necessary memory if not done before.
          m_pUpsample422->Set( m_nWidth, m_nHeight, FRAME_YUV422 );

          pFrame->GetYuvInfo( &yuv420 );
          m_pUpsample422->GetYuvInfo( &yuv422 );

          // Chroma: 4:2:0 -> 4:2:2
          From420to422( yuv420.u, yuv422.u, m_nWidth, m_nHeight, pFrame->IsProgressive() );
          From420to422( yuv420.v, yuv422.v, m_nWidth, m_nHeight, pFrame->IsProgressive() );

          // Pack the planes, done in an odd and an even pass
          From422toYUY2odd ( yuv420.y, yuv422.u, yuv422.v, m_pData, m_nWidth, m_nHeight );
          From422toYUY2even( yuv420.y, yuv422.u, yuv422.v, m_pData, m_nWidth, m_nHeight );

          break;
        }
      }
      break;

    case FRAME_YUV444:
      switch( nInputFormat )
      {
        case FRAME_YV12:
        {
          // We are going to need temporary frames for the upconversion
          // Set them. This will allocate necessary memory if not done before.
          m_pUpsample422->Set( m_nWidth, m_nHeight, FRAME_YUV422 );

          TYUVImage in, y422, y444;
          pFrame->GetYuvInfo(&in);
          m_pUpsample422->GetYuvInfo(&y422);
          GetYuvInfo(&y444);

          // Chroma: 4:2:0 -> 4:2:2
          From420to422(in.u   , y422.u , m_nWidth, m_nHeight, pFrame->IsProgressive() );
          From420to422(in.v   , y422.v , m_nWidth, m_nHeight, pFrame->IsProgressive() );

          // Luma is copied as is, chroma goes 4:2:2 -> 4:4:4 straight
          // into this frame
          flmemcpy( y444.y, in.y, m_nWidth*m_nHeight );
          From422to444(y422.u, y444.u, m_nWidth, m_nHeight);
          From422to444(y422.v, y444.v, m_nWidth, m_nHeight);
          break;
        }
      }
      break;
  }
    
}
 

// Number of bytes needed to store a picture of m_nWidth x m_nHeight in the
// current format. Unknown formats report 0.
// Note: the 4:2:0 sizes are derived from (width>>1)*(height>>1), so an odd
// width or height is rounded down before the size is computed.
ui32 CFrame::GetRequiredMem()
{
  int nBytes = 0;

  switch(m_nFormat)
  {
    case FRAME_RGB32:
      // packed pixels, m_nDepth bits each
      nBytes = m_nWidth * m_nHeight * (m_nDepth/8);
      break;

    case FRAME_YV12:
    {
      // luma counts as four chroma-sized quarters, plus two chroma planes
      int nQuarter = (m_nWidth>>1) * (m_nHeight>>1);
      nBytes = nQuarter * 6;
      break;
    }

    case FRAME_YUY2:
      // packed, two bytes per pixel
      nBytes = m_nHeight * (m_nWidth * 2);
      break;

    case FRAME_YUV422:
    {
      // full luma plane plus two half-width chroma planes
      int nLuma   = m_nWidth * m_nHeight;
      int nChroma = (m_nWidth>>1) * m_nHeight;
      nBytes = nLuma + nChroma * 2;
      break;
    }

    case FRAME_YUV444:
    {
      // three full resolution planes
      int nPlane = m_nWidth * m_nHeight;
      nBytes = 3 * nPlane;
      break;
    }

    case FRAME_YV12A:
    {
      // same as YV12 plus one full resolution alpha plane
      int nAlpha   = m_nWidth * m_nHeight;
      int nQuarter = (m_nWidth>>1) * (m_nHeight>>1);
      nBytes = nQuarter * 6 + nAlpha;
      break;
    }

    case FRAME_YUV444A:
    {
      // four full resolution planes
      int nPlane = m_nWidth * m_nHeight;
      nBytes = 4 * nPlane;
      break;
    }

    default:
      break;
  }

  return nBytes;
}

// Create the three helper frames and allocate nBufferSize bytes for the
// pixel data of this frame.
// Returns true when the buffer was allocated. On failure (zero size or
// allocation error) m_nAllocatedSize is set to 0 and false is returned;
// the helper frames have been created in either case.
bool CFrame::Alloc(ui32 nBufferSize)
{
  // Helper frames are created before the size is validated
  m_pUpsample444 = new CFrame;
  m_pUpsample422 = new CFrame;
  m_pOverlayTemp = new CFrame;

  if( nBufferSize == 0 )
  {
    DBG_STR((str, "CFrame::Alloc - nBuffersize is zero\n"));
    m_nAllocatedSize = 0;
    return false;
  }

  m_pData = aligned_new(nBufferSize);
  if( m_pData )
  {
    // This frame now owns the buffer and is responsible for freeing it
    m_bOwnBuffer     = true;
    m_nAllocatedSize = nBufferSize;
    return true;
  }

  DBG_STR((str, "CFrame::Alloc - allocation of %d bytes failed!\n", nBufferSize));
  m_nAllocatedSize = 0;
  return false;
}

// Destroy the helper frames and release the pixel buffer.
// The pixel buffer is only freed (and its pointer cleared) when this frame
// owns it; m_nAllocatedSize is reset to 0 in every case.
void CFrame::DeAlloc()
{
  // delete on a NULL pointer is a no-op, so no explicit checks are needed
  delete m_pUpsample444;
  m_pUpsample444 = NULL;

  delete m_pUpsample422;
  m_pUpsample422 = NULL;

  delete m_pOverlayTemp;
  m_pOverlayTemp = NULL;

  const bool bReleaseData = m_pData && m_bOwnBuffer;
  if( bReleaseData )
  {
    aligned_delete(m_pData);
    m_pData = NULL;
  }

  m_nAllocatedSize = 0;
}
