/* 
 *  YUVtoRGB.cpp 
 *
 *	Copyright (C) Alberto Vigata - January 2000 - ultraflask@yahoo.com
 *
 *  This file is part of FlasKMPEG, a free MPEG to MPEG/AVI converter
 *	
 *  FlasKMPEG is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *   
 *  FlasKMPEG is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *   
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */

#include "mmx.h"
#include <windows.h> 

//////////////////////////////////////////////////////////////
#include "YUVtoRGB.h"

static void YUVToRGB24(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h);
static void YUVToRGB16(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h);
static void YUVToUYVY16(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h);
static void YUVToYUYV16(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h);
static void YUVToYUV12(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h);
static long y_pitch, uv_pitch;



extern "C" void asm_YUVtoRGB32_row(
		unsigned long *ARGB1_pointer,
		unsigned long *ARGB2_pointer,
		YUVPixel *Y1_pointer,
		YUVPixel *Y2_pointer,
		YUVPixel *U_pointer,
		YUVPixel *V_pointer,
		long width
		);

extern "C" void asm_YUVtoRGB24_row(
		unsigned long *ARGB1_pointer,
		unsigned long *ARGB2_pointer,
		YUVPixel *Y1_pointer,
		YUVPixel *Y2_pointer,
		YUVPixel *U_pointer,
		YUVPixel *V_pointer,
		long width
		);

extern "C" void asm_YUVtoRGB16_row(
		unsigned long *ARGB1_pointer,
		unsigned long *ARGB2_pointer,
		YUVPixel *Y1_pointer,
		YUVPixel *Y2_pointer,
		YUVPixel *U_pointer,
		YUVPixel *V_pointer,
		long width
		);

void YUVToRGB32(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h) {
	dst = dst + bpr * (2*h - 2);
		y_pitch	= bpr>>2;
		uv_pitch= bpr>>3;
	do {
		asm_YUVtoRGB32_row(
				(unsigned long *)(dst + bpr),
				(unsigned long *)dst,
				Y_ptr,
				Y_ptr + y_pitch,
				U_ptr,
				V_ptr,
				w);

		dst = dst - 2*bpr;
		Y_ptr = Y_ptr + 2*y_pitch;
		U_ptr = U_ptr + uv_pitch;
		V_ptr = V_ptr + uv_pitch;
	} while(--h);

	if (MMX_enabled)
		__asm emms
}

// YUVtoRGB wrapper
int YUVtoRGB(YUVimage *inp, unsigned char *dst)
{
	int oxsize,oysize;

	//Sanity checks
	if (!inp)
		return 0;
	oxsize = inp->Yxsize;
	oysize = inp->Yysize;

	if (inp->Yysize <= 0 || inp->Yysize >= 1024)
		return 0;
	if (inp->Cysize <= 0 || inp->Cysize >= 1024)
		return 0;
	if (inp->Y==NULL || inp->U==NULL || inp->V==NULL)
		return 0;
	YUVToRGB32(inp->Y, inp->U,inp->V, (unsigned char *)dst, oxsize*4, oxsize>>1, oysize>>1); 
	return 1;
}

static void YUVToRGB24(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h) {
	dst = dst + bpr * (2*h - 2);

	do {
		asm_YUVtoRGB24_row(
				(unsigned long *)(dst + bpr),
				(unsigned long *)dst,
				Y_ptr,
				Y_ptr + y_pitch,
				U_ptr,
				V_ptr,
				w);

		dst = dst - 2*bpr;
		Y_ptr = Y_ptr + 2*y_pitch;
		U_ptr = U_ptr + uv_pitch;
		V_ptr = V_ptr + uv_pitch;
	} while(--h);

	if (MMX_enabled)
		__asm emms
}
static void YUVToRGB16(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h) {
	dst = dst + bpr * (2*h - 2);

	do {
		asm_YUVtoRGB16_row(
				(unsigned long *)(dst + bpr),
				(unsigned long *)dst,
				Y_ptr,
				Y_ptr + y_pitch,
				U_ptr,
				V_ptr,
				w);

		dst = dst - 2*bpr;
		Y_ptr = Y_ptr + 2*y_pitch;
		U_ptr = U_ptr + uv_pitch;
		V_ptr = V_ptr + uv_pitch;
	} while(--h);

	if (MMX_enabled)
		__asm emms
}

static void __declspec(naked) YUVtoUYVY16_MMX(
				YUVPixel *Y_ptr,		// [esp+4+16]
				YUVPixel *U_ptr,		// [esp+8+16]
				YUVPixel *V_ptr,		// [esp+12+16]
				unsigned char *dst,		// [esp+16+16]
				long bpr,				// [esp+20+16]
				long w,					// [esp+24+16]
				long h) {				// [esp+28+16]

	__asm {
		push		ebp
		push		edi
		push		esi
		push		ebx

		mov			edx,[esp+24+16]			;load width (mult of 8)

		mov			ebx,[esp+8+16]			;load source U ptr
		mov			ecx,[esp+12+16]			;load source V ptr
		mov			eax,[esp+4+16]			;load source Y ptr
		mov			edi,[esp+16+16]			;load destination ptr
		mov			esi,[esp+20+16]			;load destination pitch
		mov			ebp,[esp+28+16]			;load height

		lea			ebx,[ebx+edx]			;bias pointers
		lea			ecx,[ecx+edx]			;(we count from -n to 0)
		lea			eax,[eax+edx*2]
		lea			edi,[edi+edx*4]

		neg			edx
		mov			[esp+24+16],edx
xyloop:
		movq		mm0,[ebx+edx]			;U0-U7

		movq		mm7,[ecx+edx]			;V0-V7
		movq		mm2,mm0					;U0-U7

		movq		mm4,[eax+edx*2]
		punpcklbw	mm0,mm7					;[V3][U3][V2][U2][V1][U1][V0][U0]

		movq		mm5,[eax+edx*2+8]
		punpckhbw	mm2,mm7					;[V7][U7][V6][U6][V5][U5][V4][U4]

		movq		mm1,mm0
		punpcklbw	mm0,mm4					;[Y3][V1][Y2][U1][Y1][V0][Y0][U0]

		punpckhbw	mm1,mm4					;[Y7][V3][Y6][U3][Y5][V2][Y4][U2]
		movq		mm3,mm2

		movq		[edi+edx*4+ 0],mm0
		punpcklbw	mm2,mm5					;[YB][V5][YA][U5][Y9][V4][Y8][U4]

		movq		[edi+edx*4+ 8],mm1
		punpckhbw	mm3,mm5					;[YF][V7][YE][U7][YD][V6][YC][U6]

		movq		[edi+edx*4+16],mm2

		movq		[edi+edx*4+24],mm3

		add			edx,8
		jnc			xyloop

		mov			edx,[esp+24+16]			;reload width counter

		test		ebp,1					;update U/V row every other row only
		jz			oddline

		sub			ebx,edx					;advance U pointer
		sub			ecx,edx					;advance V pointer

oddline:
		sub			eax,edx					;advance Y pointer
		sub			eax,edx					;advance Y pointer

		add			edi,esi					;advance dest ptr

		dec			ebp
		jne			xyloop

		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		emms
		ret
	}
}

static void YUVToUYVY16(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h) {
	if (MMX_enabled) {
		YUVtoUYVY16_MMX(Y_ptr, U_ptr, V_ptr, dst, bpr, w, h*2);
		return;
	}

	do {
		int wt;

		wt = w;
		do {
			*dst++ = *U_ptr++;
			*dst++ = *Y_ptr++;
			*dst++ = *V_ptr++;
			*dst++ = *Y_ptr++;
		} while(--wt);

		U_ptr = U_ptr - uv_pitch;
		V_ptr = V_ptr - uv_pitch;
		
		wt = w;
		do {
			*dst++ = *U_ptr++;
			*dst++ = *Y_ptr++;
			*dst++ = *V_ptr++;
			*dst++ = *Y_ptr++;
		} while(--wt);

	} while(--h);
}

static void __declspec(naked) YUVtoYUYV16_MMX(
				YUVPixel *Y_ptr,		// [esp+4+16]
				YUVPixel *U_ptr,		// [esp+8+16]
				YUVPixel *V_ptr,		// [esp+12+16]
				unsigned char *dst,		// [esp+16+16]
				long bpr,				// [esp+20+16]
				long w,					// [esp+24+16]
				long h) {				// [esp+28+16]

	__asm {
		push		ebp
		push		edi
		push		esi
		push		ebx

		mov			edx,[esp+24+16]			;multiply width by 8

		mov			ebx,[esp+8+16]			;load source U ptr
		mov			ecx,[esp+12+16]			;load source V ptr
		mov			eax,[esp+4+16]			;load source Y ptr
		mov			edi,[esp+16+16]			;load destination ptr
		mov			esi,[esp+20+16]			;load destination pitch
		mov			ebp,[esp+28+16]			;load height

		lea			ebx,[ebx+edx]			;bias pointers
		lea			ecx,[ecx+edx]			;(we count from -n to 0)
		lea			eax,[eax+edx*2]
		lea			edi,[edi+edx*4]

		neg			edx
		mov			[esp+24+16],edx
xyloop:
		movq		mm0,[ebx+edx]			;U0-U7

		movq		mm7,[ecx+edx]			;V0-V7
		movq		mm1,mm0					;U0-U7

		movq		mm2,[eax+edx*2]			;Y0-Y7
		punpcklbw	mm0,mm7					;[V3][U3][V2][U2][V1][U1][V0][U0]

		movq		mm4,[eax+edx*2+8]		;Y8-YF
		punpckhbw	mm1,mm7					;[V7][U7][V6][U6][V5][U5][V4][U4]

		movq		mm3,mm2
		punpcklbw	mm2,mm0					;[V1][Y3][U1][Y2][V0][Y1][U0][Y0]

		movq		mm5,mm4
		punpckhbw	mm3,mm0					;[V3][Y7][U3][Y6][V2][Y5][U2][Y4]

		movq		[edi+edx*4+ 0],mm2
		punpcklbw	mm4,mm1					;[V5][YB][U5][YA][V4][Y9][U4][Y8]

		movq		[edi+edx*4+ 8],mm3
		punpckhbw	mm5,mm1					;[V7][YF][U7][YE][V6][YD][U6][YC]

		movq		[edi+edx*4+16],mm4

		movq		[edi+edx*4+24],mm5
		add			edx,8

		jnc			xyloop

		mov			edx,[esp+24+16]			;reload width counter

		test		ebp,1					;update U/V row every other row only
		jz			oddline

		sub			ebx,edx					;advance U pointer
		sub			ecx,edx					;advance V pointer

oddline:
		sub			eax,edx					;advance Y pointer
		sub			eax,edx					;advance Y pointer

		add			edi,esi					;advance dest ptr

		dec			ebp
		jne			xyloop

		pop			ebx
		pop			esi
		pop			edi
		pop			ebp
		emms
		ret
	}
}




static void YUVToYUYV16(YUVPixel *Y_ptr, YUVPixel *U_ptr, YUVPixel *V_ptr, unsigned char *dst, long bpr, long w, long h) {

	if (MMX_enabled) {
		YUVtoYUYV16_MMX(Y_ptr, U_ptr, V_ptr, dst, bpr, w, h*2);
		return;
	}

	do {
		int wt;

		wt = w;
		do {
			*dst++ = *Y_ptr++;
			*dst++ = *U_ptr++;
			*dst++ = *Y_ptr++;
			*dst++ = *V_ptr++;
		} while(--wt);

		U_ptr = U_ptr - uv_pitch;
		V_ptr = V_ptr - uv_pitch;
		
		wt = w;
		do {
			*dst++ = *Y_ptr++;
			*dst++ = *U_ptr++;
			*dst++ = *Y_ptr++;
			*dst++ = *V_ptr++;
		} while(--wt);

	} while(--h);
}

// Image manipulation
#define ASM_COPY  __asm             \
{									\
__asm			mov ecx, runsize    \
__asm			mov esi, i          \
__asm           mov edi, o          \
__asm			rep movsd           \
}


/*{							  	    \
__asm	 	    mov ecx, runsize    \
__asm			mov ebx, o          \
__asm			mov edx, i			\
__asm			mov eax, [edx]      \
__asm			mov [ebx], eax      \
__asm			add edx,4           \
__asm			add ebx,4           \
__asm           sub ecx,1           \
__asm           cmp ecx,0           \
__asm			jnz $-16            \
}
*/
void image_zero(YUVImageTag *input){
	memset(input->Y, 0, input->Yxsize*input->Yysize );
	memset(input->U, 0, input->Cxsize*input->Cysize );
	memset(input->V, 0, input->Cxsize*input->Cysize );
}
void image_copy(YUVImageTag *output, YUVImageTag *input)
{
	int runsize, *i, *o;
	if(output==NULL || input==NULL) return;

	runsize = (input->Yxsize*input->Yysize) >> 2;
	o = (int *) output->Y;
	i = (int *) input->Y;
	ASM_COPY;
        
	runsize = (input->Cxsize*input->Cysize) >> 2;
	o = (int *) output->U;
	i = (int *) input->U;
	ASM_COPY;

	o = (int *) output->V;
	i = (int *) input->V;
	ASM_COPY;


/*	run_size = (input->Cxsize*input->Cysize) >> 2;
	asm_copy((int *) output->U, (int *) input->U, run_size);
	asm_copy((int *) output->V, (int *) input->V, run_size);*/
/*	memcpy( output->Y, input->Y, input->Yxsize*input->Yysize );
	memcpy( output->U, input->U, input->Cxsize*input->Cysize );
	memcpy( output->V, input->V, input->Cxsize*input->Cysize );*/
	return;
}
void field_copy_swap(YUVImageTag *output, YUVImageTag *input,  int field){
	int y,first, runsize;
	int *i, *o;
	int swap;
	if(output==NULL || input==NULL) return;
	
	if (field==1)
		first=0;
	else
		first=1;

	swap = first ? -1 : 1;
	
	runsize = (input->Yxsize) >> 2;
	for(y=first; y<input->Yysize; y=y+2){
		o = (int *) &output->Y[(y+swap)*output->Yxsize];
		i = (int *)  &input->Y[y*output->Yxsize];
		ASM_COPY;
	}

	runsize = (input->Cxsize) >> 2;
	for(y=first; y<input->Cysize; y=y+2){
		o = (int *) &output->U[(y+swap)*output->Cxsize];
		i = (int *)  &input->U[y*output->Cxsize];
		ASM_COPY;
	}

	runsize = (input->Cxsize) >> 2;
	for(y=first; y<input->Cysize; y=y+2){
		o = (int *) &output->V[(y+swap)*output->Cxsize];
		i = (int *)  &input->V[y*output->Cxsize];
		ASM_COPY;
	}
	return;
}
void field_copy(YUVImageTag *output, YUVImageTag *input, int field)
{
	int y,first, runsize;
	int *i, *o;
	if(output==NULL || input==NULL) return;
	
	if (field==1)
		first=0;
	else
		first=1;
	
	runsize = (input->Yxsize) >> 2;
	for(y=first; y<input->Yysize; y=y+2){
		o = (int *) &output->Y[y*output->Yxsize];
		i = (int *)  &input->Y[y*output->Yxsize];
		ASM_COPY;
	}

	runsize = (input->Cxsize) >> 2;
	for(y=first; y<input->Cysize; y=y+2){
		o = (int *) &output->U[y*output->Cxsize];
		i = (int *)  &input->U[y*output->Cxsize];
		ASM_COPY;
	}

	runsize = (input->Cxsize) >> 2;
	for(y=first; y<input->Cysize; y=y+2){
		o = (int *) &output->V[y*output->Cxsize];
		i = (int *)  &input->V[y*output->Cxsize];
		ASM_COPY;
	}

/*	for(y=first; y<input->Cysize; y=y+2){
		for(x=0; x<input->Cxsize; x++)
			output->U[y*output->Cxsize + x] = input->U[y*input->Cxsize + x];
	}
	for(y=first; y<input->Cysize; y=y+2){
		for(x=0; x<input->Cxsize; x++)
			output->V[y*output->Cxsize + x] = input->V[y*input->Cxsize + x];
	}
*/
	return;
}