/* 
 *  yuvresizer.cpp
 *
 *	Copyright (C) Alberto Vigata - ultraflask@yahoo.com - January 2000
 *
 *  This file is part of FlasKMPEG, a free MPEG to MPEG/AVI converter
 *	
 *  FlasKMPEG is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *   
 *  FlasKMPEG is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *   
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */



#include <math.h>
#include "..\FormatDefs.h"

// Fixed-point position/step value used by the nearest-neighbour resizer.
// The same storage can be accessed as one 64-bit integer (nStep) or as two
// separate halves (nStepLo / nStepHi).
// NOTE(review): the code in this file adds a fractional increment to nStep and
// then reads nStepHi as the integer pixel index, so it assumes nStepLo aliases
// the low 32 bits and nStepHi the high 32 bits of nStep (32-bit long,
// little-endian, as with MSVC on x86) - confirm before porting.
union Stepper
{
  // Whole value: integer part in the upper half, fraction in the lower half.
  __int64 nStep;
  struct
  {
    // Fractional part (written by NearestInitialize as fraction * 2^32).
    unsigned long nStepLo;
    // Integer part (whole source pixels).
    long nStepHi;
  };
};

//FIXME
// This should not be global
// Per-axis source step for one destination pixel. Both values are written by
// NearestInitialize() and read by NearestRun(), so NearestInitialize() has to
// be called before NearestRun() and the pair is shared by every caller in
// this translation unit.
static Stepper xStep, yStep;

// Prepares the file-level xStep / yStep values used by NearestRun().
//
// For each axis the step is the source/destination size ratio, split into
//   nStepHi : whole source pixels per destination pixel
//   nStepLo : the remaining fraction, scaled by 2^32
//
// nSrcWidth, nSrcHeight : dimensions of the source image
// nDstWidth, nDstHeight : dimensions of the destination image
//
// When either destination dimension is zero nothing is written, which leaves
// xStep / yStep at whatever they held before the call.
void NearestInitialize(long nSrcWidth, long nSrcHeight, long nDstWidth, long nDstHeight )
{
  // A zero destination size would divide by zero below.
  if( nDstWidth == 0 || nDstHeight == 0 )
    return;

  // Scaling ratio of each axis as a real number.
  const double dHorzRatio = (double)nSrcWidth  / (double)nDstWidth;
  const double dVertRatio = (double)nSrcHeight / (double)nDstHeight;

  // Keep only what lies behind the decimal point.
  const double dHorzFrac = dHorzRatio - floor(dHorzRatio);
  const double dVertFrac = dVertRatio - floor(dVertRatio);

  // Integer part of the step: plain integer division.
  xStep.nStepHi = nSrcWidth  / nDstWidth;
  yStep.nStepHi = nSrcHeight / nDstHeight;

  // Fractional part of the step, scaled by 2^32 (4294967296).
  xStep.nStepLo = (unsigned long) (dHorzFrac * 4294967296.0);
  yStep.nStepLo = (unsigned long) (dVertFrac * 4294967296.0);
}

// Nearest-neighbour resize of one plane.
//
// Walks the destination row by row and copies, for every destination pixel,
// the source pixel selected by a fixed-point position. The per-pixel steps
// are taken from the file-level xStep / yStep, so NearestInitialize() must
// have been called with the matching dimensions beforehand.
//
// src, dst               : source and destination planes
// nSrcWidth, nSrcHeight  : not consulted here (the steps already encode them)
// nDstWidth, nDstHeight  : number of pixels per row / rows to produce
// nSrcPitch, nDestPitch  : distance in Pixel8 elements between row starts
//
// Returns without writing anything when a buffer is null or when a
// destination dimension is zero or negative. (Negative sizes used to slip
// past the zero-only check and made the count-down loops run through the
// whole 32-bit range, writing far beyond dst.)
void NearestRun( Pixel8 *src, Pixel8 *dst, long nSrcWidth, long nSrcHeight, 
                       long nDstWidth, long nDstHeight, long nSrcPitch, long nDestPitch )
{
  if( !src || !dst || nDstWidth <= 0 || nDstHeight <= 0 )
    return;

  // Snapshot of the precomputed steps: fraction (low half) and whole pixels
  // (high half) for each axis.
  const unsigned long x_frac_inc = xStep.nStepLo;
  const unsigned long x_inte_inc = xStep.nStepHi;
  const unsigned long y_frac_inc = yStep.nStepLo;
  const unsigned long y_inte_inc = yStep.nStepHi;

  // Current vertical position in the source.
  Stepper yPos;
  yPos.nStep = 0;

  for( long row = 0; row < nDstHeight; ++row )
  {
    // nStepHi is the integer source row for this destination row.
    const Pixel8 *pSrcRow = src + (nSrcPitch * yPos.nStepHi);
    // Row start derived from dst, so no pointer is advanced past the last row.
    Pixel8 *pOut = dst + (row * nDestPitch);

    // Horizontal position restarts at the left edge on every row.
    Stepper xPos;
    xPos.nStep = 0;

    for( long col = 0; col < nDstWidth; ++col )
    {
      *pOut++ = pSrcRow[xPos.nStepHi];

      // Adding the fraction to the 64-bit view carries into nStepHi when the
      // fraction overflows; the whole-pixel part is then added separately.
      xPos.nStep   += x_frac_inc;
      xPos.nStepHi += x_inte_inc;
    }

    yPos.nStep   += y_frac_inc;
    yPos.nStepHi += y_inte_inc;
  }
}


void BilinearRun_Plane( unsigned char *src, unsigned char *dst, long nSrcWidth, long nSrcHeight, 
                        long nDstWidth, long nDstHeight, long nSrcPitch, long nDstPitch )
{

// Simple bilinear resize, mmx optimised
// This function resize src, giving dst, bilinear algorithm used

//		A    |          B                E
//		     |
//		     | d2
//		     |
//		  d1 |
//		-----|         xint + xfrac
//		      P-------------------------P2
// 
//		C               D                F
// 
// 
//    P =  ( A*(1-d1) + B*d1 ) * (1-d2)   + ( C*(1-d1) + D*d1 ) * d2

// to avoid floating point computations, x and y resize ratios are multiplied by 16384
// so integer mmx can be used

static	unsigned long  rounder[2]={0x2000, 0x2000};
static	unsigned short init[4]={0x4000,0,0x4000,0};
static	unsigned short et1[4]={0xffff,0x3fff,0xffff,0x3fff};

	unsigned long SrcPtr;
	unsigned long xint,yint;
  unsigned long dstmod = nDstPitch - nDstWidth;

  __asm{
  
  mov esi,src
  mov SrcPtr,esi
  
  mov eax,nSrcHeight
  mov ecx,nDstHeight

  // compute xratio*16384
  mov eax, nSrcWidth
  dec eax			; nSrcWidth-1
  xor edx,edx
  mov ebx, nDstWidth
  dec ebx			; nDstWidth-1
  div ebx			; (nSrcWidth-1)/(nDstWidth-1)=xratio
  mov xint,eax		; int part of xratio
  xor eax,eax
  div bx			
  shr ax,2			; ax=frac part of xratio*16384=xr
  mov cx,ax
  neg cx			; -xr
  shl eax,16
  mov ax,cx
  movd mm7,eax		; mm7=0,0,xr,-xr
  punpckldq mm7,mm7 ; mm7=xr,-xr,xr,-xr  
 
  // compute yratio*16384
  mov eax, nSrcHeight
  dec eax			; nSrcHeight-1
  xor edx, edx
  mov ebx, nDstHeight
  dec ebx			; nDstHeight-1
  div ebx			; (nSrcHeight-1)/(nDstHeight-1)=yratio
  mov yint,eax		; int part of yratio
  xor eax,eax
  div bx			
  shr ax,2			; ax=frac part of yratio*16384=yr
  mov cx,ax
  neg cx			; -yr
  shl eax,16
  mov ax,cx
  movd mm6,eax		; mm6=0,0,yr,-yr

  mov eax,yint
  mul nSrcPitch
  mov ebx,eax		; yint*nSrcPitch

  movq mm3,rounder
  movq mm5,init
  mov edi,dst		; destination
  pxor mm1,mm1		; mm1=0
  movq mm3,rounder
resize_yloop:
  mov esi,SrcPtr	; source
  mov ecx,nDstWidth	; count
  movq mm4,init		; current xr
  
resize_xloop:
  mov edx,nSrcPitch
  movzx eax,word ptr[esi] // eax=00BA
  movzx edx,word ptr[esi+edx] // edx=00DC
  shl edx,16		; edx=DC00
  or edx,eax		; edx=DCBA

  movd mm0,edx		; mm0=0000DCBA
  punpcklbw mm0,mm1 ; mm0=0D0C0B0A

  pmaddwd mm0,mm4	; mm0= d*xr-c*(1-xr),b*xr-a*(1-xr)
  paddd mm0,mm3		; + rounder
  psrld mm0,14		; shift (divide by 16384)
 
  packssdw mm0,mm0
  add esi,xint		; next x point (next loop)
 
  pmaddwd mm0,mm5	; mm0=(d*xr+c*(1-xr))*yr+(b*xr+a*(1-xr))*(1-yr)
  paddw mm4,mm7		; next x interpol  (next loop)

  movd eax,mm0
  add eax,8192		; +rounder

  shr eax,14		; shift (divide by 16384)

  mov byte ptr [edi],al ; write new point
  inc edi
    
  dec ecx
  je resize_end_xloop
 
  movd eax,mm4		
  or ax,ax			
  jg resize_xloop	; if frac part <1

  inc esi	 		; next source point
  pand mm4,et1		; frac part -1
  paddw mm4,init
  
  jmp resize_xloop

resize_end_xloop:
  
// go to next src line

  add edi, dstmod
  paddw mm5,mm6
  add SrcPtr,ebx ; + yint*nSrcPitch
  movd eax,mm5
  or ax,ax
  jg notbig
  mov eax,nSrcPitch
  add SrcPtr,eax

  pand mm5,et1	; frac part -1
  paddw mm5,init
notbig: 
  dec nDstHeight ; 1 more line done
  jg resize_yloop
 
// done
   emms

  } // _asm
}
// Bilinear resize entry point for a Pixel8 plane.
// Forwards all eight arguments, unchanged and in the same order, to
// BilinearRun_Plane(); it adds no processing of its own.
void BilinearRun(Pixel8 *src, Pixel8 *dst, long nSrcWidth, long nSrcHeight, 
                 long nDstWidth, long nDstHeight, long nSrcPitch, long nDstPitch  )
{
  BilinearRun_Plane( src, dst, nSrcWidth, nSrcHeight, nDstWidth, nDstHeight, nSrcPitch, nDstPitch );
}
