postprocess.c


1111000
1110111

*/
//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%0, %1, 4),(%%ebx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%%ebx),(%%ebx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)


		: : "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else

//FIXME
#endif
}

/**
 * Linear-interpolating deinterlacer for one 8 pixel wide block.
 * Lines 1, 3, 5 and 7 (counted from src) are replaced by the average of the
 * line directly above and the line directly below; the even lines are only read.
 * The last output line needs line 8, so the routine reads 9 lines in total.
 * It is meant to be called for every 8x8 block and may read & write inside an
 * 8x16 block.
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0				\n\t" // line 0
		"movq (%%eax, %1), %%mm1			\n\t" // line 2
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%eax)				\n\t" // -> line 1
		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%eax, %1, 2)			\n\t" // -> line 3
		"movq (%%ebx, %1), %%mm1			\n\t" // line 6
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%ebx)				\n\t" // -> line 5
		"movq (%0, %1, 8), %%mm0			\n\t" // line 8
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%ebx, %1, 2)			\n\t" // -> line 7

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	/* portable path: truncating mean of the two neighbouring even lines */
	int col;
	for(col=0; col<8; col++)
	{
		uint8_t *pix= src + col;
		int line;
		for(line=1; line<8; line+=2)
			pix[stride*line]= (pix[stride*(line-1)] + pix[stride*(line+1)])>>1;
	}
#endif
}

/**
 * Cubic-interpolating deinterlacer for one 8 pixel wide block.
 * Lines 3, 5, 7 and 9 (counted from src) are rebuilt from the four nearest
 * even lines with the kernel (-1, 9, 9, -1)/16; the even lines are only read.
 * The last output line needs line 12, so 13 lines are touched in total.
 * It is meant to be called for every 8x8 block and may read & write inside an
 * 8x16 block.
 * The result is clamped to 0..255 in both the asm path (packuswb) and the
 * C path.
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
		"leal (%%ebx, %1, 4), %%ecx			\n\t"
		"addl %1, %%ecx					\n\t"
		"pxor %%mm7, %%mm7				\n\t"
//	0	1	2	3	4	5	6	7	8	9	10
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1 ecx

#define DEINT_CUBIC(a,b,c,d,e)\
		"movq " #a ", %%mm0				\n\t"\
		"movq " #b ", %%mm1				\n\t"\
		"movq " #d ", %%mm2				\n\t"\
		"movq " #e ", %%mm3				\n\t"\
		PAVGB(%%mm2, %%mm1)					/* (b+d) /2 */\
		PAVGB(%%mm3, %%mm0)					/* a(a+e) /2 */\
		"movq %%mm0, %%mm2				\n\t"\
		"punpcklbw %%mm7, %%mm0				\n\t"\
		"punpckhbw %%mm7, %%mm2				\n\t"\
		"movq %%mm1, %%mm3				\n\t"\
		"punpcklbw %%mm7, %%mm1				\n\t"\
		"punpckhbw %%mm7, %%mm3				\n\t"\
		"psubw %%mm1, %%mm0				\n\t"	/* L(a+e - (b+d))/2 */\
		"psubw %%mm3, %%mm2				\n\t"	/* H(a+e - (b+d))/2 */\
		"psraw $3, %%mm0				\n\t"	/* L(a+e - (b+d))/16 */\
		"psraw $3, %%mm2				\n\t"	/* H(a+e - (b+d))/16 */\
		"psubw %%mm0, %%mm1				\n\t"	/* L(9b + 9d - a - e)/16 */\
		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
		"packuswb %%mm3, %%mm1				\n\t"\
		"movq %%mm1, " #c "				\n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx", "ecx"
	);
#else
	int x;
	for(x=0; x<8; x++)
	{
		int line;
		/* only even lines are read and only odd lines written, so the
		   four outputs are independent of each other */
		for(line=3; line<=9; line+=2)
		{
			int sum=  9*(src[stride*(line-1)] + src[stride*(line+1)])
				-   (src[stride*(line-3)] + src[stride*(line+3)]);
			int v;

			/* the kernel over/undershoots at sharp edges; saturate instead of
			   letting the uint8_t store wrap around (and never shift a
			   negative value) */
			if(sum < 0)	v= 0;
			else
			{
				v= sum>>4;
				if(v > 255) v= 255;
			}
			src[stride*line]= v;
		}
		src++;
	}
#endif
}

/**
 * Linear-blend deinterlacer for one 8 pixel wide block.
 * Every one of the lines 0..7 is replaced by a (1, 2, 1)/4 blend of itself and
 * the two lines below it, so lines 8 and 9 are read as well and the picture
 * content moves up by one line (FIXME if this is a problem).
 * It is meant to be called for every 8x8 block and may read & write inside an
 * 8x16 block.
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0				\n\t" // L0
		"movq (%%eax, %1), %%mm1			\n\t" // L2
		PAVGB(%%mm1, %%mm0)				      // L0+L2
		"movq (%%eax), %%mm2				\n\t" // L1
		PAVGB(%%mm2, %%mm0)
		"movq %%mm0, (%0)				\n\t"
		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
		PAVGB(%%mm0, %%mm2)				      // L1+L3
		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
		"movq %%mm2, (%%eax)				\n\t"
		"movq (%0, %1, 4), %%mm2			\n\t" // L4
		PAVGB(%%mm2, %%mm1)				      // L2+L4
		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1)			\n\t"
		"movq (%%ebx), %%mm1				\n\t" // L5
		PAVGB(%%mm1, %%mm0)				      // L3+L5
		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2)			\n\t"
		"movq (%%ebx, %1), %%mm0			\n\t" // L6
		PAVGB(%%mm0, %%mm2)				      // L4+L6
		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4)			\n\t"
		"movq (%%ebx, %1, 2), %%mm2			\n\t" // L7
		PAVGB(%%mm2, %%mm1)				      // L5+L7
		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
		"movq %%mm1, (%%ebx)				\n\t"
		"movq (%0, %1, 8), %%mm1			\n\t" // L8
		PAVGB(%%mm1, %%mm0)				      // L6+L8
		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
		"movq %%mm0, (%%ebx, %1)			\n\t"
		"movq (%%ebx, %1, 4), %%mm0			\n\t" // L9
		PAVGB(%%mm0, %%mm2)				      // L7+L9
		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
		"movq %%mm2, (%%ebx, %1, 2)			\n\t"


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	/* portable path: walk each column top to bottom; a line is overwritten
	   only after every blend that needs its old value has been computed */
	int col;
	for(col=0; col<8; col++)
	{
		uint8_t *pix= src + col;
		int line;
		for(line=0; line<8; line++, pix+=stride)
			pix[0]= (pix[0] + 2*pix[stride] + pix[stride*2])>>2;
	}
#endif
}

/**
 * Median deinterlacer for one 8 pixel wide block.
 * Lines 1, 3, 5 and 7 (counted from src) are replaced by the median of the
 * line above, the line itself and the line below; the even lines are only
 * read. The last output line needs line 8, so 9 lines are touched in total.
 * It is meant to be called for every 8x8 block, except the last row, and may
 * read & write inside an 8x16 block.
 * All three code paths (MMX2, plain MMX, C) compute the same 3-tap median.
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
#ifdef HAVE_MMX2
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0				\n\t" //
		"movq (%%eax, %1), %%mm2			\n\t" //
		"movq (%%eax), %%mm1				\n\t" //
		"movq %%mm0, %%mm3				\n\t"
		"pmaxub %%mm1, %%mm0				\n\t" //
		"pminub %%mm3, %%mm1				\n\t" //
		"pmaxub %%mm2, %%mm1				\n\t" //
		"pminub %%mm1, %%mm0				\n\t"
		"movq %%mm0, (%%eax)				\n\t"

		"movq (%0, %1, 4), %%mm0			\n\t" //
		"movq (%%eax, %1, 2), %%mm1			\n\t" //
		"movq %%mm2, %%mm3				\n\t"
		"pmaxub %%mm1, %%mm2				\n\t" //
		"pminub %%mm3, %%mm1				\n\t" //
		"pmaxub %%mm0, %%mm1				\n\t" //
		"pminub %%mm1, %%mm2				\n\t"
		"movq %%mm2, (%%eax, %1, 2)			\n\t"

		"movq (%%ebx), %%mm2				\n\t" //
		"movq (%%ebx, %1), %%mm1			\n\t" //
		"movq %%mm2, %%mm3				\n\t"
		"pmaxub %%mm0, %%mm2				\n\t" //
		"pminub %%mm3, %%mm0				\n\t" //
		"pmaxub %%mm1, %%mm0				\n\t" //
		"pminub %%mm0, %%mm2				\n\t"
		"movq %%mm2, (%%ebx)				\n\t"

		"movq (%%ebx, %1, 2), %%mm2			\n\t" //
		"movq (%0, %1, 8), %%mm0			\n\t" //
		"movq %%mm2, %%mm3				\n\t"
		"pmaxub %%mm0, %%mm2				\n\t" //
		"pminub %%mm3, %%mm0				\n\t" //
		"pmaxub %%mm1, %%mm0				\n\t" //
		"pminub %%mm0, %%mm2				\n\t"
		"movq %%mm2, (%%ebx, %1, 2)			\n\t"


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);

#else // MMX without MMX2
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pxor %%mm7, %%mm7				\n\t"

#define MEDIAN(a,b,c)\
		"movq " #a ", %%mm0				\n\t"\
		"movq " #b ", %%mm2				\n\t"\
		"movq " #c ", %%mm1				\n\t"\
		"movq %%mm0, %%mm3				\n\t"\
		"movq %%mm1, %%mm4				\n\t"\
		"movq %%mm2, %%mm5				\n\t"\
		"psubusb %%mm1, %%mm3				\n\t"\
		"psubusb %%mm2, %%mm4				\n\t"\
		"psubusb %%mm0, %%mm5				\n\t"\
		"pcmpeqb %%mm7, %%mm3				\n\t"\
		"pcmpeqb %%mm7, %%mm4				\n\t"\
		"pcmpeqb %%mm7, %%mm5				\n\t"\
		"movq %%mm3, %%mm6				\n\t"\
		"pxor %%mm4, %%mm3				\n\t"\
		"pxor %%mm5, %%mm4				\n\t"\
		"pxor %%mm6, %%mm5				\n\t"\
		"por %%mm3, %%mm1				\n\t"\
		"por %%mm4, %%mm2				\n\t"\
		"por %%mm5, %%mm0				\n\t"\
		"pand %%mm2, %%mm0				\n\t"\
		"pand %%mm1, %%mm0				\n\t"\
		"movq %%mm0, " #b "				\n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#endif // MMX
#else
	int x;
	for(x=0; x<8; x++)
	{
		int line;
		/* inputs above/below are even lines, which are never written, so the
		   four medians do not depend on each other */
		for(line=1; line<8; line+=2)
		{
			int above=  src[stride*(line-1)];
			int center= src[stride* line   ];
			int below=  src[stride*(line+1)];
			int lo= above<center ? above  : center;
			int hi= above<center ? center : above;

			/* median = min(max(above,center), max(below, min(above,center))),
			   the same min/max network the MMX2 path uses */
			if(below > lo) lo= below;
			src[stride*line]= lo<hi ? lo : hi;
		}
		src++;
	}
#endif
}

#ifdef HAVE_MMX
/**
 * transposes and shift the given 8x8 Block into dst1 and dst2
 *
 * Reads 8 lines of 8 bytes from src (line pitch srcStride) and writes every
 * source column as one 8 byte run (two 4 byte stores: source lines 0-3 first,
 * lines 4-7 at +4). The runs are stored 16 bytes apart:
 *   columns 0..4 go to dst1 at byte offsets 128, 144, 160, 176, 192
 *   columns 3..7 go to dst2 at byte offsets  48,  64,  80,  96, 112
 * so columns 3 and 4 are written to both destinations.
 * Operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2.
 * NOTE(review): the asm stores through dst1/dst2 but declares no outputs and
 * no "memory" clobber, and the MMX registers it uses are not listed as
 * clobbered - confirm this is safe with the compilers in use.
 */
static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
{
	asm(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
// source lines 0..3: interleave bytes of lines 0/1 and 2/3 ...
		"movq (%0), %%mm0		\n\t" // 12345678
		"movq (%%eax), %%mm1		\n\t" // abcdefgh
		"movq %%mm0, %%mm2		\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h

		"movq (%%eax, %1), %%mm1	\n\t"
		"movq (%%eax, %1, 2), %%mm3	\n\t"
		"movq %%mm1, %%mm4		\n\t"
		"punpcklbw %%mm3, %%mm1		\n\t"
		"punpckhbw %%mm3, %%mm4		\n\t"

// ... then interleave words: afterwards each 32 bit half holds one column
// (mm0 = columns 0,1  mm3 = columns 2,3  mm2 = columns 4,5  mm1 = columns 6,7)
		"movq %%mm0, %%mm3		\n\t"
		"punpcklwd %%mm1, %%mm0		\n\t"
		"punpckhwd %%mm1, %%mm3		\n\t"
		"movq %%mm2, %%mm1		\n\t"
		"punpcklwd %%mm4, %%mm2		\n\t"
		"punpckhwd %%mm4, %%mm1		\n\t"

		"movd %%mm0, 128(%2)		\n\t"
		"psrlq $32, %%mm0		\n\t"
		"movd %%mm0, 144(%2)		\n\t"
		"movd %%mm3, 160(%2)		\n\t"
		"psrlq $32, %%mm3		\n\t"
		"movd %%mm3, 176(%2)		\n\t"
		"movd %%mm3, 48(%3)		\n\t"
		"movd %%mm2, 192(%2)		\n\t"
		"movd %%mm2, 64(%3)		\n\t"
		"psrlq $32, %%mm2		\n\t"
		"movd %%mm2, 80(%3)		\n\t"
		"movd %%mm1, 96(%3)		\n\t"
		"psrlq $32, %%mm1		\n\t"
		"movd %%mm1, 112(%3)		\n\t"

// source lines 4..7: same shuffle, results stored 4 bytes further right
		"movq (%0, %1, 4), %%mm0	\n\t" // 12345678
		"movq (%%ebx), %%mm1		\n\t" // abcdefgh
		"movq %%mm0, %%mm2		\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h

		"movq (%%ebx, %1), %%mm1	\n\t"
		"movq (%%ebx, %1, 2), %%mm3	\n\t"
		"movq %%mm1, %%mm4		\n\t"
		"punpcklbw %%mm3, %%mm1		\n\t"
		"punpckhbw %%mm3, %%mm4		\n\t"

		"movq %%mm0, %%mm3		\n\t"
		"punpcklwd %%mm1, %%mm0		\n\t"
		"punpckhwd %%mm1, %%mm3		\n\t"
		"movq %%mm2, %%mm1		\n\t"
		"punpcklwd %%mm4, %%mm2		\n\t"
		"punpckhwd %%mm4, %%mm1		\n\t"

		"movd %%mm0, 132(%2)		\n\t"
		"psrlq $32, %%mm0		\n\t"
		"movd %%mm0, 148(%2)		\n\t"
		"movd %%mm3, 164(%2)		\n\t"
		"psrlq $32, %%mm3		\n\t"
		"movd %%mm3, 180(%2)		\n\t"
		"movd %%mm3, 52(%3)		\n\t"
		"movd %%mm2, 196(%2)		\n\t"
		"movd %%mm2, 68(%3)		\n\t"
		"psrlq $32, %%mm2		\n\t"
		"movd %%mm2, 84(%3)		\n\t"
		"movd %%mm1, 100(%3)		\n\t"
		"psrlq $32, %%mm1		\n\t"
		"movd %%mm1, 116(%3)		\n\t"


	:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
	: "%eax", "%ebx"
	);
}

/**
 * transposes the given 8x8 block
 *
 * Reads 8 lines of 8 bytes from src at byte offsets 0, 16, ..., 112 (i.e. a
 * fixed 16 byte line pitch) and writes the transposed block to the 8 lines
 * starting at dst (line pitch dstStride). Each destination line is written
 * with two 4 byte stores: source lines 0-3 give bytes 0..3, source lines 4-7
 * give bytes 4..7.
 * Operands: %0 = dst, %1 = dstStride, %2 = src.
 * NOTE(review): the asm stores through dst but declares no outputs and no
 * "memory" clobber, and the MMX registers it uses are not listed as
 * clobbered - confirm this is safe with the compilers in use.
 */
static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
{
	asm(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
// (the table above lists the addresses of the destination lines)
// source lines 0..3 -> bytes 0..3 of every destination line
		"movq (%2), %%mm0		\n\t" // 12345678
		"movq 16(%2), %%mm1		\n\t" // abcdefgh
		"movq %%mm0, %%mm2		\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h

		"movq 32(%2), %%mm1		\n\t"
		"movq 48(%2), %%mm3		\n\t"
		"movq %%mm1, %%mm4		\n\t"
		"punpcklbw %%mm3, %%mm1		\n\t"
		"punpckhbw %%mm3, %%mm4		\n\t"

		"movq %%mm0, %%mm3		\n\t"
		"punpcklwd %%mm1, %%mm0		\n\t"
		"punpckhwd %%mm1, %%mm3		\n\t"
		"movq %%mm2, %%mm1		\n\t"
		"punpcklwd %%mm4, %%mm2		\n\t"
		"punpckhwd %%mm4, %%mm1		\n\t"

		"movd %%mm0, (%0)		\n\t"
		"psrlq $32, %%mm0		\n\t"
		"movd %%mm0, (%%eax)		\n\t"
		"movd %%mm3, (%%eax, %1)	\n\t"
		"psrlq $32, %%mm3		\n\t"
		"movd %%mm3, (%%eax, %1, 2)	\n\t"
		"movd %%mm2, (%0, %1, 4)	\n\t"
		"psrlq $32, %%mm2		\n\t"
		"movd %%mm2, (%%ebx)		\n\t"
		"movd %%mm1, (%%ebx, %1)	\n\t"
		"psrlq $32, %%mm1		\n\t"
		"movd %%mm1, (%%ebx, %1, 2)	\n\t"


// source lines 4..7 -> bytes 4..7 of every destination line
		"movq 64(%2), %%mm0		\n\t" // 12345678
		"movq 80(%2), %%mm1		\n\t" // abcdefgh
		"movq %%mm0, %%mm2		\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h

		"movq 96(%2), %%mm1		\n\t"
		"movq 112(%2), %%mm3		\n\t"
		"movq %%mm1, %%mm4		\n\t"
		"punpcklbw %%mm3, %%mm1		\n\t"
		"punpckhbw %%mm3, %%mm4		\n\t"

		"movq %%mm0, %%mm3		\n\t"
		"punpcklwd %%mm1, %%mm0		\n\t"
		"punpckhwd %%mm1, %%mm3		\n\t"
		"movq %%mm2, %%mm1		\n\t"
		"punpcklwd %%mm4, %%mm2		\n\t"
		"punpckhwd %%mm4, %%mm1		\n\t"

		"movd %%mm0, 4(%0)		\n\t"
		"psrlq $32, %%mm0		\n\t"
		"movd %%mm0, 4(%%eax)		\n\t"
		"movd %%mm3, 4(%%eax, %1)	\n\t"
		"psrlq $32, %%mm3		\n\t"
		"movd %%mm3, 4(%%eax, %1, 2)	\n\t"
		"movd %%mm2, 4(%0, %1, 4)	\n\t"
		"psrlq $32, %%mm2		\n\t"
		"movd %%mm2, 4(%%ebx)		\n\t"
		"movd %%mm1, 4(%%ebx, %1)	\n\t"
		"psrlq $32, %%mm1		\n\t"
		"movd %%mm1, 4(%%ebx, %1, 2)	\n\t"

	:: "r" (dst), "r" (dstStride), "r" (src)
	: "%eax", "%ebx"
	);
}
#endif

#ifdef HAVE_ODIVX_POSTPROCESS
#include "../opendivx/postprocess.h"
/* Run-time switch: when non-zero, postprocess(), postprocess2() and
   getPpModeForQuality() hand over to the OpenDivX code (odivx_postprocess /
   the PP_* mode flags) instead of using the filters of this file. */
int use_old_pp=0;
#endif

/* Forward declaration of the per-plane worker, which is defined further down
   in this file; postprocess() and postprocess2() call it once per plane and
   pass the plane index (0, 1 or 2) as isColor. */
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
	QP_STORE_T QPs[], int QPStride, int isColor, int mode);

/* -pp Command line Help
NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?

-pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...

long form example:
-pp vdeblock:autoq,hdeblock:autoq,linblenddeint		-pp default,-vdeblock
short form example:
-pp vb:a,hb:a,lb					-pp de,-vb

Filters			Options
short	long name	short	long option	Description
*	*		a	autoq		CPU power dependent enabler
			c	chrom		chrominance filtering enabled
			y	nochrom		chrominance filtering disabled
hb	hdeblock				horizontal deblocking filter
vb	vdeblock				vertical deblocking filter
vr	rkvdeblock
h1	x1hdeblock				Experimental horizontal deblock filter 1
v1	x1vdeblock				Experimental vertical deblock filter 1
dr	dering					not implemented yet
al	autolevels				automatic brightness / contrast fixer
			f	fullyrange	stretch luminance range to (0..255)
lb	linblenddeint				linear blend deinterlacer
li	linipoldeint				linear interpolating deinterlacer
ci	cubicipoldeint				cubic interpolating deinterlacer
md	mediandeint				median deinterlacer
de	default					hdeblock:a,vdeblock:a,dering:a,autolevels
fa	fast					x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
*/

/**
 * Parses a postprocessing filter list into a PPMode.
 *
 * name is the string after "-pp" on the command line: a ',' separated list of
 * filters, each optionally followed by ':' separated options and optionally
 * prefixed with '-' to switch the filter off again. At most
 * GET_MODE_BUFFER_SIZE-1 characters of name are used; the rest is ignored.
 * quality is a number from 0 to GET_PP_QUALITY_MAX; it is only used for
 * filters that carry the "autoq"/"a" option, all others are treated as if
 * the quality was GET_PP_QUALITY_MAX.
 *
 * Names found in replaceTable are aliases: their replacement text is spliced
 * into the working copy of the list and parsed like user input.
 *
 * Returns the PPMode by value. Its error member is 0 on success and is
 * incremented for every unknown filter name, every unknown option, every
 * empty filter name, every alias expansion that does not fit into the buffer
 * and for a NULL name.
 *
 * Note: uses strtok() for the option list, so it is not reentrant.
 */
struct PPMode getPPModeByNameAndQuality(char *name, int quality)
{
	char temp[GET_MODE_BUFFER_SIZE];
	char *p= temp;
	char *filterDelimiters= ",";
	char *optionDelimiters= ":";
	struct PPMode ppMode= {0,0,0,0,0,0};
	char *filterToken;

	if(name == NULL)
	{
		ppMode.error++;
		return ppMode;
	}

	/* strncpy() does not terminate the copy when it truncates */
	strncpy(temp, name, GET_MODE_BUFFER_SIZE-1);
	temp[GET_MODE_BUFFER_SIZE-1]= 0;

	for(;;){
		char *filterName;
		int q= GET_PP_QUALITY_MAX;
		int chrom=-1;
		char *option;
		char *options[OPTIONS_ARRAY_SIZE];
		int i;
		int filterNameOk=0;
		int numOfUnknownOptions=0;
		int enable=1; //does the user want us to enabled or disabled the filter
		int lastFilter; //no ',' followed this filter, p points to the final 0

		/* the list is scanned below even if no unknown option was stored */
		options[0]= NULL;

		/* Cut the next filter out by hand instead of with strtok(): we have
		   to know where it really starts (leading ',' are skipped) and
		   whether a ',' followed it, so that p never leaves the string. */
		p+= strspn(p, filterDelimiters);
		if(*p == 0) break;
		filterToken= p;
		p+= strcspn(p, filterDelimiters);
		lastFilter= (*p == 0);
		if(!lastFilter) *p++= 0;

		filterName= strtok(filterToken, optionDelimiters);
		if(filterName == NULL)
		{
			/* the token consisted of ':' only */
			ppMode.error++;
			continue;
		}
		printf("%s::%s\n", filterToken, filterName);

		if(*filterName == '-')
		{
			enable=0;
			filterName++;
		}
		for(;;){ //for all options
			option= strtok(NULL, optionDelimiters);
			if(option == NULL) break;

			printf("%s\n", option);
			if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
			else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
			else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
			else
			{
				/* keep it for the filter specific check below, NULL terminated */
				options[numOfUnknownOptions] = option;
				numOfUnknownOptions++;
				options[numOfUnknownOptions] = NULL;
			}
			if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
		}

		/* replace stuff from the replace Table */
		for(i=0; replaceTable[2*i]!=NULL; i++)
		{
			if(!strcmp(replaceTable[2*i], filterName))
			{
				int newlen= strlen(replaceTable[2*i + 1]);
				int plen;
				int used;

				/* not the last filter: turn the terminator written above back
				   into a ',' so that the rest of the list follows the
				   inserted text; for the last filter p is the end of string */
				if(!lastFilter) p--, *p=',';

				plen= strlen(p);
				used= (int)(p - temp) + plen;
				if(used + newlen >= GET_MODE_BUFFER_SIZE)
				{
					ppMode.error++;
					break;
				}
				memmove(p + newlen, p, plen+1);
				memcpy(p, replaceTable[2*i + 1], newlen);
				filterNameOk=1;
				/* p has been rewound once; a second match must not do it again */
				break;
			}
		}

		for(i=0; filters[i].shortName!=NULL; i++)
		{
			if(   !strcmp(filters[i].longName, filterName)
			   || !strcmp(filters[i].shortName, filterName))
			{
				/* always clear first, so a later entry overrides an earlier one */
				ppMode.lumMode &= ~filters[i].mask;
				ppMode.chromMode &= ~filters[i].mask;

				filterNameOk=1;
				if(!enable) break; // user wants to disable it

				if(q >= filters[i].minLumQuality)
					ppMode.lumMode|= filters[i].mask;
				if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
					if(q >= filters[i].minChromQuality)
						ppMode.chromMode|= filters[i].mask;

				if(filters[i].mask == LEVEL_FIX)
				{
					int o;
					ppMode.minAllowedY= 16;
					ppMode.maxAllowedY= 234;
					for(o=0; options[o]!=NULL; o++)
						if(  !strcmp(options[o],"fullyrange")
						   ||!strcmp(options[o],"f"))
						{
							ppMode.minAllowedY= 0;
							ppMode.maxAllowedY= 255;
							/* this one was known after all */
							numOfUnknownOptions--;
						}
				}
			}
		}
		if(!filterNameOk) ppMode.error++;
		ppMode.error += numOfUnknownOptions;
	}

	/* mirror the result into the flag set of the old postprocessing code */
	if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
	if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
	if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
	if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
	if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
	if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;

	return ppMode;
}

/**
 * Postprocesses a picture made of three planes.
 * Plane 0 is filtered with the given sizes and strides; planes 1 and 2 are
 * filtered with width, height and both strides halved. QP_store and QP_stride
 * are passed on unchanged for all three planes.
 * mode is a set of flag bits. For planes 1 and 2 bits 4..7 are moved down to
 * bits 0..3 while bits 8 and up are kept (presumably the chrominance flags
 * live one nibble above the matching luminance flags - confirm against the
 * flag definitions).
 * If HAVE_ODIVX_POSTPROCESS is defined and use_old_pp is set, the whole job is
 * handed to odivx_postprocess() instead.
 */
void  postprocess(unsigned char * src[], int src_stride,
                 unsigned char * dst[], int dst_stride,
                 int horizontal_size,   int vertical_size,
                 QP_STORE_T *QP_store,  int QP_stride,
					  int mode)
{
#ifdef HAVE_ODIVX_POSTPROCESS
	/* dispatched here rather than by the caller to save one function call */
	if(use_old_pp){
	    odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
	    return;
	}
#endif

	postProcess(src[0], src_stride, dst[0], dst_stride,
		horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);

	horizontal_size >>= 1;
	vertical_size   >>= 1;
	src_stride      >>= 1;
	dst_stride      >>= 1;
	mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);

	postProcess(src[1], src_stride, dst[1], dst_stride,
		horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
	postProcess(src[2], src_stride, dst[2], dst_stride,
		horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
}

/**
 * Same job as postprocess(), but the filters are selected by a PPMode:
 * plane 0 is filtered with mode->lumMode, planes 1 and 2 with
 * mode->chromMode and with width, height and both strides halved.
 * If HAVE_ODIVX_POSTPROCESS is defined and use_old_pp is set, the whole job is
 * handed to odivx_postprocess() with mode->oldMode instead.
 */
void  postprocess2(unsigned char * src[], int src_stride,
                 unsigned char * dst[], int dst_stride,
                 int horizontal_size,   int vertical_size,
                 QP_STORE_T *QP_store,  int QP_stride,
		 struct PPMode *mode)
{
	/* geometry of the two half-size planes */
	const int chromWidth=     horizontal_size>>1;
	const int chromHeight=    vertical_size  >>1;
	const int chromSrcStride= src_stride     >>1;
	const int chromDstStride= dst_stride     >>1;
	int plane;

#ifdef HAVE_ODIVX_POSTPROCESS
	/* dispatched here rather than by the caller to save one function call */
	if(use_old_pp){
	    odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
	    mode->oldMode);
	    return;
	}
#endif

	postProcess(src[0], src_stride, dst[0], dst_stride,
		horizontal_size, vertical_size, QP_store, QP_stride, 0, mode->lumMode);

	for(plane=1; plane<=2; plane++)
		postProcess(src[plane], chromSrcStride, dst[plane], chromDstStride,
			chromWidth, chromHeight, QP_store, QP_stride, plane, mode->chromMode);
}


/**
 * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
 * 0 <= quality <= GET_PP_QUALITY_MAX; values outside of that range are
 * clamped to the nearest valid quality instead of indexing past the tables.
 * Returns the flags of the old OpenDivX code (PP_*) if HAVE_ODIVX_POSTPROCESS
 * is defined and use_old_pp is set, the LUM_* / CHROM_* flags otherwise.
 */
int getPpModeForQuality(int quality){
	/* one entry per quality level, every level adds one more filter */
	static const int modes[1+GET_PP_QUALITY_MAX]= {
		0,
#if 1
		// horizontal filters first
		LUM_H_DEBLOCK,
		LUM_H_DEBLOCK | LUM_V_DEBLOCK,
		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
#else
		// vertical filters first
		LUM_V_DEBLOCK,
		LUM_V_DEBLOCK | LUM_H_DEBLOCK,
		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
#endif
	};

#ifdef HAVE_ODIVX_POSTPROCESS
	static const int odivx_modes[1+GET_PP_QUALITY_MAX]= {
		0,
		PP_DEBLOCK_Y_H,
		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
	};
#endif

	/* the tables have exactly 1+GET_PP_QUALITY_MAX entries */
	if(quality < 0) quality= 0;
	if(quality > GET_PP_QUALITY_MAX) quality= GET_PP_QUALITY_MAX;

#ifdef HAVE_ODIVX_POSTPROCESS
	if(use_old_pp) return odivx_modes[quality];
#endif
	return modes[quality];
}

/**
 * Copies a block from src to dst and fixes the blacklevel
 * numLines must be a multiple of 4
 * levelFix == 0 -> dont touch the brightness & contrast
 *
 * Every line is 8 bytes wide in the MMX paths and BLOCK_SIZE bytes wide in
 * the C paths.
 * MMX, levelFix != 0: each byte is widened to 16 bit, packedYOffset is
 *   subtracted, the result is shifted left by 6, multiplied with packedYScale
 *   (high 16 bits of the product kept) and packed back with unsigned
 *   saturation.
 *   NOTE(review): this path is unrolled for exactly 8 lines and does not
 *   look at numLines.
 * MMX, levelFix == 0: plain copy, 4 lines per loop iteration, numLines>>2
 *   iterations; the loop counter lives in the global temp0.
 * Without MMX: plain memcpy() per line in both cases.
 *   NOTE(review): levelFix has no effect in the C fallback.
 */
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
	int numLines, int levelFix)
{
#ifndef HAVE_MMX
	int i;
#endif
	if(levelFix)
	{
#ifdef HAVE_MMX
					/* %0 = src and %1 = dst are advanced by the asm itself, hence "+r";
					   eax / ebx hold 2*srcStride / 2*dstStride, mm4 is the zero used
					   for unpacking. Each SCALED_CPY handles 2 lines and already
					   advances src; dst is advanced between the expansions. */
					asm volatile(
						"leal (%2,%2), %%eax	\n\t"
						"leal (%3,%3), %%ebx	\n\t"
						"movq packedYOffset, %%mm2	\n\t"
						"movq packedYScale, %%mm3	\n\t"
						"pxor %%mm4, %%mm4	\n\t"

#define SCALED_CPY					\
						"movq (%0), %%mm0	\n\t"\
						"movq (%0), %%mm5	\n\t"\
						"punpcklbw %%mm4, %%mm0 \n\t"\
						"punpckhbw %%mm4, %%mm5 \n\t"\
						"psubw %%mm2, %%mm0	\n\t"\
						"psubw %%mm2, %%mm5	\n\t"\
						"movq (%0,%2), %%mm1	\n\t"\
						"psllw $6, %%mm0	\n\t"\
						"psllw $6, %%mm5	\n\t"\
						"pmulhw %%mm3, %%mm0	\n\t"\
						"movq (%0,%2), %%mm6	\n\t"\
						"pmulhw %%mm3, %%mm5	\n\t"\
						"punpcklbw %%mm4, %%mm1 \n\t"\
						"punpckhbw %%mm4, %%mm6 \n\t"\
						"psubw %%mm2, %%mm1	\n\t"\
						"psubw %%mm2, %%mm6	\n\t"\
						"psllw $6, %%mm1	\n\t"\
						"psllw $6, %%mm6	\n\t"\
						"pmulhw %%mm3, %%mm1	\n\t"\
						"pmulhw %%mm3, %%mm6	\n\t"\
						"addl %%eax, %0		\n\t"\
						"packuswb %%mm5, %%mm0	\n\t"\
						"packuswb %%mm6, %%mm1	\n\t"\
						"movq %%mm0, (%1)	\n\t"\
						"movq %%mm1, (%1, %3)	\n\t"\

SCALED_CPY
						"addl %%ebx, %1		\n\t"
SCALED_CPY
						"addl %%ebx, %1		\n\t"
SCALED_CPY
						"addl %%ebx, %1		\n\t"
SCALED_CPY

						: "+r"(src),
						"+r"(dst)
						:"r" (srcStride),
						"r" (dstStride)
						: "%eax", "%ebx"
					);
#else
				/* C fallback: unscaled copy, one line at a time */
				for(i=0; i<numLines; i++)
					memcpy(	&(dst[dstStride*i]),
						&(src[srcStride*i]), BLOCK_SIZE);
#endif
	}
	else
	{
#ifdef HAVE_MMX
					/* %0 = src and %1 = dst are input-only operands here, so they are
					   saved with pushl and restored with popl around the loop that
					   advances them. %4 (numLines>>2) is copied into temp0 before the
					   pushes and counted down there. mm2 / mm3 are loaded but not used
					   by SIMPLE_CPY. */
					asm volatile(
						"movl %4, %%eax \n\t"
						"movl %%eax, temp0\n\t"
						"pushl %0 \n\t"
						"pushl %1 \n\t"
						"leal (%2,%2), %%eax	\n\t"
						"leal (%3,%3), %%ebx	\n\t"
						"movq packedYOffset, %%mm2	\n\t"
						"movq packedYScale, %%mm3	\n\t"

#define SIMPLE_CPY					\
						"movq (%0), %%mm0	\n\t"\
						"movq (%0,%2), %%mm1	\n\t"\
						"movq %%mm0, (%1)	\n\t"\
						"movq %%mm1, (%1, %3)	\n\t"\

						"1:			\n\t"
SIMPLE_CPY
						"addl %%eax, %0		\n\t"
						"addl %%ebx, %1		\n\t"
SIMPLE_CPY
						"addl %%eax, %0		\n\t"
						"addl %%ebx, %1		\n\t"
						"decl temp0		\n\t"
						"jnz 1b			\n\t"

						"popl %1 \n\t"
						"popl %0 \n\t"
						: : "r" (src),
						"r" (dst),
						"r" (srcStride),
						"r" (dstStride),
						"m" (numLines>>2)
						: "%eax", "%ebx"
					);
#else
				/* C fallback: copy one line at a time */
				for(i=0; i<numLines; i++)
					memcpy(	&(dst[dstStride*i]),
						&(src[srcStride*i]), BLOCK_SIZE);
#endif
	}
}


/**
 * Filters array of bytes (Y or U or V values)
 */
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
	QP_STORE_T QPs[], int QPStride, int isColor, int mode)
{
	int x,y;
	/* we need 64bit here otherwise well going to have a problem
	   after watching a black picture for 5 hours*/
	static uint64_t *yHistogram= NULL;
	int black=0, white=255; // blackest black and whitest white in the picture
	int QPCorrecture= 256;

	/* Temporary buffers for handling the last row(s) */
	static uint8_t *tempDst= NULL;
	static uint8_t *tempSrc= NULL;

	/* Temporary buffers for handling the last block */
	static uint8_t *tempDstBlock= NULL;
	static uint8_t *tempSrcBlock= NULL;

#ifdef PP_FUNNY_STRIDE
	uint8_t *dstBlockPtrBackup;
	uint8_t *srcBlockPtrBackup;
#endif

#ifdef MORE_TIMING
	long long T0, T1, diffTime=0;
#endif
#ifdef TIMING
	long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
	sumTime= rdtsc();
#endif

	if(tempDst==NULL)
	{
		tempDst= (uint8_t*)memalign(8, 1024*24);
		tempSrc= (uint8_t*)memalign(8, 1024*24);
		tempDstBlock= (uint8_t*)memalign(8, 1024*24);
		tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
	}

	if(!yHistogram)
	{
		int i;
		yHistogram= (uint64_t*)malloc(8*256);
		for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;

		if(mode & FULL_Y_RANGE)
		{
			maxAllowedY=255;
			minAllowedY=0;
		}
	}

	if(!isColor)
	{
		uint64_t sum= 0;
		int i;
		static int framenum= -1;
		uint64_t maxClipped;
		uint64_t clipped;
		double scale;

		framenum++;
		if(framenum == 1) yHistogram[0]= width*height/64*15/256;

		for(i=0; i<256; i++)
		{
			sum+= yHistogram[i];
//			printf("%d ", yHistogram[i]);
		}
//		printf("\n\n");

		/* we allways get a completly black picture first */
		maxClipped= (uint64_t)(sum * maxClippedThreshold);

		clipped= sum;
		for(black=255; black>0; black--)
		{
			if(clipped < maxClipped) break;
			clipped-= yHistogram[black];
		}

		clipped= sum;
		for(white=0; white<256; white++)
		{
			if(clipped < maxClipped) break;
			clipped-= yHistogram[white];
		}

		packedYOffset= (black - minAllowedY) & 0xFFFF;
		packedYOffset|= packedYOffset<<32;
		packedYOffset|= packedYOffset<<16;

		scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);

		packedYScale= (uint16_t)(scale*1024.0 + 0.5);
		packedYScale|= packedYScale<<32;
		packedYScale|= packedYScale<<16;
	}
	else
	{
		packedYScale= 0x0100010001000100LL;
		packedYOffset= 0;
	}

	if(mode & LEVEL_FIX)	QPCorrecture= packedYScale &0xFFFF;
	else			QPCorrecture= 256;

	/* copy first row of 8x8 blocks */
	for(x=0; x<width; x+=BLOCK_SIZE)
		blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);