Skip to content
Snippets Groups Projects
postprocess.c 105 KiB
Newer Older
  • Learn to ignore specific revisions
  • Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif // MMX
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    	src+= 4*stride;
    
    	for(x=0; x<8; x++)
    	{
    		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
    		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
    		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
    		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
    		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
    		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
    		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
    		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
    		src++;
    	}
    #endif
    }
    
    
    /**
     * Transposes the 8x8 byte block at src and stores it, shifted, into dst1 and dst2.
     *
     * Source row k (k = 0..7) is read as 8 bytes from src + k*srcStride.
     * Each source column c becomes one 8-byte row in the destination; destination
     * rows are 16 bytes apart:
     *   columns 0..4 -> dst1 + 128 + 16*c
     *   columns 3..7 -> dst2 + 16*c
     * (columns 3 and 4 are therefore written to both buffers).
     *
     * mm0-mm4 and eax/ebx are used as scratch; no emms is issued here.
     * The "memory" clobber is required because the buffers are passed only as
     * plain register inputs: without it the compiler is not told that the asm
     * reads src and writes dst1/dst2, and may cache or reorder accesses to them.
     */
    static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
    {
    	asm(
    		"leal (%0, %1), %%eax				\n\t"
    		"leal (%%eax, %1, 4), %%ebx			\n\t"
    //	0	1	2	3	4	5	6	7	8	9
    //	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
    		// rows 0..3: interleave bytes, then words, giving one column (4 bytes) per dword
    		"movq (%0), %%mm0		\n\t" // 12345678
    		"movq (%%eax), %%mm1		\n\t" // abcdefgh
    		"movq %%mm0, %%mm2		\n\t" // 12345678
    		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
    		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
    
    		"movq (%%eax, %1), %%mm1	\n\t"
    		"movq (%%eax, %1, 2), %%mm3	\n\t"
    		"movq %%mm1, %%mm4		\n\t"
    		"punpcklbw %%mm3, %%mm1		\n\t"
    		"punpckhbw %%mm3, %%mm4		\n\t"
    
    		"movq %%mm0, %%mm3		\n\t"
    		"punpcklwd %%mm1, %%mm0		\n\t" // columns 0, 1
    		"punpckhwd %%mm1, %%mm3		\n\t" // columns 2, 3
    		"movq %%mm2, %%mm1		\n\t"
    		"punpcklwd %%mm4, %%mm2		\n\t" // columns 4, 5
    		"punpckhwd %%mm4, %%mm1		\n\t" // columns 6, 7
    
    		// first 4 bytes of every destination row
    		"movd %%mm0, 128(%2)		\n\t"
    		"psrlq $32, %%mm0		\n\t"
    		"movd %%mm0, 144(%2)		\n\t"
    		"movd %%mm3, 160(%2)		\n\t"
    		"psrlq $32, %%mm3		\n\t"
    		"movd %%mm3, 176(%2)		\n\t"
    		"movd %%mm3, 48(%3)		\n\t"
    		"movd %%mm2, 192(%2)		\n\t"
    		"movd %%mm2, 64(%3)		\n\t"
    		"psrlq $32, %%mm2		\n\t"
    		"movd %%mm2, 80(%3)		\n\t"
    		"movd %%mm1, 96(%3)		\n\t"
    		"psrlq $32, %%mm1		\n\t"
    		"movd %%mm1, 112(%3)		\n\t"
    
    		// rows 4..7: same shuffle, stored 4 bytes further into every destination row
    		"movq (%0, %1, 4), %%mm0	\n\t" // 12345678
    		"movq (%%ebx), %%mm1		\n\t" // abcdefgh
    		"movq %%mm0, %%mm2		\n\t" // 12345678
    		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
    		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
    
    		"movq (%%ebx, %1), %%mm1	\n\t"
    		"movq (%%ebx, %1, 2), %%mm3	\n\t"
    		"movq %%mm1, %%mm4		\n\t"
    		"punpcklbw %%mm3, %%mm1		\n\t"
    		"punpckhbw %%mm3, %%mm4		\n\t"
    
    		"movq %%mm0, %%mm3		\n\t"
    		"punpcklwd %%mm1, %%mm0		\n\t"
    		"punpckhwd %%mm1, %%mm3		\n\t"
    		"movq %%mm2, %%mm1		\n\t"
    		"punpcklwd %%mm4, %%mm2		\n\t"
    		"punpckhwd %%mm4, %%mm1		\n\t"
    
    		"movd %%mm0, 132(%2)		\n\t"
    		"psrlq $32, %%mm0		\n\t"
    		"movd %%mm0, 148(%2)		\n\t"
    		"movd %%mm3, 164(%2)		\n\t"
    		"psrlq $32, %%mm3		\n\t"
    		"movd %%mm3, 180(%2)		\n\t"
    		"movd %%mm3, 52(%3)		\n\t"
    		"movd %%mm2, 196(%2)		\n\t"
    		"movd %%mm2, 68(%3)		\n\t"
    		"psrlq $32, %%mm2		\n\t"
    		"movd %%mm2, 84(%3)		\n\t"
    		"movd %%mm1, 100(%3)		\n\t"
    		"psrlq $32, %%mm1		\n\t"
    		"movd %%mm1, 116(%3)		\n\t"
    
    
    	:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
    	: "%eax", "%ebx", "memory"
    	);
    }
    
    /**
     * Transposes an 8x8 byte block from a 16-byte-pitch buffer into dst.
     *
     * Source row k (k = 0..7) is read as 8 bytes from src + 16*k.
     * The result is written with a row pitch of dstStride, i.e.
     *   dst[r*dstStride + c] = src[16*c + r]   for r, c = 0..7.
     *
     * mm0-mm4 and eax/ebx are used as scratch; no emms is issued here.
     * The "memory" clobber is required because the buffers are passed only as
     * plain register inputs: without it the compiler is not told that the asm
     * reads src and writes dst, and may cache or reorder accesses to them.
     */
    static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
    {
    	asm(
    		"leal (%0, %1), %%eax				\n\t"
    		"leal (%%eax, %1, 4), %%ebx			\n\t"
    //	0	1	2	3	4	5	6	7	8	9
    //	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
    		// source rows 0..3 -> bytes 0..3 of every destination row
    		"movq (%2), %%mm0		\n\t" // 12345678
    		"movq 16(%2), %%mm1		\n\t" // abcdefgh
    		"movq %%mm0, %%mm2		\n\t" // 12345678
    		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
    		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
    
    		"movq 32(%2), %%mm1		\n\t"
    		"movq 48(%2), %%mm3		\n\t"
    		"movq %%mm1, %%mm4		\n\t"
    		"punpcklbw %%mm3, %%mm1		\n\t"
    		"punpckhbw %%mm3, %%mm4		\n\t"
    
    		"movq %%mm0, %%mm3		\n\t"
    		"punpcklwd %%mm1, %%mm0		\n\t" // columns 0, 1
    		"punpckhwd %%mm1, %%mm3		\n\t" // columns 2, 3
    		"movq %%mm2, %%mm1		\n\t"
    		"punpcklwd %%mm4, %%mm2		\n\t" // columns 4, 5
    		"punpckhwd %%mm4, %%mm1		\n\t" // columns 6, 7
    
    		"movd %%mm0, (%0)		\n\t"
    		"psrlq $32, %%mm0		\n\t"
    		"movd %%mm0, (%%eax)		\n\t"
    		"movd %%mm3, (%%eax, %1)	\n\t"
    		"psrlq $32, %%mm3		\n\t"
    		"movd %%mm3, (%%eax, %1, 2)	\n\t"
    		"movd %%mm2, (%0, %1, 4)	\n\t"
    		"psrlq $32, %%mm2		\n\t"
    		"movd %%mm2, (%%ebx)		\n\t"
    		"movd %%mm1, (%%ebx, %1)	\n\t"
    		"psrlq $32, %%mm1		\n\t"
    		"movd %%mm1, (%%ebx, %1, 2)	\n\t"
    
    
    		// source rows 4..7 -> bytes 4..7 of every destination row
    		"movq 64(%2), %%mm0		\n\t" // 12345678
    		"movq 80(%2), %%mm1		\n\t" // abcdefgh
    		"movq %%mm0, %%mm2		\n\t" // 12345678
    		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
    		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
    
    		"movq 96(%2), %%mm1		\n\t"
    		"movq 112(%2), %%mm3		\n\t"
    		"movq %%mm1, %%mm4		\n\t"
    		"punpcklbw %%mm3, %%mm1		\n\t"
    		"punpckhbw %%mm3, %%mm4		\n\t"
    
    		"movq %%mm0, %%mm3		\n\t"
    		"punpcklwd %%mm1, %%mm0		\n\t"
    		"punpckhwd %%mm1, %%mm3		\n\t"
    		"movq %%mm2, %%mm1		\n\t"
    		"punpcklwd %%mm4, %%mm2		\n\t"
    		"punpckhwd %%mm4, %%mm1		\n\t"
    
    		"movd %%mm0, 4(%0)		\n\t"
    		"psrlq $32, %%mm0		\n\t"
    		"movd %%mm0, 4(%%eax)		\n\t"
    		"movd %%mm3, 4(%%eax, %1)	\n\t"
    		"psrlq $32, %%mm3		\n\t"
    		"movd %%mm3, 4(%%eax, %1, 2)	\n\t"
    		"movd %%mm2, 4(%0, %1, 4)	\n\t"
    		"psrlq $32, %%mm2		\n\t"
    		"movd %%mm2, 4(%%ebx)		\n\t"
    		"movd %%mm1, 4(%%ebx, %1)	\n\t"
    		"psrlq $32, %%mm1		\n\t"
    		"movd %%mm1, 4(%%ebx, %1, 2)	\n\t"
    
    	:: "r" (dst), "r" (dstStride), "r" (src)
    	: "%eax", "%ebx", "memory"
    	);
    }
    
    static void inline tempNoiseReducer(uint8_t *src, int stride,
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
    
    #define FAST_L2_DIFF
    //#define L1_DIFF //u should change the thresholds too if u try that one
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    	asm volatile(
    		"leal (%2, %2, 2), %%eax			\n\t" // 3*stride
    		"leal (%2, %2, 4), %%ebx			\n\t" // 5*stride
    		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
    //	0	1	2	3	4	5	6	7	8	9
    //	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+ebx	%x+2eax	%x+ecx	%x+8%2
    //FIXME reorder?
    #ifdef L1_DIFF //needs mmx2
    		"movq (%0), %%mm0				\n\t" // L0
    		"psadbw (%1), %%mm0				\n\t" // |L0-R0|
    		"movq (%0, %2), %%mm1				\n\t" // L1
    		"psadbw (%1, %2), %%mm1				\n\t" // |L1-R1|
    		"movq (%0, %2, 2), %%mm2			\n\t" // L2
    		"psadbw (%1, %2, 2), %%mm2			\n\t" // |L2-R2|
    		"movq (%0, %%eax), %%mm3			\n\t" // L3
    		"psadbw (%1, %%eax), %%mm3			\n\t" // |L3-R3|
    
    		"movq (%0, %2, 4), %%mm4			\n\t" // L4
    		"paddw %%mm1, %%mm0				\n\t"
    		"psadbw (%1, %2, 4), %%mm4			\n\t" // |L4-R4|
    		"movq (%0, %%ebx), %%mm5			\n\t" // L5
    		"paddw %%mm2, %%mm0				\n\t"
    		"psadbw (%1, %%ebx), %%mm5			\n\t" // |L5-R5|
    		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
    		"paddw %%mm3, %%mm0				\n\t"
    		"psadbw (%1, %%eax, 2), %%mm6			\n\t" // |L6-R6|
    		"movq (%0, %%ecx), %%mm7			\n\t" // L7
    		"paddw %%mm4, %%mm0				\n\t"
    		"psadbw (%1, %%ecx), %%mm7			\n\t" // |L7-R7|
    		"paddw %%mm5, %%mm6				\n\t"
    		"paddw %%mm7, %%mm6				\n\t"
    		"paddw %%mm6, %%mm0				\n\t"
    #elif defined (FAST_L2_DIFF)
    		"pcmpeqb %%mm7, %%mm7				\n\t"
    		"movq b80, %%mm6				\n\t"
    		"pxor %%mm0, %%mm0				\n\t"
    #define L2_DIFF_CORE(a, b)\
    		"movq " #a ", %%mm5				\n\t"\
    		"movq " #b ", %%mm2				\n\t"\
    		"pxor %%mm7, %%mm2				\n\t"\
    		PAVGB(%%mm2, %%mm5)\
    		"paddb %%mm6, %%mm5				\n\t"\
    		"movq %%mm5, %%mm2				\n\t"\
    		"psllw $8, %%mm5				\n\t"\
    		"pmaddwd %%mm5, %%mm5				\n\t"\
    		"pmaddwd %%mm2, %%mm2				\n\t"\
    		"paddd %%mm2, %%mm5				\n\t"\
    		"psrld $14, %%mm5				\n\t"\
    		"paddd %%mm5, %%mm0				\n\t"
    
    L2_DIFF_CORE((%0), (%1))
    L2_DIFF_CORE((%0, %2), (%1, %2))
    L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
    L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
    L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
    L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
    L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
    L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
    
    #else
    		"pxor %%mm7, %%mm7				\n\t"
    		"pxor %%mm0, %%mm0				\n\t"
    #define L2_DIFF_CORE(a, b)\
    		"movq " #a ", %%mm5				\n\t"\
    		"movq " #b ", %%mm2				\n\t"\
    		"movq %%mm5, %%mm1				\n\t"\
    		"movq %%mm2, %%mm3				\n\t"\
    		"punpcklbw %%mm7, %%mm5				\n\t"\
    		"punpckhbw %%mm7, %%mm1				\n\t"\
    		"punpcklbw %%mm7, %%mm2				\n\t"\
    		"punpckhbw %%mm7, %%mm3				\n\t"\
    		"psubw %%mm2, %%mm5				\n\t"\
    		"psubw %%mm3, %%mm1				\n\t"\
    		"pmaddwd %%mm5, %%mm5				\n\t"\
    		"pmaddwd %%mm1, %%mm1				\n\t"\
    		"paddd %%mm1, %%mm5				\n\t"\
    		"paddd %%mm5, %%mm0				\n\t"
    
    L2_DIFF_CORE((%0), (%1))
    L2_DIFF_CORE((%0, %2), (%1, %2))
    L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
    L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
    L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
    L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx))
    L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
    L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
    
    #endif
    
    		"movq %%mm0, %%mm4				\n\t"
    		"psrlq $32, %%mm0				\n\t"
    		"paddd %%mm0, %%mm4				\n\t"
    		"movd %%mm4, %%ecx				\n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    		"shll $2, %%ecx					\n\t"
    		"movl %3, %%ebx					\n\t"
    		"addl -4(%%ebx), %%ecx				\n\t"
    		"addl 4(%%ebx), %%ecx				\n\t"
    		"addl -1024(%%ebx), %%ecx			\n\t"
    		"addl $4, %%ecx					\n\t"
    		"addl 1024(%%ebx), %%ecx			\n\t"
    		"shrl $3, %%ecx					\n\t"
    		"movl %%ecx, (%%ebx)				\n\t"
    		"leal (%%eax, %2, 2), %%ebx			\n\t" // 5*stride
    
    
    //		"movl %3, %%ecx				\n\t"
    //		"movl %%ecx, test				\n\t"
    //		"jmp 4f \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    		"cmpl 4+maxTmpNoise, %%ecx			\n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    		"cmpl 8+maxTmpNoise, %%ecx			\n\t"
    
    		" jb 1f						\n\t"
    
    		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
    		"movq (%0), %%mm0				\n\t" // L0
    		"movq (%0, %2), %%mm1				\n\t" // L1
    		"movq (%0, %2, 2), %%mm2			\n\t" // L2
    		"movq (%0, %%eax), %%mm3			\n\t" // L3
    		"movq (%0, %2, 4), %%mm4			\n\t" // L4
    		"movq (%0, %%ebx), %%mm5			\n\t" // L5
    		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
    		"movq (%0, %%ecx), %%mm7			\n\t" // L7
    		"movq %%mm0, (%1)				\n\t" // L0
    		"movq %%mm1, (%1, %2)				\n\t" // L1
    		"movq %%mm2, (%1, %2, 2)			\n\t" // L2
    		"movq %%mm3, (%1, %%eax)			\n\t" // L3
    		"movq %%mm4, (%1, %2, 4)			\n\t" // L4
    		"movq %%mm5, (%1, %%ebx)			\n\t" // L5
    		"movq %%mm6, (%1, %%eax, 2)			\n\t" // L6
    		"movq %%mm7, (%1, %%ecx)			\n\t" // L7
    		"jmp 4f						\n\t"
    
    		"1:						\n\t"
    		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
    		"movq (%0), %%mm0				\n\t" // L0
    		"pavgb (%1), %%mm0				\n\t" // L0
    		"movq (%0, %2), %%mm1				\n\t" // L1
    		"pavgb (%1, %2), %%mm1				\n\t" // L1
    		"movq (%0, %2, 2), %%mm2			\n\t" // L2
    		"pavgb (%1, %2, 2), %%mm2			\n\t" // L2
    		"movq (%0, %%eax), %%mm3			\n\t" // L3
    		"pavgb (%1, %%eax), %%mm3			\n\t" // L3
    		"movq (%0, %2, 4), %%mm4			\n\t" // L4
    		"pavgb (%1, %2, 4), %%mm4			\n\t" // L4
    		"movq (%0, %%ebx), %%mm5			\n\t" // L5
    		"pavgb (%1, %%ebx), %%mm5			\n\t" // L5
    		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
    		"pavgb (%1, %%eax, 2), %%mm6			\n\t" // L6
    		"movq (%0, %%ecx), %%mm7			\n\t" // L7
    		"pavgb (%1, %%ecx), %%mm7			\n\t" // L7
    		"movq %%mm0, (%1)				\n\t" // R0
    		"movq %%mm1, (%1, %2)				\n\t" // R1
    		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
    		"movq %%mm3, (%1, %%eax)			\n\t" // R3
    		"movq %%mm4, (%1, %2, 4)			\n\t" // R4
    		"movq %%mm5, (%1, %%ebx)			\n\t" // R5
    		"movq %%mm6, (%1, %%eax, 2)			\n\t" // R6
    		"movq %%mm7, (%1, %%ecx)			\n\t" // R7
    		"movq %%mm0, (%0)				\n\t" // L0
    		"movq %%mm1, (%0, %2)				\n\t" // L1
    		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
    		"movq %%mm3, (%0, %%eax)			\n\t" // L3
    		"movq %%mm4, (%0, %2, 4)			\n\t" // L4
    		"movq %%mm5, (%0, %%ebx)			\n\t" // L5
    		"movq %%mm6, (%0, %%eax, 2)			\n\t" // L6
    		"movq %%mm7, (%0, %%ecx)			\n\t" // L7
    		"jmp 4f						\n\t"
    
    		"2:						\n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    		"cmpl maxTmpNoise, %%ecx			\n\t"
    
    		" jb 3f						\n\t"
    
    		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
    		"movq (%0), %%mm0				\n\t" // L0
    		"movq (%0, %2), %%mm1				\n\t" // L1
    		"movq (%0, %2, 2), %%mm2			\n\t" // L2
    		"movq (%0, %%eax), %%mm3			\n\t" // L3
    		"movq (%1), %%mm4				\n\t" // R0
    		"movq (%1, %2), %%mm5				\n\t" // R1
    		"movq (%1, %2, 2), %%mm6			\n\t" // R2
    		"movq (%1, %%eax), %%mm7			\n\t" // R3
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		"movq %%mm0, (%1)				\n\t" // R0
    		"movq %%mm1, (%1, %2)				\n\t" // R1
    		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
    		"movq %%mm3, (%1, %%eax)			\n\t" // R3
    		"movq %%mm0, (%0)				\n\t" // L0
    		"movq %%mm1, (%0, %2)				\n\t" // L1
    		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
    		"movq %%mm3, (%0, %%eax)			\n\t" // L3
    
    		"movq (%0, %2, 4), %%mm0			\n\t" // L4
    		"movq (%0, %%ebx), %%mm1			\n\t" // L5
    		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
    		"movq (%0, %%ecx), %%mm3			\n\t" // L7
    		"movq (%1, %2, 4), %%mm4			\n\t" // R4
    		"movq (%1, %%ebx), %%mm5			\n\t" // R5
    		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
    		"movq (%1, %%ecx), %%mm7			\n\t" // R7
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
    		"movq %%mm1, (%1, %%ebx)			\n\t" // R5
    		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
    		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
    		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
    		"movq %%mm1, (%0, %%ebx)			\n\t" // L5
    		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
    		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
    		"jmp 4f						\n\t"
    
    		"3:						\n\t"
    		"leal (%%ebx, %2, 2), %%ecx			\n\t" // 7*stride
    		"movq (%0), %%mm0				\n\t" // L0
    		"movq (%0, %2), %%mm1				\n\t" // L1
    		"movq (%0, %2, 2), %%mm2			\n\t" // L2
    		"movq (%0, %%eax), %%mm3			\n\t" // L3
    		"movq (%1), %%mm4				\n\t" // R0
    		"movq (%1, %2), %%mm5				\n\t" // R1
    		"movq (%1, %2, 2), %%mm6			\n\t" // R2
    		"movq (%1, %%eax), %%mm7			\n\t" // R3
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		"movq %%mm0, (%1)				\n\t" // R0
    		"movq %%mm1, (%1, %2)				\n\t" // R1
    		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
    		"movq %%mm3, (%1, %%eax)			\n\t" // R3
    		"movq %%mm0, (%0)				\n\t" // L0
    		"movq %%mm1, (%0, %2)				\n\t" // L1
    		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
    		"movq %%mm3, (%0, %%eax)			\n\t" // L3
    
    		"movq (%0, %2, 4), %%mm0			\n\t" // L4
    		"movq (%0, %%ebx), %%mm1			\n\t" // L5
    		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
    		"movq (%0, %%ecx), %%mm3			\n\t" // L7
    		"movq (%1, %2, 4), %%mm4			\n\t" // R4
    		"movq (%1, %%ebx), %%mm5			\n\t" // R5
    		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
    		"movq (%1, %%ecx), %%mm7			\n\t" // R7
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		PAVGB(%%mm4, %%mm0)
    		PAVGB(%%mm5, %%mm1)
    		PAVGB(%%mm6, %%mm2)
    		PAVGB(%%mm7, %%mm3)
    		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
    		"movq %%mm1, (%1, %%ebx)			\n\t" // R5
    		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
    		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
    		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
    		"movq %%mm1, (%0, %%ebx)			\n\t" // L5
    		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
    		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
    
    		"4:						\n\t"
    
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
    
    		: "%eax", "%ebx", "%ecx", "memory"
    		);
    //printf("%d\n", test);
    #else
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    	int i;
    
    
    	for(y=0; y<8; y++)
    	{
    		int x;
    		for(x=0; x<8; x++)
    		{
    			int ref= tempBlured[ x + y*stride ];
    			int cur= src[ x + y*stride ];
    			int d1=ref - cur;
    
    //			if(x==0 || x==7) d1+= d1>>1;
    //			if(y==0 || y==7) d1+= d1>>1;
    //			d+= ABS(d1);
    			d+= d1*d1;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    	i=d;
    	d= 	(
    		4*d
    		+(*(tempBluredPast-256))
    		+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
    		+(*(tempBluredPast+256))
    		+4)>>3;
    	*tempBluredPast=i;
    //	((*tempBluredPast)*3 + d + 2)>>2;
    
    
    //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
    /*
    Switch between
     1  0  0  0  0  0  0  (0)
    64 32 16  8  4  2  1  (1)
    64 48 36 27 20 15 11 (33) (approx)
    64 56 49 43 37 33 29 (200) (approx)
    */
    	if(d > maxNoise[1])
    	{
    		if(d < maxNoise[2])
    		{
    			for(y=0; y<8; y++)
    			{
    				int x;
    				for(x=0; x<8; x++)
    				{
    					int ref= tempBlured[ x + y*stride ];
    					int cur= src[ x + y*stride ];
    					tempBlured[ x + y*stride ]=
    					src[ x + y*stride ]=
    						(ref + cur + 1)>>1;
    				}
    			}
    		}
    		else
    		{
    			for(y=0; y<8; y++)
    			{
    				int x;
    				for(x=0; x<8; x++)
    				{
    					tempBlured[ x + y*stride ]= src[ x + y*stride ];
    				}
    			}
    		}
    	}
    	else
    	{
    		if(d < maxNoise[0])
    		{
    			for(y=0; y<8; y++)
    			{
    				int x;
    				for(x=0; x<8; x++)
    				{
    					int ref= tempBlured[ x + y*stride ];
    					int cur= src[ x + y*stride ];
    					tempBlured[ x + y*stride ]=
    					src[ x + y*stride ]=
    						(ref*7 + cur + 4)>>3;
    				}
    			}
    		}
    		else
    		{
    			for(y=0; y<8; y++)
    			{
    				int x;
    				for(x=0; x<8; x++)
    				{
    					int ref= tempBlured[ x + y*stride ];
    					int cur= src[ x + y*stride ];
    					tempBlured[ x + y*stride ]=
    					src[ x + y*stride ]=
    						(ref*3 + cur + 2)>>2;
    				}
    			}
    		}
    	}
    
    // Optional OpenDivX postprocessor support, compiled in only when
    // HAVE_ODIVX_POSTPROCESS is defined.
    #ifdef HAVE_ODIVX_POSTPROCESS
    #include "../opendivx/postprocess.h"
    // When nonzero, postprocess() and postprocess2() hand the frame to
    // odivx_postprocess() and return instead of running the filters of this file.
    int use_old_pp=0;
    #endif
    
    // Forward declaration of the per-plane filter routine.
    // postprocess() and postprocess2() below call it once per plane:
    // isColor = 0 for src[0]/dst[0] at full size, then isColor = 1 and 2 for
    // src[1]/dst[1] and src[2]/dst[2] with width, height and strides halved.
    static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
    
    	QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
    
    /* -pp Command line Help
    NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)?
    
    -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]...
    
    long form example:
    -pp vdeblock:autoq,hdeblock:autoq,linblenddeint		-pp default,-vdeblock
    short form example:
    -pp vb:a,hb:a,lb					-pp de,-vb
    
    more examples:
    -pp tn:64:128:256
    
    
    Filters			Options
    short	long name	short	long option	Description
    *	*		a	autoq		cpu power dependent enabler
    			c	chrom		chrominance filtering enabled
    			y	nochrom		chrominance filtering disabled
    hb	hdeblock				horizontal deblocking filter
    vb	vdeblock				vertical deblocking filter
    vr	rkvdeblock
    h1	x1hdeblock				Experimental horizontal deblock filter 1
    v1	x1vdeblock				Experimental vertical deblock filter 1
    dr	dering					not implemented yet
    al	autolevels				automatic brightness / contrast fixer
    			f	fullyrange	stretch luminance range to (0..255)
    lb	linblenddeint				linear blend deinterlacer
    li	linipoldeint				linear interpolating deinterlacer
    ci	cubicipoldeint				cubic interpolating deinterlacer
    md	mediandeint				median deinterlacer
    de	default					hdeblock:a,vdeblock:a,dering:a,autolevels
    fa	fast					x1hdeblock:a,x1vdeblock:a,dering:a,autolevels
    
    tn	tmpnoise	(3 Thresholds)		Temporal Noise Reducer
    
    */
    
    /**
     * returns a PPMode struct which will have a nonzero error variable if an error occurred
     * name is the string after "-pp" on the command line
     * quality is a number from 0 to GET_PP_QUALITY_MAX
     */
    struct PPMode getPPModeByNameAndQuality(char *name, int quality)
    {
    	char temp[GET_MODE_BUFFER_SIZE];
    	char *p= temp;
    	char *filterDelimiters= ",";
    	char *optionDelimiters= ":";
    
    	struct PPMode ppMode= {0,0,0,0,0,0,{150,200,400}};
    
    	char *filterToken;
    
    	strncpy(temp, name, GET_MODE_BUFFER_SIZE);
    
    
    	for(;;){
    		char *filterName;
    
    		int q= 1000000; //GET_PP_QUALITY_MAX;
    
    		int chrom=-1;
    		char *option;
    		char *options[OPTIONS_ARRAY_SIZE];
    		int i;
    		int filterNameOk=0;
    		int numOfUnknownOptions=0;
    		int enable=1; //does the user want us to enabled or disabled the filter
    
    		filterToken= strtok(p, filterDelimiters);
    		if(filterToken == NULL) break;
    
    		p+= strlen(filterToken) + 1; // p points to next filterToken
    
    		filterName= strtok(filterToken, optionDelimiters);
    		printf("%s::%s\n", filterToken, filterName);
    
    		if(*filterName == '-')
    		{
    			enable=0;
    			filterName++;
    		}
    
    		for(;;){ //for all options
    			option= strtok(NULL, optionDelimiters);
    			if(option == NULL) break;
    
    			printf("%s\n", option);
    			if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
    			else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
    			else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
    			else
    			{
    				options[numOfUnknownOptions] = option;
    				numOfUnknownOptions++;
    			}
    			if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
    		}
    
    		options[numOfUnknownOptions] = NULL;
    
    
    		/* replace stuff from the replace Table */
    		for(i=0; replaceTable[2*i]!=NULL; i++)
    		{
    			if(!strcmp(replaceTable[2*i], filterName))
    			{
    				int newlen= strlen(replaceTable[2*i + 1]);
    				int plen;
    				int spaceLeft;
    
    				if(p==NULL) p= temp, *p=0; 	//last filter
    				else p--, *p=',';		//not last filter
    
    				plen= strlen(p);
    				spaceLeft= (int)p - (int)temp + plen;
    				if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
    				{
    					ppMode.error++;
    					break;
    				}
    				memmove(p + newlen, p, plen+1);
    				memcpy(p, replaceTable[2*i + 1], newlen);
    				filterNameOk=1;
    			}
    		}
    
    		for(i=0; filters[i].shortName!=NULL; i++)
    		{
    
    //			printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
    
    			if(   !strcmp(filters[i].longName, filterName)
    			   || !strcmp(filters[i].shortName, filterName))
    			{
    				ppMode.lumMode &= ~filters[i].mask;
    				ppMode.chromMode &= ~filters[i].mask;
    
    				filterNameOk=1;
    				if(!enable) break; // user wants to disable it
    
    				if(q >= filters[i].minLumQuality)
    					ppMode.lumMode|= filters[i].mask;
    				if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
    					if(q >= filters[i].minChromQuality)
    						ppMode.chromMode|= filters[i].mask;
    
    				if(filters[i].mask == LEVEL_FIX)
    				{
    					int o;
    					ppMode.minAllowedY= 16;
    					ppMode.maxAllowedY= 234;
    					for(o=0; options[o]!=NULL; o++)
    						if(  !strcmp(options[o],"fullyrange")
    						   ||!strcmp(options[o],"f"))
    						{
    							ppMode.minAllowedY= 0;
    							ppMode.maxAllowedY= 255;
    							numOfUnknownOptions--;
    						}
    				}
    
    				else if(filters[i].mask == TEMP_NOISE_FILTER)
    				{
    					int o;
    					int numOfNoises=0;
    					ppMode.maxTmpNoise[0]= 150;
    					ppMode.maxTmpNoise[1]= 200;
    					ppMode.maxTmpNoise[2]= 400;
    
    					for(o=0; options[o]!=NULL; o++)
    					{
    						char *tail;
    						ppMode.maxTmpNoise[numOfNoises]=
    							strtol(options[o], &tail, 0);
    						if(tail!=options[o])
    						{
    							numOfNoises++;
    							numOfUnknownOptions--;
    							if(numOfNoises >= 3) break;
    						}
    					}
    				}
    
    			}
    		}
    		if(!filterNameOk) ppMode.error++;
    		ppMode.error += numOfUnknownOptions;
    	}
    
    
    #ifdef HAVE_ODIVX_POSTPROCESS
    
    	if(ppMode.lumMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_H;
    	if(ppMode.lumMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_Y_V;
    	if(ppMode.chromMode & H_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_H;
    	if(ppMode.chromMode & V_DEBLOCK) ppMode.oldMode |= PP_DEBLOCK_C_V;
    	if(ppMode.lumMode & DERING) ppMode.oldMode |= PP_DERING_Y;
    	if(ppMode.chromMode & DERING) ppMode.oldMode |= PP_DERING_C;
    
    /**
     * Obsolete, don't use it, use postprocess2() instead
     */
    void  postprocess(unsigned char * src[], int src_stride,
                     unsigned char * dst[], int dst_stride,
                     int horizontal_size,   int vertical_size,
                     QP_STORE_T *QP_store,  int QP_stride,
    					  int mode)
    {
    
    	struct PPMode ppMode;
    	static QP_STORE_T zeroArray[2048/8];
    
    	ppMode= getPPModeByNameAndQuality("fast,default,-hdeblock,-vdeblock,tmpnoise:150:200:300", qual);
    	printf("OK\n");
    
    	printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
    		qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
    
    	postprocess2(src, src_stride, dst, dst_stride,
                     horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
    
    	return;
    */
    
    	if(QP_store==NULL)
    	{
    		QP_store= zeroArray;
    		QP_stride= 0;
    	}
    
    	ppMode.lumMode= mode;
    	mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
    	ppMode.chromMode= mode;
    
    	ppMode.maxTmpNoise[0]= 700;
    	ppMode.maxTmpNoise[1]= 1500;
    	ppMode.maxTmpNoise[2]= 3000;
    
    #ifdef HAVE_ODIVX_POSTPROCESS
    // Note: I could make this shit outside of this file, but it would mean one
    // more function call...
    	if(use_old_pp){
    	    odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
    	    return;
    	}
    #endif
    
    
    	postProcess(src[0], src_stride, dst[0], dst_stride,
    
    		horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
    
    
    	horizontal_size >>= 1;
    	vertical_size   >>= 1;
    	src_stride      >>= 1;
    	dst_stride      >>= 1;
    
    	if(1)
    	{
    
    		postProcess(src[1], src_stride, dst[1], dst_stride,
    
    			horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
    
    		postProcess(src[2], src_stride, dst[2], dst_stride,
    
    			horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
    
    		memset(dst[1], 128, dst_stride*vertical_size);
    		memset(dst[2], 128, dst_stride*vertical_size);
    //		memcpy(dst[1], src[1], src_stride*horizontal_size);
    //		memcpy(dst[2], src[2], src_stride*horizontal_size);
    
    void  postprocess2(unsigned char * src[], int src_stride,
                     unsigned char * dst[], int dst_stride,
                     int horizontal_size,   int vertical_size,
                     QP_STORE_T *QP_store,  int QP_stride,
    		 struct PPMode *mode)
    {
    
    
    	static QP_STORE_T zeroArray[2048/8];
    	if(QP_store==NULL)
    	{
    		QP_store= zeroArray;
    		QP_stride= 0;
    	}
    
    
    #ifdef HAVE_ODIVX_POSTPROCESS
    // Note: I could make this shit outside of this file, but it would mean one
    // more function call...
    	if(use_old_pp){
    	    odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
    	    mode->oldMode);
    	    return;
    	}
    #endif
    
    	postProcess(src[0], src_stride, dst[0], dst_stride,
    
    		horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
    
    
    	horizontal_size >>= 1;
    	vertical_size   >>= 1;
    	src_stride      >>= 1;
    	dst_stride      >>= 1;
    
    	postProcess(src[1], src_stride, dst[1], dst_stride,
    
    		horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
    
    	postProcess(src[2], src_stride, dst[2], dst_stride,
    
    		horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
    
    /**
     * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
     * 0 <= quality <= 6
     */
    int getPpModeForQuality(int quality){
    	/*
    	 * Maps a quality level to a set of filter flags; each level enables
    	 * everything the previous one does plus one more filter.
    	 * quality is used as an index into tables of 1+GET_PP_QUALITY_MAX entries,
    	 * so values outside 0..GET_PP_QUALITY_MAX are clamped to the nearest
    	 * valid level instead of reading past the tables.
    	 * If HAVE_ODIVX_POSTPROCESS is defined and use_old_pp is nonzero, the
    	 * equivalent PP_* flags are returned instead.
    	 */
    	const int modes[1+GET_PP_QUALITY_MAX]= {
    		0,
    #if 1
    		// horizontal filters first
    		LUM_H_DEBLOCK,
    		LUM_H_DEBLOCK | LUM_V_DEBLOCK,
    		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
    		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
    		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
    		LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
    #else
    		// vertical filters first
    		LUM_V_DEBLOCK,
    		LUM_V_DEBLOCK | LUM_H_DEBLOCK,
    		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
    		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
    		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
    		LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
    #endif
    	};
    #ifdef HAVE_ODIVX_POSTPROCESS
    	const int odivx_modes[1+GET_PP_QUALITY_MAX]= {
    		0,
    		PP_DEBLOCK_Y_H,
    		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
    		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
    		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
    		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
    		PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
    	};
    #endif
    
    	// keep the index inside the tables
    	if(quality < 0) quality= 0;
    	else if(quality > GET_PP_QUALITY_MAX) quality= GET_PP_QUALITY_MAX;
    
    #ifdef HAVE_ODIVX_POSTPROCESS
    	if(use_old_pp) return odivx_modes[quality];
    #endif
    	return modes[quality];
    }
    
    /**
     * Copies a block from src to dst and fixes the blacklevel
     * numLines must be a multiple of 4
     * levelFix == 0 -> don't touch the brightness & contrast
     */
    
    static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    	int levelFix)
    
    #ifdef HAVE_MMX
    					asm volatile(
    						"leal (%2,%2), %%eax	\n\t"
    						"leal (%3,%3), %%ebx	\n\t"
    						"movq packedYOffset, %%mm2	\n\t"
    						"movq packedYScale, %%mm3	\n\t"
    
    
    #define SCALED_CPY					\
    						"movq (%0), %%mm0	\n\t"\
    
    						"punpcklbw %%mm4, %%mm0 \n\t"\
    						"punpckhbw %%mm4, %%mm5 \n\t"\
    
    						"psubw %%mm2, %%mm0	\n\t"\
    						"psubw %%mm2, %%mm5	\n\t"\
    
    						"psllw $6, %%mm0	\n\t"\
    						"psllw $6, %%mm5	\n\t"\
    
    						"pmulhw %%mm3, %%mm5	\n\t"\
    						"punpcklbw %%mm4, %%mm1 \n\t"\
    
    						"punpckhbw %%mm4, %%mm6 \n\t"\
    
    						"psubw %%mm2, %%mm1	\n\t"\
    
    						"psllw $6, %%mm1	\n\t"\
    
    						"pmulhw %%mm3, %%mm6	\n\t"\
    						"addl %%eax, %0		\n\t"\
    						"packuswb %%mm5, %%mm0	\n\t"\
    						"packuswb %%mm6, %%mm1	\n\t"\
    						"movq %%mm0, (%1)	\n\t"\
    
    SCALED_CPY
    						"addl %%ebx, %1		\n\t"
    SCALED_CPY
    
    						: "+r"(src),
    						"+r"(dst)
    						:"r" (srcStride),
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    				for(i=0; i<8; i++)
    
    					memcpy(	&(dst[dstStride*i]),
    						&(src[srcStride*i]), BLOCK_SIZE);
    #endif