Newer
Older
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
"movq (%%"FF_REG_a", %1), %%mm1 \n\t"
"movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
"movq %%mm1, %%mm4 \n\t"
"punpcklbw %%mm3, %%mm1 \n\t"
"punpckhbw %%mm3, %%mm4 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklwd %%mm1, %%mm0 \n\t"
"punpckhwd %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm1 \n\t"
"punpcklwd %%mm4, %%mm2 \n\t"
"punpckhwd %%mm4, %%mm1 \n\t"
"movd %%mm0, 128(%2) \n\t"
"psrlq $32, %%mm0 \n\t"
"movd %%mm0, 144(%2) \n\t"
"movd %%mm3, 160(%2) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 176(%2) \n\t"
"movd %%mm3, 48(%3) \n\t"
"movd %%mm2, 192(%2) \n\t"
"movd %%mm2, 64(%3) \n\t"
"psrlq $32, %%mm2 \n\t"
"movd %%mm2, 80(%3) \n\t"
"movd %%mm1, 96(%3) \n\t"
"psrlq $32, %%mm1 \n\t"
"movd %%mm1, 112(%3) \n\t"
"lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
"movq (%0, %1, 4), %%mm0 \n\t" // 12345678
"movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
"movq %%mm0, %%mm2 \n\t" // 12345678
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
"movq (%%"FF_REG_a", %1), %%mm1 \n\t"
"movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
"movq %%mm1, %%mm4 \n\t"
"punpcklbw %%mm3, %%mm1 \n\t"
"punpckhbw %%mm3, %%mm4 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklwd %%mm1, %%mm0 \n\t"
"punpckhwd %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm1 \n\t"
"punpcklwd %%mm4, %%mm2 \n\t"
"punpckhwd %%mm4, %%mm1 \n\t"
"movd %%mm0, 132(%2) \n\t"
"psrlq $32, %%mm0 \n\t"
"movd %%mm0, 148(%2) \n\t"
"movd %%mm3, 164(%2) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 180(%2) \n\t"
"movd %%mm3, 52(%3) \n\t"
"movd %%mm2, 196(%2) \n\t"
"movd %%mm2, 68(%3) \n\t"
"psrlq $32, %%mm2 \n\t"
"movd %%mm2, 84(%3) \n\t"
"movd %%mm1, 100(%3) \n\t"
"psrlq $32, %%mm1 \n\t"
"movd %%mm1, 116(%3) \n\t"
:: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
: "%"FF_REG_a
);
}
/**
* Transpose the given 8x8 block.
*
* Reads eight 8-byte rows from src at byte offsets 0, 16, ..., 112 (i.e. the
* source rows are addressed with a fixed 16-byte pitch) and writes the
* transposed block to dst, whose rows are dstStride bytes apart.
* The work is done in two passes of four source rows each; every pass
* produces four bytes of each of the eight destination rows.
*
* @param dst       destination block; 8 rows of 8 bytes are written
* @param dstStride distance in bytes between consecutive destination rows
* @param src       source block, read only
*/
Michael Niedermayer
committed
static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
{
// REG_a = dst + 1*dstStride, REG_d = dst + 5*dstStride; together with %0
// they let every destination row 0..7 be addressed (see the table below).
"lea (%0, %1), %%"FF_REG_a" \n\t"
"lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
// Pass 1: interleave source rows 0-3 (bytes, then words) so that each
// 32-bit lane holds one column of those four rows.
"movq (%2), %%mm0 \n\t" // 12345678
"movq 16(%2), %%mm1 \n\t" // abcdefgh
"movq %%mm0, %%mm2 \n\t" // 12345678
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
"movq 32(%2), %%mm1 \n\t"
"movq 48(%2), %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"punpcklbw %%mm3, %%mm1 \n\t"
"punpckhbw %%mm3, %%mm4 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklwd %%mm1, %%mm0 \n\t"
"punpckhwd %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm1 \n\t"
"punpcklwd %%mm4, %%mm2 \n\t"
"punpckhwd %%mm4, %%mm1 \n\t"
// Store bytes 0-3 of destination rows 0..7 (low dword, then high dword
// of each register after the 32-bit shift).
"movd %%mm0, (%0) \n\t"
"psrlq $32, %%mm0 \n\t"
"movd %%mm0, (%%"FF_REG_a") \n\t"
"movd %%mm3, (%%"FF_REG_a", %1) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
"movd %%mm2, (%0, %1, 4) \n\t"
"psrlq $32, %%mm2 \n\t"
"movd %%mm2, (%%"FF_REG_d") \n\t"
"movd %%mm1, (%%"FF_REG_d", %1) \n\t"
"psrlq $32, %%mm1 \n\t"
"movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
// Pass 2: same shuffle for source rows 4-7 (byte offsets 64..112).
"movq 64(%2), %%mm0 \n\t" // 12345678
"movq 80(%2), %%mm1 \n\t" // abcdefgh
"movq %%mm0, %%mm2 \n\t" // 12345678
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
"movq 96(%2), %%mm1 \n\t"
"movq 112(%2), %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"punpcklbw %%mm3, %%mm1 \n\t"
"punpckhbw %%mm3, %%mm4 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklwd %%mm1, %%mm0 \n\t"
"punpckhwd %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm1 \n\t"
"punpcklwd %%mm4, %%mm2 \n\t"
"punpckhwd %%mm4, %%mm1 \n\t"
// Store bytes 4-7 of destination rows 0..7 (same rows as pass 1, +4 bytes).
"movd %%mm0, 4(%0) \n\t"
"psrlq $32, %%mm0 \n\t"
"movd %%mm0, 4(%%"FF_REG_a") \n\t"
"movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
"movd %%mm2, 4(%0, %1, 4) \n\t"
"psrlq $32, %%mm2 \n\t"
"movd %%mm2, 4(%%"FF_REG_d") \n\t"
"movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
"psrlq $32, %%mm1 \n\t"
"movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
// Operands: %0 = dst, %1 = dstStride widened to x86_reg, %2 = src.
// NOTE(review): dst is written through but no "memory" clobber is listed,
// and mm0-mm4 are not in the clobber list — confirm this is intentional.
:: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
: "%"FF_REG_a, "%"FF_REG_d
);
}
Aurelien Jacobs
committed
//static long test=0;
/**
* Temporal noise reducer for one 8x8 block.
*
* Measures the difference between the current block (src) and the running
* blurred block (tempBlurred), filters that measure with the neighbouring
* entries around *tempBlurredPast (offsets -256, -1, +1 and +256), and then
* compares the result against the three maxNoise thresholds to choose one
* of four actions: copy src into tempBlurred, or blend the two blocks with
* a weight of roughly 1/2, 3/4 or 7/8 on tempBlurred.
*
* @param src             current block; rows are stride bytes apart
* @param stride          row pitch in bytes, used for both src and tempBlurred
* @param tempBlurred     blurred reference block, same layout as src
* @param tempBlurredPast entry for this block in an array of past difference
*                        values; entries [127..129] are overwritten with the
*                        thresholds on every call
* @param maxNoise        three thresholds, maxNoise[0] .. maxNoise[2]
*/
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
Michael Niedermayer
committed
uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
// Stash the thresholds at fixed offsets from tempBlurredPast so the asm can
// reach them through that pointer (byte offsets 508/512/516 below),
// to save a register (FIXME do this outside of the loops)
tempBlurredPast[127]= maxNoise[0];
tempBlurredPast[128]= maxNoise[1];
tempBlurredPast[129]= maxNoise[2];
Michael Niedermayer
committed
#define FAST_L2_DIFF
//#define L1_DIFF // you should change the thresholds too if you try that one
#if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
// Operands: %0 = src, %1 = tempBlurred, %2 = stride, %3 = tempBlurredPast (memory).
"lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
"lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
"lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
// 0 1 2 3 4 5 6 7 8 9
// %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
Michael Niedermayer
committed
//FIXME reorder?
#ifdef L1_DIFF //needs mmx2
// Sum of absolute differences over the 8 rows, accumulated into mm0.
"movq (%0), %%mm0 \n\t" // L0
"psadbw (%1), %%mm0 \n\t" // |L0-R0|
"movq (%0, %2), %%mm1 \n\t" // L1
"psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
"movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
"psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
"movq (%0, %2, 4), %%mm4 \n\t" // L4
"paddw %%mm1, %%mm0 \n\t"
"psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
"movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
"paddw %%mm2, %%mm0 \n\t"
"psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
"movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
"paddw %%mm3, %%mm0 \n\t"
"psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
"movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
"paddw %%mm4, %%mm0 \n\t"
"psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"paddw %%mm6, %%mm0 \n\t"
#else //L1_DIFF
Aurelien Jacobs
committed
#if defined (FAST_L2_DIFF)
// mm7 = all ones (used to complement the second operand), mm6 = b80
// constant, mm0 = running sum. PAVGB and b80 are defined outside this
// block; presumably this variant approximates the squared difference
// without unpacking bytes to words — TODO confirm.
"pcmpeqb %%mm7, %%mm7 \n\t"
"movq "MANGLE(b80)", %%mm6 \n\t"
"pxor %%mm0, %%mm0 \n\t"
Aurelien Jacobs
committed
#define REAL_L2_DIFF_CORE(a, b)\
"movq " #a ", %%mm5 \n\t"\
"movq " #b ", %%mm2 \n\t"\
"pxor %%mm7, %%mm2 \n\t"\
PAVGB(%%mm2, %%mm5)\
"paddb %%mm6, %%mm5 \n\t"\
"movq %%mm5, %%mm2 \n\t"\
"psllw $8, %%mm5 \n\t"\
"pmaddwd %%mm5, %%mm5 \n\t"\
"pmaddwd %%mm2, %%mm2 \n\t"\
"paddd %%mm2, %%mm5 \n\t"\
"psrld $14, %%mm5 \n\t"\
"paddd %%mm5, %%mm0 \n\t"
Michael Niedermayer
committed
#else //defined (FAST_L2_DIFF)
// mm7 = 0 (zero-extends bytes to words when unpacking), mm0 = running sum
// of squared word differences.
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
Aurelien Jacobs
committed
#define REAL_L2_DIFF_CORE(a, b)\
"movq " #a ", %%mm5 \n\t"\
"movq " #b ", %%mm2 \n\t"\
"movq %%mm5, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psubw %%mm3, %%mm1 \n\t"\
"pmaddwd %%mm5, %%mm5 \n\t"\
"pmaddwd %%mm1, %%mm1 \n\t"\
"paddd %%mm1, %%mm5 \n\t"\
"paddd %%mm5, %%mm0 \n\t"
Michael Niedermayer
committed
#endif //defined (FAST_L2_DIFF)
Aurelien Jacobs
committed
// One invocation per row 0..7 of the block (src row, tempBlurred row).
#define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
L2_DIFF_CORE((%0) , (%1))
L2_DIFF_CORE((%0, %2) , (%1, %2))
L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
Michael Niedermayer
committed
#endif //L1_DIFF
Michael Niedermayer
committed
// Fold the two dwords of mm0 into one sum, then
// ecx = (4*sum + past[-1] + past[+1] + past[-256] + past[+256] + 4) >> 3
// where past = tempBlurredPast (byte offsets -4/+4/-1024/+1024), and store
// ecx back to past[0]. This reuses ecx and REG_d, destroying 7*stride and
// 5*stride.
"movq %%mm0, %%mm4 \n\t"
"psrlq $32, %%mm0 \n\t"
"paddd %%mm0, %%mm4 \n\t"
"movd %%mm4, %%ecx \n\t"
"shll $2, %%ecx \n\t"
"mov %3, %%"FF_REG_d" \n\t"
"addl -4(%%"FF_REG_d"), %%ecx \n\t"
"addl 4(%%"FF_REG_d"), %%ecx \n\t"
"addl -1024(%%"FF_REG_d"), %%ecx \n\t"
"addl $4, %%ecx \n\t"
"addl 1024(%%"FF_REG_d"), %%ecx \n\t"
"shrl $3, %%ecx \n\t"
"movl %%ecx, (%%"FF_REG_d") \n\t"
// "mov %3, %%"FF_REG_c" \n\t"
// "mov %%"FF_REG_c", test \n\t"
// "jmp 4f \n\t"
// Threshold dispatch; byte offsets 508/512/516 are tempBlurredPast[127..129],
// i.e. maxNoise[0..2] as stored at the top of the function.
//   ecx <  maxNoise[1]                      -> label 2 (strong blends)
//   maxNoise[1] <= ecx < maxNoise[2]        -> label 1 (single average)
//   otherwise                               -> fall through (plain copy)
"cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
" jb 2f \n\t"
"cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
" jb 1f \n\t"
// Copy case: tempBlurred = src. REG_d/REG_c were clobbered above, so
// 5*stride and 7*stride are rebuilt first (also done in every other case).
"lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
"lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
"movq (%0), %%mm0 \n\t" // L0
"movq (%0, %2), %%mm1 \n\t" // L1
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
"movq (%0, %2, 4), %%mm4 \n\t" // L4
"movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
"movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
"movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
"movq %%mm0, (%1) \n\t" // L0
"movq %%mm1, (%1, %2) \n\t" // L1
"movq %%mm2, (%1, %2, 2) \n\t" // L2
"movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
"movq %%mm4, (%1, %2, 4) \n\t" // L4
"movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
"movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
"movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
"jmp 4f \n\t"
// Label 1: one PAVGB of src with tempBlurred per row; the result is stored
// to both tempBlurred (R rows) and src (L rows).
"1: \n\t"
"lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
"lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
"movq (%0), %%mm0 \n\t" // L0
PAVGB((%1), %%mm0) // L0
"movq (%0, %2), %%mm1 \n\t" // L1
PAVGB((%1, %2), %%mm1) // L1
"movq (%0, %2, 2), %%mm2 \n\t" // L2
PAVGB((%1, %2, 2), %%mm2) // L2
"movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
PAVGB((%1, %%FF_REGa), %%mm3) // L3
"movq (%0, %2, 4), %%mm4 \n\t" // L4
PAVGB((%1, %2, 4), %%mm4) // L4
"movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
PAVGB((%1, %%FF_REGd), %%mm5) // L5
"movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
"movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
PAVGB((%1, %%FF_REGc), %%mm7) // L7
"movq %%mm0, (%1) \n\t" // R0
"movq %%mm1, (%1, %2) \n\t" // R1
"movq %%mm2, (%1, %2, 2) \n\t" // R2
"movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
"movq %%mm4, (%1, %2, 4) \n\t" // R4
"movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
"movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
"movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
"movq %%mm0, (%0) \n\t" // L0
"movq %%mm1, (%0, %2) \n\t" // L1
"movq %%mm2, (%0, %2, 2) \n\t" // L2
"movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
"movq %%mm4, (%0, %2, 4) \n\t" // L4
"movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
"movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
"movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
"jmp 4f \n\t"
// Label 2: ecx < maxNoise[1]. If also ecx < maxNoise[0] go to label 3,
// otherwise apply PAVGB with the tempBlurred row twice (weights the result
// towards tempBlurred, cf. (ref*3 + cur + 2)>>2 in the C fallback), four
// rows at a time, storing to both buffers.
"2: \n\t"
"cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
" jb 3f \n\t"
"lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
"lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
"movq (%0), %%mm0 \n\t" // L0
"movq (%0, %2), %%mm1 \n\t" // L1
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
"movq (%1), %%mm4 \n\t" // R0
"movq (%1, %2), %%mm5 \n\t" // R1
"movq (%1, %2, 2), %%mm6 \n\t" // R2
"movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1) \n\t" // R0
"movq %%mm1, (%1, %2) \n\t" // R1
"movq %%mm2, (%1, %2, 2) \n\t" // R2
"movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
"movq %%mm0, (%0) \n\t" // L0
"movq %%mm1, (%0, %2) \n\t" // L1
"movq %%mm2, (%0, %2, 2) \n\t" // L2
"movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
"movq (%0, %2, 4), %%mm0 \n\t" // L4
"movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
"movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
"movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
"movq (%1, %2, 4), %%mm4 \n\t" // R4
"movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
"movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
"movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1, %2, 4) \n\t" // R4
"movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
"movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
"movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
"movq %%mm0, (%0, %2, 4) \n\t" // L4
"movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
"movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
"movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
"jmp 4f \n\t"
// Label 3: ecx < maxNoise[0]. Same structure as label 2 but with three
// PAVGB passes per row (cf. (ref*7 + cur + 4)>>3 in the C fallback).
"3: \n\t"
"lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
"lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
"movq (%0), %%mm0 \n\t" // L0
"movq (%0, %2), %%mm1 \n\t" // L1
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
"movq (%1), %%mm4 \n\t" // R0
"movq (%1, %2), %%mm5 \n\t" // R1
"movq (%1, %2, 2), %%mm6 \n\t" // R2
"movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1) \n\t" // R0
"movq %%mm1, (%1, %2) \n\t" // R1
"movq %%mm2, (%1, %2, 2) \n\t" // R2
"movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
"movq %%mm0, (%0) \n\t" // L0
"movq %%mm1, (%0, %2) \n\t" // L1
"movq %%mm2, (%0, %2, 2) \n\t" // L2
"movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
"movq (%0, %2, 4), %%mm0 \n\t" // L4
"movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
"movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
"movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
"movq (%1, %2, 4), %%mm4 \n\t" // R4
"movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
"movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
"movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1, %2, 4) \n\t" // R4
"movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
"movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
"movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
"movq %%mm0, (%0, %2, 4) \n\t" // L4
"movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
"movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
"movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
// Label 4: common exit of all four cases.
"4: \n\t"
:: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
NAMED_CONSTRAINTS_ADD(b80)
: "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
);
#else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
// Plain C fallback.
int y;
int d=0;
// int sysd=0;
int i;
// d = sum of squared differences between tempBlurred and src over the block.
for(y=0; y<8; y++){
int x;
for(x=0; x<8; x++){
int ref= tempBlurred[ x + y*stride ];
int cur= src[ x + y*stride ];
int d1=ref - cur;
// if(x==0 || x==7) d1+= d1>>1;
// if(y==0 || y==7) d1+= d1>>1;
// d+= FFABS(d1);
d+= d1*d1;
// sysd+= d1;
}
// Keep the raw sum in i, then filter d with the four neighbouring entries.
// NOTE(review): here the unfiltered sum i is stored back to *tempBlurredPast,
// whereas the asm path above stores the filtered value — confirm which is
// intended.
i=d;
d= (
4*d
+(*(tempBlurredPast-256))
+(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
+(*(tempBlurredPast+256))
+4)>>3;
*tempBlurredPast=i;
// ((*tempBlurredPast)*3 + d + 2)>>2;
/*
Switch between
1 0 0 0 0 0 0 (0)
64 32 16 8 4 2 1 (1)
64 48 36 27 20 15 11 (33) (approx)
64 56 49 43 37 33 29 (200) (approx)
*/
// NOTE(review): in the blend branches below only src is assigned as this
// code reads, while the asm path stores the blend to both src and
// tempBlurred — confirm against the complete source.
if(d > maxNoise[1]){
if(d < maxNoise[2]){
// maxNoise[1] < d < maxNoise[2]: rounded average of ref and cur.
for(y=0; y<8; y++){
int x;
for(x=0; x<8; x++){
int ref= tempBlurred[ x + y*stride ];
int cur= src[ x + y*stride ];
src[ x + y*stride ]=
(ref + cur + 1)>>1;
}
}else{
// d >= maxNoise[2]: take the current block as the new reference.
for(y=0; y<8; y++){
int x;
for(x=0; x<8; x++){
tempBlurred[ x + y*stride ]= src[ x + y*stride ];
}else{
if(d < maxNoise[0]){
// d < maxNoise[0]: 7/8 reference, 1/8 current, rounded.
for(y=0; y<8; y++){
int x;
for(x=0; x<8; x++){
int ref= tempBlurred[ x + y*stride ];
int cur= src[ x + y*stride ];
src[ x + y*stride ]=
(ref*7 + cur + 4)>>3;
}
}else{
// maxNoise[0] <= d <= maxNoise[1]: 3/4 reference, 1/4 current, rounded.
for(y=0; y<8; y++){
int x;
for(x=0; x<8; x++){
int ref= tempBlurred[ x + y*stride ];
int cur= src[ x + y*stride ];
src[ x + y*stride ]=
(ref*3 + cur + 2)>>2;
#endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
/**
* accurate deblock filter
*/
static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
int64_t dc_mask, eq_mask, both_masks;
int64_t sums[10*8*2];
src+= step*3; // src points to begin of the 8x8 Block
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
: : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
);
"lea (%2, %3), %%"FF_REG_a" \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
"movq (%2), %%mm0 \n\t"
"movq (%%"FF_REG_a"), %%mm1 \n\t"
"movq %%mm1, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"psubb %%mm1, %%mm0 \n\t" // mm0 = difference
"paddb %%mm7, %%mm0 \n\t"
"pcmpgtb %%mm6, %%mm0 \n\t"
"movq (%%"FF_REG_a",%3), %%mm2 \n\t"
PMAXUB(%%mm2, %%mm4)
PMINUB(%%mm2, %%mm3, %%mm5)
"psubb %%mm2, %%mm1 \n\t"
"paddb %%mm7, %%mm1 \n\t"
"pcmpgtb %%mm6, %%mm1 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
PMAXUB(%%mm1, %%mm4)
PMINUB(%%mm1, %%mm3, %%mm5)
"psubb %%mm1, %%mm2 \n\t"
"paddb %%mm7, %%mm2 \n\t"
"pcmpgtb %%mm6, %%mm2 \n\t"
"paddb %%mm2, %%mm0 \n\t"
"lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
"movq (%2, %3, 4), %%mm2 \n\t"
PMAXUB(%%mm2, %%mm4)
PMINUB(%%mm2, %%mm3, %%mm5)
"psubb %%mm2, %%mm1 \n\t"
"paddb %%mm7, %%mm1 \n\t"
"pcmpgtb %%mm6, %%mm1 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq (%%"FF_REG_a"), %%mm1 \n\t"
PMAXUB(%%mm1, %%mm4)
PMINUB(%%mm1, %%mm3, %%mm5)
"psubb %%mm1, %%mm2 \n\t"
"paddb %%mm7, %%mm2 \n\t"
"pcmpgtb %%mm6, %%mm2 \n\t"
"paddb %%mm2, %%mm0 \n\t"
"movq (%%"FF_REG_a", %3), %%mm2 \n\t"
PMAXUB(%%mm2, %%mm4)
PMINUB(%%mm2, %%mm3, %%mm5)
"psubb %%mm2, %%mm1 \n\t"
"paddb %%mm7, %%mm1 \n\t"
"pcmpgtb %%mm6, %%mm1 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
PMAXUB(%%mm1, %%mm4)
PMINUB(%%mm1, %%mm3, %%mm5)
"psubb %%mm1, %%mm2 \n\t"
"paddb %%mm7, %%mm2 \n\t"
"pcmpgtb %%mm6, %%mm2 \n\t"
"paddb %%mm2, %%mm0 \n\t"
"movq (%2, %3, 8), %%mm2 \n\t"
PMAXUB(%%mm2, %%mm4)
PMINUB(%%mm2, %%mm3, %%mm5)
"psubb %%mm2, %%mm1 \n\t"
"paddb %%mm7, %%mm1 \n\t"
"pcmpgtb %%mm6, %%mm1 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
"psubb %%mm1, %%mm2 \n\t"
"paddb %%mm7, %%mm2 \n\t"
"pcmpgtb %%mm6, %%mm2 \n\t"
"paddb %%mm2, %%mm0 \n\t"
"psubusb %%mm3, %%mm4 \n\t"
"pxor %%mm6, %%mm6 \n\t"
"movq %4, %%mm7 \n\t" // QP,..., QP
"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
"psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
"pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
"pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
"movq %%mm7, %1 \n\t"
"movq %5, %%mm7 \n\t"
"punpcklbw %%mm7, %%mm7 \n\t"
"punpcklbw %%mm7, %%mm7 \n\t"
"punpcklbw %%mm7, %%mm7 \n\t"
"psubb %%mm0, %%mm6 \n\t"
"pcmpgtb %%mm7, %%mm6 \n\t"
"movq %%mm6, %0 \n\t"
: "=m" (eq_mask), "=m" (dc_mask)
: "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
: "%"FF_REG_a
);
both_masks = dc_mask & eq_mask;
if(both_masks){
x86_reg offset= -8*step;
int64_t *temp_sums= sums;
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
"movq %2, %%mm0 \n\t" // QP,..., QP
"pxor %%mm4, %%mm4 \n\t"
"movq (%0), %%mm6 \n\t"
"movq (%0, %1), %%mm5 \n\t"
"movq %%mm5, %%mm1 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psubusb %%mm6, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
"psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
"pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
"pxor %%mm6, %%mm1 \n\t"
"pand %%mm0, %%mm1 \n\t"
"pxor %%mm1, %%mm6 \n\t"
// 0:QP 6:First
"movq (%0, %1, 8), %%mm5 \n\t"
"add %1, %0 \n\t" // %0 points to line 1 not 0
"movq (%0, %1, 8), %%mm7 \n\t"
"movq %%mm5, %%mm1 \n\t"
"movq %%mm7, %%mm2 \n\t"
"psubusb %%mm7, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
"movq %2, %%mm0 \n\t" // QP,..., QP
"psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
"pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
"pxor %%mm7, %%mm1 \n\t"
"pand %%mm0, %%mm1 \n\t"
"pxor %%mm1, %%mm7 \n\t"
"movq %%mm6, %%mm5 \n\t"
"punpckhbw %%mm4, %%mm6 \n\t"
"punpcklbw %%mm4, %%mm5 \n\t"
// 4:0 5/6:First 7:Last
"movq %%mm5, %%mm0 \n\t"
"movq %%mm6, %%mm1 \n\t"
"psllw $2, %%mm0 \n\t"
"psllw $2, %%mm1 \n\t"
"paddw "MANGLE(w04)", %%mm0 \n\t"
"paddw "MANGLE(w04)", %%mm1 \n\t"
"movq (%0), %%mm2 \n\t"\
"movq (%0), %%mm3 \n\t"\
"add %1, %0 \n\t"\
"punpcklbw %%mm4, %%mm2 \n\t"\
"punpckhbw %%mm4, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
"movq (%0), %%mm2 \n\t"\
"movq (%0), %%mm3 \n\t"\
"add %1, %0 \n\t"\
"punpcklbw %%mm4, %%mm2 \n\t"\
"punpckhbw %%mm4, %%mm3 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm3, %%mm1 \n\t"
NEXT //0
NEXT //1
NEXT //2
"movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t"
NEXT //3
"psubw %%mm5, %%mm0 \n\t"
"psubw %%mm6, %%mm1 \n\t"
"movq %%mm0, 16(%3) \n\t"
"movq %%mm1, 24(%3) \n\t"
NEXT //4
"psubw %%mm5, %%mm0 \n\t"
"psubw %%mm6, %%mm1 \n\t"
"movq %%mm0, 32(%3) \n\t"
"movq %%mm1, 40(%3) \n\t"
NEXT //5
"psubw %%mm5, %%mm0 \n\t"
"psubw %%mm6, %%mm1 \n\t"
"movq %%mm0, 48(%3) \n\t"
"movq %%mm1, 56(%3) \n\t"
NEXT //6
"psubw %%mm5, %%mm0 \n\t"
"psubw %%mm6, %%mm1 \n\t"
"movq %%mm0, 64(%3) \n\t"
"movq %%mm1, 72(%3) \n\t"
"movq %%mm7, %%mm6 \n\t"
"punpckhbw %%mm4, %%mm7 \n\t"
"punpcklbw %%mm4, %%mm6 \n\t"
NEXT //7
"mov %4, %0 \n\t"
"add %1, %0 \n\t"
PREV //0
"movq %%mm0, 80(%3) \n\t"
"movq %%mm1, 88(%3) \n\t"
PREV //1
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"movq %%mm0, 96(%3) \n\t"
"movq %%mm1, 104(%3) \n\t"
PREV //2
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"movq %%mm0, 112(%3) \n\t"
"movq %%mm1, 120(%3) \n\t"
PREV //3
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"movq %%mm0, 128(%3) \n\t"
"movq %%mm1, 136(%3) \n\t"
PREV //4
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"movq %%mm0, 144(%3) \n\t"
"movq %%mm1, 152(%3) \n\t"
"mov %4, %0 \n\t" //FIXME
: "+&r"(src)
: "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
NAMED_CONSTRAINTS_ADD(w04)
);
src+= step; // src points to begin of the 8x8 Block
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
"movq %4, %%mm6 \n\t"
"pcmpeqb %%mm5, %%mm5 \n\t"
"pxor %%mm6, %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t"
"paddw 32(%1), %%mm0 \n\t"
"paddw 40(%1), %%mm1 \n\t"
"movq (%0, %3), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"psrlw $4, %%mm0 \n\t"
"psrlw $4, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"pand %%mm6, %%mm0 \n\t"
"pand %%mm5, %%mm4 \n\t"
"por %%mm4, %%mm0 \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $16, %1 \n\t"
"add %2, %0 \n\t"
" js 1b \n\t"
: "+r"(offset), "+r"(temp_sums)
: "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
);
}else
src+= step; // src points to begin of the 8x8 Block
if(eq_mask != -1LL){
uint8_t *temp_src= src;
DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
"pxor %%mm7, %%mm7 \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
"movq (%0), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
"movq (%0, %1), %%mm2 \n\t"
"lea (%0, %1, 2), %%"FF_REG_a" \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
"movq (%%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
"paddw %%mm0, %%mm0 \n\t" // 2L0
"paddw %%mm1, %%mm1 \n\t" // 2H0
"psubw %%mm4, %%mm2 \n\t" // L1 - L2
"psubw %%mm5, %%mm3 \n\t" // H1 - H2
"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
"movq (%%"FF_REG_a", %1), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L3
"punpckhbw %%mm7, %%mm3 \n\t" // H3
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
"movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
"movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" // L4
"punpckhbw %%mm7, %%mm1 \n\t" // H4
"psubw %%mm0, %%mm2 \n\t" // L3 - L4
"psubw %%mm1, %%mm3 \n\t" // H3 - H4
"movq %%mm2, 16(%4) \n\t" // L3 - L4
"movq %%mm3, 24(%4) \n\t" // H3 - H4
"paddw %%mm4, %%mm4 \n\t" // 2L2
"paddw %%mm5, %%mm5 \n\t" // 2H2
"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
"lea (%%"FF_REG_a", %1), %0 \n\t"
"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
"movq (%0, %1, 2), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L5
"punpckhbw %%mm7, %%mm3 \n\t" // H5
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
"movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t" // L6
"psubw %%mm6, %%mm2 \n\t" // L5 - L6
"movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
"punpckhbw %%mm7, %%mm6 \n\t" // H6
"psubw %%mm6, %%mm3 \n\t" // H5 - H6
"paddw %%mm0, %%mm0 \n\t" // 2L4
"paddw %%mm1, %%mm1 \n\t" // 2H4
"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
"movq (%0, %1, 4), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L7
"punpckhbw %%mm7, %%mm3 \n\t" // H7
"paddw %%mm2, %%mm2 \n\t" // 2L7
"paddw %%mm3, %%mm3 \n\t" // 2H7
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
"movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm0, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm1, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm2, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm3, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#else
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm0, %%mm6 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm1, %%mm6 \n\t"
"pxor %%mm6, %%mm1 \n\t"
"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm2, %%mm6 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm3, %%mm6 \n\t"
"pxor %%mm6, %%mm3 \n\t"
"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#endif
"pminsw %%mm2, %%mm0 \n\t"
"pminsw %%mm3, %%mm1 \n\t"
#else
"movq %%mm0, %%mm6 \n\t"
"psubusw %%mm2, %%mm6 \n\t"
"psubw %%mm6, %%mm0 \n\t"
"movq %%mm1, %%mm6 \n\t"
"psubusw %%mm3, %%mm6 \n\t"
"psubw %%mm6, %%mm1 \n\t"
#endif
"movd %2, %%mm2 \n\t" // QP
"punpcklbw %%mm7, %%mm2 \n\t"
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
"pxor %%mm6, %%mm4 \n\t"
"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
"pxor %%mm7, %%mm5 \n\t"
"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|