Newer
Older
"movq %%mm2, %%mm1 \n\t"
"punpcklwd %%mm4, %%mm2 \n\t"
"punpckhwd %%mm4, %%mm1 \n\t"
"movd %%mm0, (%0) \n\t"
"psrlq $32, %%mm0 \n\t"
"movd %%mm0, (%%eax) \n\t"
"movd %%mm3, (%%eax, %1) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, (%%eax, %1, 2) \n\t"
"movd %%mm2, (%0, %1, 4) \n\t"
"psrlq $32, %%mm2 \n\t"
"movd %%mm2, (%%edx) \n\t"
"movd %%mm1, (%%edx, %1) \n\t"
"psrlq $32, %%mm1 \n\t"
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
"movq 64(%2), %%mm0 \n\t" // 12345678
"movq 80(%2), %%mm1 \n\t" // abcdefgh
"movq %%mm0, %%mm2 \n\t" // 12345678
"punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
"punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
"movq 96(%2), %%mm1 \n\t"
"movq 112(%2), %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"punpcklbw %%mm3, %%mm1 \n\t"
"punpckhbw %%mm3, %%mm4 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklwd %%mm1, %%mm0 \n\t"
"punpckhwd %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm1 \n\t"
"punpcklwd %%mm4, %%mm2 \n\t"
"punpckhwd %%mm4, %%mm1 \n\t"
"movd %%mm0, 4(%0) \n\t"
"psrlq $32, %%mm0 \n\t"
"movd %%mm0, 4(%%eax) \n\t"
"movd %%mm3, 4(%%eax, %1) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 4(%%eax, %1, 2) \n\t"
"movd %%mm2, 4(%0, %1, 4) \n\t"
"psrlq $32, %%mm2 \n\t"
"movd %%mm2, 4(%%edx) \n\t"
"movd %%mm1, 4(%%edx, %1) \n\t"
"psrlq $32, %%mm1 \n\t"
:: "r" (dst), "r" (dstStride), "r" (src)
);
}
Michael Niedermayer
committed
#endif
Michael Niedermayer
committed
//static int test=0;
static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride,
uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
// to save a register (FIXME do this outside of the loops)
tempBluredPast[127]= maxNoise[0];
tempBluredPast[128]= maxNoise[1];
tempBluredPast[129]= maxNoise[2];
Michael Niedermayer
committed
#define FAST_L2_DIFF
//#define L1_DIFF //you should change the thresholds too if you try that one
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
asm volatile(
"leal (%2, %2, 2), %%eax \n\t" // 3*stride
"leal (%2, %2, 4), %%edx \n\t" // 5*stride
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
Michael Niedermayer
committed
// 0 1 2 3 4 5 6 7 8 9
// %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
Michael Niedermayer
committed
//FIXME reorder?
#ifdef L1_DIFF //needs mmx2
"movq (%0), %%mm0 \n\t" // L0
"psadbw (%1), %%mm0 \n\t" // |L0-R0|
"movq (%0, %2), %%mm1 \n\t" // L1
"psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
"movq (%0, %%eax), %%mm3 \n\t" // L3
"psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3|
"movq (%0, %2, 4), %%mm4 \n\t" // L4
"paddw %%mm1, %%mm0 \n\t"
"psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
Michael Niedermayer
committed
"paddw %%mm2, %%mm0 \n\t"
"psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5|
Michael Niedermayer
committed
"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
"paddw %%mm3, %%mm0 \n\t"
"psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6|
"movq (%0, %%ecx), %%mm7 \n\t" // L7
"paddw %%mm4, %%mm0 \n\t"
"psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7|
"paddw %%mm5, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"paddw %%mm6, %%mm0 \n\t"
#elif defined (FAST_L2_DIFF)
"pcmpeqb %%mm7, %%mm7 \n\t"
Michael Niedermayer
committed
"pxor %%mm0, %%mm0 \n\t"
#define L2_DIFF_CORE(a, b)\
"movq " #a ", %%mm5 \n\t"\
"movq " #b ", %%mm2 \n\t"\
"pxor %%mm7, %%mm2 \n\t"\
PAVGB(%%mm2, %%mm5)\
"paddb %%mm6, %%mm5 \n\t"\
"movq %%mm5, %%mm2 \n\t"\
"psllw $8, %%mm5 \n\t"\
"pmaddwd %%mm5, %%mm5 \n\t"\
"pmaddwd %%mm2, %%mm2 \n\t"\
"paddd %%mm2, %%mm5 \n\t"\
"psrld $14, %%mm5 \n\t"\
"paddd %%mm5, %%mm0 \n\t"
L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
Michael Niedermayer
committed
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
#else
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
#define L2_DIFF_CORE(a, b)\
"movq " #a ", %%mm5 \n\t"\
"movq " #b ", %%mm2 \n\t"\
"movq %%mm5, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psubw %%mm3, %%mm1 \n\t"\
"pmaddwd %%mm5, %%mm5 \n\t"\
"pmaddwd %%mm1, %%mm1 \n\t"\
"paddd %%mm1, %%mm5 \n\t"\
"paddd %%mm5, %%mm0 \n\t"
L2_DIFF_CORE((%0), (%1))
L2_DIFF_CORE((%0, %2), (%1, %2))
L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
Michael Niedermayer
committed
L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
#endif
"movq %%mm0, %%mm4 \n\t"
"psrlq $32, %%mm0 \n\t"
"paddd %%mm0, %%mm4 \n\t"
"movd %%mm4, %%ecx \n\t"
"movl %3, %%edx \n\t"
"addl -4(%%edx), %%ecx \n\t"
"addl 4(%%edx), %%ecx \n\t"
"addl -1024(%%edx), %%ecx \n\t"
Michael Niedermayer
committed
// "movl %%ecx, test \n\t"
// "jmp 4f \n\t"
Michael Niedermayer
committed
" jb 2f \n\t"
Michael Niedermayer
committed
" jb 1f \n\t"
"leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
Michael Niedermayer
committed
"movq (%0), %%mm0 \n\t" // L0
"movq (%0, %2), %%mm1 \n\t" // L1
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"movq (%0, %%eax), %%mm3 \n\t" // L3
"movq (%0, %2, 4), %%mm4 \n\t" // L4
Michael Niedermayer
committed
"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
"movq (%0, %%ecx), %%mm7 \n\t" // L7
"movq %%mm0, (%1) \n\t" // L0
"movq %%mm1, (%1, %2) \n\t" // L1
"movq %%mm2, (%1, %2, 2) \n\t" // L2
"movq %%mm3, (%1, %%eax) \n\t" // L3
"movq %%mm4, (%1, %2, 4) \n\t" // L4
Michael Niedermayer
committed
"movq %%mm6, (%1, %%eax, 2) \n\t" // L6
"movq %%mm7, (%1, %%ecx) \n\t" // L7
"jmp 4f \n\t"
"1: \n\t"
"leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
Michael Niedermayer
committed
"movq (%0), %%mm0 \n\t" // L0
PAVGB((%1), %%mm0) // L0
Michael Niedermayer
committed
"movq (%0, %2), %%mm1 \n\t" // L1
PAVGB((%1, %2), %%mm1) // L1
Michael Niedermayer
committed
"movq (%0, %2, 2), %%mm2 \n\t" // L2
PAVGB((%1, %2, 2), %%mm2) // L2
Michael Niedermayer
committed
"movq (%0, %%eax), %%mm3 \n\t" // L3
PAVGB((%1, %%eax), %%mm3) // L3
Michael Niedermayer
committed
"movq (%0, %2, 4), %%mm4 \n\t" // L4
PAVGB((%1, %2, 4), %%mm4) // L4
"movq (%0, %%edx), %%mm5 \n\t" // L5
PAVGB((%1, %%edx), %%mm5) // L5
Michael Niedermayer
committed
"movq (%0, %%eax, 2), %%mm6 \n\t" // L6
PAVGB((%1, %%eax, 2), %%mm6) // L6
Michael Niedermayer
committed
"movq (%0, %%ecx), %%mm7 \n\t" // L7
PAVGB((%1, %%ecx), %%mm7) // L7
Michael Niedermayer
committed
"movq %%mm0, (%1) \n\t" // R0
"movq %%mm1, (%1, %2) \n\t" // R1
"movq %%mm2, (%1, %2, 2) \n\t" // R2
"movq %%mm3, (%1, %%eax) \n\t" // R3
"movq %%mm4, (%1, %2, 4) \n\t" // R4
Michael Niedermayer
committed
"movq %%mm6, (%1, %%eax, 2) \n\t" // R6
"movq %%mm7, (%1, %%ecx) \n\t" // R7
"movq %%mm0, (%0) \n\t" // L0
"movq %%mm1, (%0, %2) \n\t" // L1
"movq %%mm2, (%0, %2, 2) \n\t" // L2
"movq %%mm3, (%0, %%eax) \n\t" // L3
"movq %%mm4, (%0, %2, 4) \n\t" // L4
Michael Niedermayer
committed
"movq %%mm6, (%0, %%eax, 2) \n\t" // L6
"movq %%mm7, (%0, %%ecx) \n\t" // L7
"jmp 4f \n\t"
"2: \n\t"
Michael Niedermayer
committed
" jb 3f \n\t"
"leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
Michael Niedermayer
committed
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
"movq (%0), %%mm0 \n\t" // L0
"movq (%0, %2), %%mm1 \n\t" // L1
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"movq (%0, %%eax), %%mm3 \n\t" // L3
"movq (%1), %%mm4 \n\t" // R0
"movq (%1, %2), %%mm5 \n\t" // R1
"movq (%1, %2, 2), %%mm6 \n\t" // R2
"movq (%1, %%eax), %%mm7 \n\t" // R3
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1) \n\t" // R0
"movq %%mm1, (%1, %2) \n\t" // R1
"movq %%mm2, (%1, %2, 2) \n\t" // R2
"movq %%mm3, (%1, %%eax) \n\t" // R3
"movq %%mm0, (%0) \n\t" // L0
"movq %%mm1, (%0, %2) \n\t" // L1
"movq %%mm2, (%0, %2, 2) \n\t" // L2
"movq %%mm3, (%0, %%eax) \n\t" // L3
"movq (%0, %2, 4), %%mm0 \n\t" // L4
Michael Niedermayer
committed
"movq (%0, %%eax, 2), %%mm2 \n\t" // L6
"movq (%0, %%ecx), %%mm3 \n\t" // L7
"movq (%1, %2, 4), %%mm4 \n\t" // R4
Michael Niedermayer
committed
"movq (%1, %%eax, 2), %%mm6 \n\t" // R6
"movq (%1, %%ecx), %%mm7 \n\t" // R7
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1, %2, 4) \n\t" // R4
Michael Niedermayer
committed
"movq %%mm2, (%1, %%eax, 2) \n\t" // R6
"movq %%mm3, (%1, %%ecx) \n\t" // R7
"movq %%mm0, (%0, %2, 4) \n\t" // L4
Michael Niedermayer
committed
"movq %%mm2, (%0, %%eax, 2) \n\t" // L6
"movq %%mm3, (%0, %%ecx) \n\t" // L7
"jmp 4f \n\t"
"3: \n\t"
"leal (%%eax, %2, 2), %%edx \n\t" // 5*stride
"leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride
Michael Niedermayer
committed
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
"movq (%0), %%mm0 \n\t" // L0
"movq (%0, %2), %%mm1 \n\t" // L1
"movq (%0, %2, 2), %%mm2 \n\t" // L2
"movq (%0, %%eax), %%mm3 \n\t" // L3
"movq (%1), %%mm4 \n\t" // R0
"movq (%1, %2), %%mm5 \n\t" // R1
"movq (%1, %2, 2), %%mm6 \n\t" // R2
"movq (%1, %%eax), %%mm7 \n\t" // R3
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1) \n\t" // R0
"movq %%mm1, (%1, %2) \n\t" // R1
"movq %%mm2, (%1, %2, 2) \n\t" // R2
"movq %%mm3, (%1, %%eax) \n\t" // R3
"movq %%mm0, (%0) \n\t" // L0
"movq %%mm1, (%0, %2) \n\t" // L1
"movq %%mm2, (%0, %2, 2) \n\t" // L2
"movq %%mm3, (%0, %%eax) \n\t" // L3
"movq (%0, %2, 4), %%mm0 \n\t" // L4
Michael Niedermayer
committed
"movq (%0, %%eax, 2), %%mm2 \n\t" // L6
"movq (%0, %%ecx), %%mm3 \n\t" // L7
"movq (%1, %2, 4), %%mm4 \n\t" // R4
Michael Niedermayer
committed
"movq (%1, %%eax, 2), %%mm6 \n\t" // R6
"movq (%1, %%ecx), %%mm7 \n\t" // R7
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm4, %%mm0)
PAVGB(%%mm5, %%mm1)
PAVGB(%%mm6, %%mm2)
PAVGB(%%mm7, %%mm3)
"movq %%mm0, (%1, %2, 4) \n\t" // R4
Michael Niedermayer
committed
"movq %%mm2, (%1, %%eax, 2) \n\t" // R6
"movq %%mm3, (%1, %%ecx) \n\t" // R7
"movq %%mm0, (%0, %2, 4) \n\t" // L4
Michael Niedermayer
committed
"movq %%mm2, (%0, %%eax, 2) \n\t" // L6
"movq %%mm3, (%0, %%ecx) \n\t" // L7
"4: \n\t"
:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
Michael Niedermayer
committed
);
//printf("%d\n", test);
#else
int y;
int d=0;
int sysd=0;
for(y=0; y<8; y++)
{
int x;
for(x=0; x<8; x++)
{
int ref= tempBlured[ x + y*stride ];
int cur= src[ x + y*stride ];
int d1=ref - cur;
Michael Niedermayer
committed
// if(x==0 || x==7) d1+= d1>>1;
// if(y==0 || y==7) d1+= d1>>1;
// d+= ABS(d1);
d+= d1*d1;
sysd+= d1;
}
}
i=d;
d= (
4*d
+(*(tempBluredPast-256))
+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
+(*(tempBluredPast+256))
+4)>>3;
*tempBluredPast=i;
// ((*tempBluredPast)*3 + d + 2)>>2;
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
/*
Switch between
1 0 0 0 0 0 0 (0)
64 32 16 8 4 2 1 (1)
64 48 36 27 20 15 11 (33) (approx)
64 56 49 43 37 33 29 (200) (approx)
*/
if(d > maxNoise[1])
{
if(d < maxNoise[2])
{
for(y=0; y<8; y++)
{
int x;
for(x=0; x<8; x++)
{
int ref= tempBlured[ x + y*stride ];
int cur= src[ x + y*stride ];
tempBlured[ x + y*stride ]=
src[ x + y*stride ]=
(ref + cur + 1)>>1;
}
}
}
else
{
for(y=0; y<8; y++)
{
int x;
for(x=0; x<8; x++)
{
tempBlured[ x + y*stride ]= src[ x + y*stride ];
}
}
}
}
else
{
if(d < maxNoise[0])
{
for(y=0; y<8; y++)
{
int x;
for(x=0; x<8; x++)
{
int ref= tempBlured[ x + y*stride ];
int cur= src[ x + y*stride ];
tempBlured[ x + y*stride ]=
src[ x + y*stride ]=
(ref*7 + cur + 4)>>3;
}
}
}
else
{
for(y=0; y<8; y++)
{
int x;
for(x=0; x<8; x++)
{
int ref= tempBlured[ x + y*stride ];
int cur= src[ x + y*stride ];
tempBlured[ x + y*stride ]=
src[ x + y*stride ]=
(ref*3 + cur + 2)>>2;
}
}
}
}
Michael Niedermayer
committed
#endif
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
/**
* Copies a block from src to dst and fixes the blacklevel
* levelFix == 0 -> don't touch the brightness & contrast
#undef SCALED_CPY
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
int levelFix, int64_t *packedOffsetAndScale)
Michael Niedermayer
committed
#ifndef HAVE_MMX
Michael Niedermayer
committed
#endif
if(levelFix)
{
#ifdef HAVE_MMX
asm volatile(
"movq (%%eax), %%mm2 \n\t" // packedYOffset
"movq 8(%%eax), %%mm3 \n\t" // packedYScale
"leal (%2,%4), %%eax \n\t"
"leal (%3,%5), %%edx \n\t"
Michael Niedermayer
committed
"pxor %%mm4, %%mm4 \n\t"
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
#ifdef HAVE_MMX2
#define SCALED_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq " #src2 ", %%mm6 \n\t"\
"punpcklbw %%mm0, %%mm0 \n\t"\
"punpckhbw %%mm5, %%mm5 \n\t"\
"punpcklbw %%mm1, %%mm1 \n\t"\
"punpckhbw %%mm6, %%mm6 \n\t"\
"pmulhuw %%mm3, %%mm0 \n\t"\
"pmulhuw %%mm3, %%mm5 \n\t"\
"pmulhuw %%mm3, %%mm1 \n\t"\
"pmulhuw %%mm3, %%mm6 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psubw %%mm2, %%mm1 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
#define SCALED_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
Michael Niedermayer
committed
"punpcklbw %%mm4, %%mm0 \n\t"\
"punpckhbw %%mm4, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psllw $6, %%mm0 \n\t"\
"psllw $6, %%mm5 \n\t"\
Michael Niedermayer
committed
"pmulhw %%mm3, %%mm0 \n\t"\
Michael Niedermayer
committed
"pmulhw %%mm3, %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm1 \n\t"\
"punpckhbw %%mm4, %%mm6 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"psllw $6, %%mm6 \n\t"\
Michael Niedermayer
committed
"pmulhw %%mm3, %%mm1 \n\t"\
"pmulhw %%mm3, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
"leal (%%eax,%4,4), %%eax \n\t"
"leal (%%edx,%5,4), %%edx \n\t"
SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
: "=&a" (packedOffsetAndScale)
: "0" (packedOffsetAndScale),
"r"(src),
"r" (dstStride)
);
#else
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
else
{
#ifdef HAVE_MMX
asm volatile(
#define SIMPLE_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
"leal (%%edx,%3,4), %%edx \n\t"
SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
: : "r" (src),
"r" (dst),
"r" (srcStride),
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
/**
 * Duplicates the given 8 src pixels 3 times upward: fills the three rows
 * above src (src[-stride], src[-2*stride], src[-3*stride]) with a copy of
 * the 8 bytes at src. Used to pad above the first row of a block so that
 * vertical filters can read "previous" lines safely.
 *
 * @param src    pointer to the 8 source pixels (rows above it are written)
 * @param stride distance in bytes between vertically adjacent lines
 */
static inline void RENAME(duplicate)(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
asm volatile(
"movq (%0), %%mm0 \n\t" // mm0 = the 8 source pixels
"addl %1, %0 \n\t" // src -= stride (%1 holds -stride), now one row up
"movq %%mm0, (%0) \n\t" // row src-stride
"movq %%mm0, (%0, %1) \n\t" // row src-2*stride (%1 = -stride)
"movq %%mm0, (%0, %1, 2) \n\t" // row src-3*stride
: "+r" (src)
: "r" (-stride)
// NOTE(review): stores through a pointer with no "memory" clobber, and
// mm0 is modified without being listed as clobbered — legacy pattern,
// relies on surrounding MMX-only code and a later emms; confirm before
// reusing elsewhere.
);
#else
int i;
uint8_t *p=src;
// Portable fallback: step upward one line at a time, copying 8 bytes.
for(i=0; i<3; i++)
{
p-= stride;
memcpy(p, src, 8);
}
#endif
}
/**
* Filters array of bytes (Y or U or V values)
*/
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
int x,y;
Michael Niedermayer
committed
#ifdef COMPILE_TIME_MODE
const int mode= COMPILE_TIME_MODE;
#else
const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
Michael Niedermayer
committed
#endif
int black=0, white=255; // blackest black and whitest white in the picture
//FIXME remove
uint64_t * const yHistogram= c.yHistogram;
uint8_t * const tempSrc= c.tempSrc;
uint8_t * const tempDst= c.tempDst;
c.dcOffset= c.ppMode.maxDcDiff;
c.dcThreshold= c.ppMode.maxDcDiff*2 + 1;
c.mmxDcOffset= 0x7F - c.dcOffset;
c.mmxDcThreshold= 0x7F - c.dcThreshold;
c.mmxDcOffset*= 0x0101010101010101LL;
c.mmxDcThreshold*= 0x0101010101010101LL;
else if( (mode & LINEAR_BLEND_DEINT_FILTER)
|| (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
else if( (mode & V_DEBLOCK)
|| (mode & LINEAR_IPOL_DEINT_FILTER)
|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
else if(mode & V_X1_FILTER) copyAhead=11;
// else if(mode & V_RK1_FILTER) copyAhead=10;
else if(mode & DERING) copyAhead=9;
else copyAhead=8;
copyAhead-= 8;
if(!isColor)
{
uint64_t sum= 0;
int i;
uint64_t maxClipped;
uint64_t clipped;
double scale;
c.frameNum++;
// first frame is fscked so we ignore it
if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
for(i=0; i<256; i++)
{
sum+= yHistogram[i];
// printf("%d ", yHistogram[i]);
}
// printf("\n\n");
/* we always get a completely black picture first */
maxClipped= (uint64_t)(sum * maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[black];
}
clipped= sum;
for(white=0; white<256; white++)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[white];
}
scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
c.packedYOffset|= c.packedYOffset<<32;
c.packedYOffset|= c.packedYOffset<<16;
c.packedYScale|= c.packedYScale<<32;
c.packedYScale|= c.packedYScale<<16;
if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
else QPCorrecture= 256*256;
}
else
{
c.packedYScale= 0x0100010001000100LL;
c.packedYOffset= 0;
/* copy & deinterlace first row of blocks */
y=-BLOCK_SIZE;
{
uint8_t *srcBlock= &(src[y*srcStride]);
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next, otherwise we'll might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE)
{
#ifdef HAVE_MMX2
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm(
"movl %4, %%eax \n\t"
"shrl $2, %%eax \n\t"
"andl $6, %%eax \n\t"
:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
#endif
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
RENAME(deInterlaceMedian)(dstBlock, dstStride);
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride );
{
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
uint8_t *tempBlock1= c.tempBlocks;
uint8_t *tempBlock2= c.tempBlocks + 8;
#ifdef ARCH_X86
int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
int QPDelta= isColor ? (-1) : 1<<31;
int QPFrac= 1<<30;
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
if not than use a temporary buffer */
/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
srcStride*MAX(height-y-copyAhead, 0) );
/* duplicate last line of src to fill the void upto line (copyAhead+7) */
for(i=MAX(height-y, 8); i<copyAhead+8; i++)
memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
/* duplicate last line of dst to fill the void upto line (copyAhead) */
for(i=height-y+1; i<=copyAhead; i++)
memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
// From this point on it is guranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise well might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(x=0; x<width; x+=BLOCK_SIZE)
const int stride= dstStride;
uint8_t *tmpXchg;
asm volatile(
"addl %2, %1 \n\t"
"sbbl %%eax, %%eax \n\t"
"shll $2, %%eax \n\t"
"subl %%eax, %0 \n\t"
: "+r" (QPptr), "+m" (QPFrac)
: "r" (QPDelta)
: "%eax"
);
#else
QPs[(y>>3)*QPStride + (x>>3)]:
QPs[(y>>4)*QPStride + (x>>4)];
#endif
if(!isColor)
yHistogram[ srcBlock[srcStride*12 + 4] ]++;
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, %0 \n\t"
: "=m" (c.pQPb)
: "r" (QP)
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm(
"movl %4, %%eax \n\t"
"shrl $2, %%eax \n\t"
"andl $6, %%eax \n\t"
:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
if(mode & LINEAR_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
RENAME(deInterlaceMedian)(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
Michael Niedermayer
committed
*/
/* only deblock if we have 2 blocks */
if(y + 8 < height)
{
if(mode & V_X1_FILTER)
RENAME(vertX1Filter)(dstBlock, stride, &c);
Michael Niedermayer
committed
else if(mode & V_DEBLOCK)
if( RENAME(isVertDC)(dstBlock, stride, &c))
if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
RENAME(doVertLowPass)(dstBlock, stride, &c);
Michael Niedermayer
committed
else
RENAME(doVertDefFilter)(dstBlock, stride, &c);
#ifdef HAVE_MMX
RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
/* check if we have a previous block to deblock it with dstBlock */
#ifdef HAVE_MMX
if(mode & H_X1_FILTER)
RENAME(vertX1Filter)(tempBlock1, 16, &c);
else if(mode & H_DEBLOCK)
{
if( RENAME(isVertDC)(tempBlock1, 16, &c))
{
if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
RENAME(doVertLowPass)(tempBlock1, 16, &c);
}
else
RENAME(doVertDefFilter)(tempBlock1, 16, &c);
}
RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
#else
Michael Niedermayer
committed
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else if(mode & H_DEBLOCK)
if(isHorizMinMaxOk(dstBlock-4, stride, QP))
doHorizLowPass(dstBlock-4, stride, QP);
Michael Niedermayer
committed
else
doHorizDefFilter(dstBlock-4, stride, QP);
if(mode & DERING)
{
//FIXME filter first line
if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);