Newer
Older
"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
"lea (%%"REG_a", %1), %0 \n\t"
"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
"movq (%0, %1, 2), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L5
"punpckhbw %%mm7, %%mm3 \n\t" // H5
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t" // L6
"psubw %%mm6, %%mm2 \n\t" // L5 - L6
"movq (%%"REG_a", %1, 4), %%mm6 \n\t"
"punpckhbw %%mm7, %%mm6 \n\t" // H6
"psubw %%mm6, %%mm3 \n\t" // H5 - H6
"paddw %%mm0, %%mm0 \n\t" // 2L4
"paddw %%mm1, %%mm1 \n\t" // 2H4
"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
"movq (%0, %1, 4), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L7
"punpckhbw %%mm7, %%mm3 \n\t" // H7
"paddw %%mm2, %%mm2 \n\t" // 2L7
"paddw %%mm3, %%mm3 \n\t" // 2H7
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
"movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
#ifdef HAVE_MMX2
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm0, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm1, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm2, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm3, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#else
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm0, %%mm6 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm1, %%mm6 \n\t"
"pxor %%mm6, %%mm1 \n\t"
"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm2, %%mm6 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm3, %%mm6 \n\t"
"pxor %%mm6, %%mm3 \n\t"
"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#endif
#ifdef HAVE_MMX2
"pminsw %%mm2, %%mm0 \n\t"
"pminsw %%mm3, %%mm1 \n\t"
#else
"movq %%mm0, %%mm6 \n\t"
"psubusw %%mm2, %%mm6 \n\t"
"psubw %%mm6, %%mm0 \n\t"
"movq %%mm1, %%mm6 \n\t"
"psubusw %%mm3, %%mm6 \n\t"
"psubw %%mm6, %%mm1 \n\t"
#endif
"movd %2, %%mm2 \n\t" // QP
"punpcklbw %%mm7, %%mm2 \n\t"
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
"pxor %%mm6, %%mm4 \n\t"
"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
"pxor %%mm7, %%mm5 \n\t"
"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
"psllw $3, %%mm2 \n\t" // 8QP
"movq %%mm2, %%mm3 \n\t" // 8QP
"pcmpgtw %%mm4, %%mm2 \n\t"
"pcmpgtw %%mm5, %%mm3 \n\t"
"pand %%mm2, %%mm4 \n\t"
"pand %%mm3, %%mm5 \n\t"
"psubusw %%mm0, %%mm4 \n\t" // hd
"psubusw %%mm1, %%mm5 \n\t" // ld
"movq "MANGLE(w05)", %%mm2 \n\t" // 5
"pmullw %%mm2, %%mm4 \n\t"
"pmullw %%mm2, %%mm5 \n\t"
"movq "MANGLE(w20)", %%mm2 \n\t" // 32
"paddw %%mm2, %%mm4 \n\t"
"paddw %%mm2, %%mm5 \n\t"
"psrlw $6, %%mm4 \n\t"
"psrlw $6, %%mm5 \n\t"
"movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
"movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
"pxor %%mm6, %%mm2 \n\t"
"pxor %%mm7, %%mm3 \n\t"
"pand %%mm2, %%mm4 \n\t"
"pand %%mm3, %%mm5 \n\t"
#ifdef HAVE_MMX2
"pminsw %%mm0, %%mm4 \n\t"
"pminsw %%mm1, %%mm5 \n\t"
#else
"movq %%mm4, %%mm2 \n\t"
"psubusw %%mm0, %%mm2 \n\t"
"psubw %%mm2, %%mm4 \n\t"
"movq %%mm5, %%mm2 \n\t"
"psubusw %%mm1, %%mm2 \n\t"
"psubw %%mm2, %%mm5 \n\t"
#endif
"pxor %%mm6, %%mm4 \n\t"
"pxor %%mm7, %%mm5 \n\t"
"psubw %%mm6, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"packsswb %%mm5, %%mm4 \n\t"
"movq %3, %%mm1 \n\t"
"pandn %%mm4, %%mm1 \n\t"
"movq (%0), %%mm0 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq (%0, %1), %%mm0 \n\t"
"psubb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%0, %1) \n\t"
: "+r" (temp_src)
: "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
: "%"REG_a, "%"REG_c
);
}
/*if(step==16){
STOP_TIMER("step16")
}else{
STOP_TIMER("stepX")
}*/
}
#endif //HAVE_MMX
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
/**
* Copies a block from src to dst and fixes the blacklevel
* levelFix == 0 -> dont touch the brighness & contrast
#undef SCALED_CPY
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
int levelFix, int64_t *packedOffsetAndScale)
Michael Niedermayer
committed
#ifndef HAVE_MMX
Michael Niedermayer
committed
#endif
asm volatile(
"movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
"movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
"lea (%2,%4), %%"REG_a" \n\t"
"lea (%3,%5), %%"REG_d" \n\t"
"pxor %%mm4, %%mm4 \n\t"
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq " #src2 ", %%mm6 \n\t"\
"punpcklbw %%mm0, %%mm0 \n\t"\
"punpckhbw %%mm5, %%mm5 \n\t"\
"punpcklbw %%mm1, %%mm1 \n\t"\
"punpckhbw %%mm6, %%mm6 \n\t"\
"pmulhuw %%mm3, %%mm0 \n\t"\
"pmulhuw %%mm3, %%mm5 \n\t"\
"pmulhuw %%mm3, %%mm1 \n\t"\
"pmulhuw %%mm3, %%mm6 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psubw %%mm2, %%mm1 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm0 \n\t"\
"punpckhbw %%mm4, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"psllw $6, %%mm0 \n\t"\
"psllw $6, %%mm5 \n\t"\
"pmulhw %%mm3, %%mm0 \n\t"\
"movq " #src2 ", %%mm6 \n\t"\
"pmulhw %%mm3, %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm1 \n\t"\
"punpckhbw %%mm4, %%mm6 \n\t"\
"psubw %%mm2, %%mm1 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"psllw $6, %%mm1 \n\t"\
"psllw $6, %%mm6 \n\t"\
"pmulhw %%mm3, %%mm1 \n\t"\
"pmulhw %%mm3, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
#endif //HAVE_MMX2
Aurelien Jacobs
committed
#define SCALED_CPY(src1, src2, dst1, dst2)\
REAL_SCALED_CPY(src1, src2, dst1, dst2)
SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
Aurelien Jacobs
committed
SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
"lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
"lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
Aurelien Jacobs
committed
SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
: "=&a" (packedOffsetAndScale)
: "0" (packedOffsetAndScale),
"r"(src),
"r"(dst),
"r" ((long)srcStride),
"r" ((long)dstStride)
: "%"REG_d
);
#else //HAVE_MMX
for(i=0; i<8; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif //HAVE_MMX
#ifdef HAVE_MMX
asm volatile(
"lea (%0,%2), %%"REG_a" \n\t"
"lea (%1,%3), %%"REG_d" \n\t"
#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
Aurelien Jacobs
committed
#define SIMPLE_CPY(src1, src2, dst1, dst2)\
REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
Aurelien Jacobs
committed
SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
"lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
"lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
Aurelien Jacobs
committed
SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
: : "r" (src),
"r" (dst),
"r" ((long)srcStride),
"r" ((long)dstStride)
: "%"REG_a, "%"REG_d
);
#else //HAVE_MMX
for(i=0; i<8; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif //HAVE_MMX
/**
* Duplicates the given 8 src pixels ? times upward
*/
static inline void RENAME(duplicate)(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
asm volatile(
"movq (%0), %%mm0 \n\t"
"add %1, %0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %1) \n\t"
"movq %%mm0, (%0, %1, 2) \n\t"
: "+r" (src)
: "r" ((long)-stride)
);
int i;
uint8_t *p=src;
for(i=0; i<3; i++)
{
p-= stride;
memcpy(p, src, 8);
}
/**
* Filters array of bytes (Y or U or V values)
*/
static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
Michael Niedermayer
committed
#ifdef COMPILE_TIME_MODE
const int mode= COMPILE_TIME_MODE;
Michael Niedermayer
committed
#else
const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
Michael Niedermayer
committed
#endif
int black=0, white=255; // blackest black and whitest white in the picture
int QPCorrecture= 256*256;
Dominik Mierzejewski
committed
#ifdef HAVE_MMX
Dominik Mierzejewski
committed
#endif
const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
//FIXME remove
uint64_t * const yHistogram= c.yHistogram;
uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
//const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
for(i=0; i<57; i++){
int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
int threshold= offset*2 + 1;
c.mmxDcOffset[i]= 0x7F - offset;
c.mmxDcThreshold[i]= 0x7F - threshold;
c.mmxDcOffset[i]*= 0x0101010101010101LL;
c.mmxDcThreshold[i]*= 0x0101010101010101LL;
}
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
else if( (mode & LINEAR_BLEND_DEINT_FILTER)
|| (mode & FFMPEG_DEINT_FILTER)
|| (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
else if( (mode & V_DEBLOCK)
|| (mode & LINEAR_IPOL_DEINT_FILTER)
|| (mode & MEDIAN_DEINT_FILTER)
|| (mode & V_A_DEBLOCK)) copyAhead=13;
else if(mode & V_X1_FILTER) copyAhead=11;
// else if(mode & V_RK1_FILTER) copyAhead=10;
else if(mode & DERING) copyAhead=9;
else copyAhead=8;
copyAhead-= 8;
if(!isColor)
{
uint64_t sum= 0;
int i;
uint64_t maxClipped;
uint64_t clipped;
double scale;
c.frameNum++;
// first frame is fscked so we ignore it
if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
for(i=0; i<256; i++)
{
sum+= yHistogram[i];
}
maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[black];
}
clipped= sum;
for(white=0; white<256; white++)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[white];
}
scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
c.packedYOffset|= c.packedYOffset<<32;
c.packedYOffset|= c.packedYOffset<<16;
c.packedYScale|= c.packedYScale<<32;
c.packedYScale|= c.packedYScale<<16;
if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
else QPCorrecture= 256*256;
}
else
{
c.packedYScale= 0x0100010001000100LL;
c.packedYOffset= 0;
QPCorrecture= 256*256;
}
/* copy & deinterlace first row of blocks */
y=-BLOCK_SIZE;
{
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= tempDst + dstStride;
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(x=0; x<width; x+=BLOCK_SIZE)
{
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
asm(
"mov %4, %%"REG_a" \n\t"
"shr $2, %%"REG_a" \n\t"
"and $6, %%"REG_a" \n\t"
"add %5, %%"REG_a" \n\t"
"mov %%"REG_a", %%"REG_d" \n\t"
"imul %1, %%"REG_a" \n\t"
"imul %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
"add %1, %%"REG_a" \n\t"
"add %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
:: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
"g" ((long)x), "g" ((long)copyAhead)
: "%"REG_a, "%"REG_d
);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
if(mode & LINEAR_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & MEDIAN_DEINT_FILTER)
RENAME(deInterlaceMedian)(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & LOWPASS5_DEINT_FILTER)
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
dstBlock+=8;
srcBlock+=8;
}
linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
else
{
int i;
for(i=0; i<copyAhead; i++)
{
memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
}
}
}
for(y=0; y<height; y+=BLOCK_SIZE)
{
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
uint8_t *tempBlock1= c.tempBlocks;
uint8_t *tempBlock2= c.tempBlocks + 8;
int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
int QP=0;
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
if not than use a temporary buffer */
if(y+15 >= height)
{
int i;
/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
blockcopy to dst later */
linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
FFMAX(height-y-copyAhead, 0), srcStride);
/* duplicate last line of src to fill the void upto line (copyAhead+7) */
for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
/* duplicate last line of dst to fill the void upto line (copyAhead) */
for(i=height-y+1; i<=copyAhead; i++)
memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
dstBlock= tempDst + dstStride;
srcBlock= tempSrc;
}
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(x=0; x<width; x+=BLOCK_SIZE)
{
const int stride= dstStride;
if(isColor)
{
QP= QPptr[x>>qpHShift];
c.nonBQP= nonBQPptr[x>>qpHShift];
}
else
{
QP= QPptr[x>>4];
QP= (QP* QPCorrecture + 256*128)>>16;
c.nonBQP= nonBQPptr[x>>4];
c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
yHistogram[ srcBlock[srcStride*12 + 4] ]++;
}
c.QP= QP;
asm volatile(
"movd %1, %%mm7 \n\t"
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, %0 \n\t"
: "=m" (c.pQPb)
: "r" (QP)
);
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
asm(
"mov %4, %%"REG_a" \n\t"
"shr $2, %%"REG_a" \n\t"
"and $6, %%"REG_a" \n\t"
"add %5, %%"REG_a" \n\t"
"mov %%"REG_a", %%"REG_d" \n\t"
"imul %1, %%"REG_a" \n\t"
"imul %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
"add %1, %%"REG_a" \n\t"
"add %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
:: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
"g" ((long)x), "g" ((long)copyAhead)
: "%"REG_a, "%"REG_d
);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
if(mode & LINEAR_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & MEDIAN_DEINT_FILTER)
RENAME(deInterlaceMedian)(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & LOWPASS5_DEINT_FILTER)
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
Michael Niedermayer
committed
*/
/* only deblock if we have 2 blocks */
if(y + 8 < height)
{
if(mode & V_X1_FILTER)
RENAME(vertX1Filter)(dstBlock, stride, &c);
else if(mode & V_DEBLOCK)
{
const int t= RENAME(vertClassify)(dstBlock, stride, &c);
if(t==1)
RENAME(doVertLowPass)(dstBlock, stride, &c);
else if(t==2)
RENAME(doVertDefFilter)(dstBlock, stride, &c);
}else if(mode & V_A_DEBLOCK){
RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
}
}
#ifdef HAVE_MMX
RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
/* check if we have a previous block to deblock it with dstBlock */
if(x - 8 >= 0)
{
#ifdef HAVE_MMX
if(mode & H_X1_FILTER)
RENAME(vertX1Filter)(tempBlock1, 16, &c);
else if(mode & H_DEBLOCK)
{
//START_TIMER
const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
//STOP_TIMER("dc & minmax")
if(t==1)
RENAME(doVertLowPass)(tempBlock1, 16, &c);
else if(t==2)
RENAME(doVertDefFilter)(tempBlock1, 16, &c);
}else if(mode & H_A_DEBLOCK){
RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
}
RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
#else
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else if(mode & H_DEBLOCK)
{
DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
const int t=vertClassify_altivec(tempBlock-48, 16, &c);
if(t==1) {
doVertLowPass_altivec(tempBlock-48, 16, &c);
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
}
else if(t==2) {
doVertDefFilter_altivec(tempBlock-48, 16, &c);
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
}
#else
const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
Michael Niedermayer
committed
if(t==1)
RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
else if(t==2)
RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
}else if(mode & H_A_DEBLOCK){
RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
}
#endif //HAVE_MMX
if(mode & DERING)
{
//FIXME filter first line
if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
}
if(mode & TEMP_NOISE_FILTER)
{
RENAME(tempNoiseReducer)(dstBlock-8, stride,
c.tempBlured[isColor] + y*dstStride + x,
c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
c.ppMode.maxTmpNoise);
}
}
dstBlock+=8;
srcBlock+=8;
Michael Niedermayer
committed
#ifdef HAVE_MMX
tmpXchg= tempBlock1;
tempBlock1= tempBlock2;
tempBlock2 = tmpXchg;
Michael Niedermayer
committed
#endif
}
if(mode & DERING)
{
if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
}
if((mode & TEMP_NOISE_FILTER))
{
RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
c.tempBlured[isColor] + y*dstStride + x,
c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
c.ppMode.maxTmpNoise);
}
/* did we use a tmp buffer for the last lines*/
if(y+15 >= height)
{
uint8_t *dstBlock= &(dst[y*dstStride]);
linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
else
{
int i;
for(i=0; i<height-y; i++)
{
memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
}
}
}
for(x=0; x<width; x+=32)
{
volatile int i;
i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
// + dstBlock[x +13*dstStride]
// + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
}*/
}
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
if(!isColor)
{
int max=1;
int i;
for(i=0; i<256; i++)
if(yHistogram[i] > max) max=yHistogram[i];
for(i=1; i<256; i++)
{
int x;
int start=yHistogram[i-1]/(max/256+1);
int end=yHistogram[i]/(max/256+1);
int inc= end > start ? 1 : -1;
for(x=start; x!=end+inc; x+=inc)
dst[ i*dstStride + x]+=128;
}
for(i=0; i<100; i+=2)
{
dst[ (white)*dstStride + i]+=128;
dst[ (black)*dstStride + i]+=128;
}
}
*c2= c; //copy local context back