Newer
Older
// 100 opcodes
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
"psllw $3, %%mm2 \n\t" // 8QP
"movq %%mm2, %%mm3 \n\t" // 8QP
"pcmpgtw %%mm4, %%mm2 \n\t"
"pcmpgtw %%mm5, %%mm3 \n\t"
"pand %%mm2, %%mm4 \n\t"
"pand %%mm3, %%mm5 \n\t"
"psubusw %%mm0, %%mm4 \n\t" // hd
"psubusw %%mm1, %%mm5 \n\t" // ld
"movq "MANGLE(w05)", %%mm2 \n\t" // 5
"pmullw %%mm2, %%mm4 \n\t"
"pmullw %%mm2, %%mm5 \n\t"
"movq "MANGLE(w20)", %%mm2 \n\t" // 32
"paddw %%mm2, %%mm4 \n\t"
"paddw %%mm2, %%mm5 \n\t"
"psrlw $6, %%mm4 \n\t"
"psrlw $6, %%mm5 \n\t"
"movq 16(%4), %%mm0 \n\t" // L3 - L4
"movq 24(%4), %%mm1 \n\t" // H3 - H4
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
"pxor %%mm6, %%mm2 \n\t"
"pxor %%mm7, %%mm3 \n\t"
"pand %%mm2, %%mm4 \n\t"
"pand %%mm3, %%mm5 \n\t"
"pminsw %%mm0, %%mm4 \n\t"
"pminsw %%mm1, %%mm5 \n\t"
#else
"movq %%mm4, %%mm2 \n\t"
"psubusw %%mm0, %%mm2 \n\t"
"psubw %%mm2, %%mm4 \n\t"
"movq %%mm5, %%mm2 \n\t"
"psubusw %%mm1, %%mm2 \n\t"
"psubw %%mm2, %%mm5 \n\t"
#endif
"pxor %%mm6, %%mm4 \n\t"
"pxor %%mm7, %%mm5 \n\t"
"psubw %%mm6, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"packsswb %%mm5, %%mm4 \n\t"
"movq %3, %%mm1 \n\t"
"pandn %%mm4, %%mm1 \n\t"
"movq (%0), %%mm0 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq (%0, %1), %%mm0 \n\t"
"psubb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%0, %1) \n\t"
: "+r" (temp_src)
: "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
NAMED_CONSTRAINTS_ADD(w05,w20)
: "%"FF_REG_a
);
}
/*if(step==16){
STOP_TIMER("step16")
}else{
STOP_TIMER("stepX")
/* Forward declaration; the definition appears further down in this template. */
static void RENAME(postProcess)(const uint8_t src[], int srcStride,
                                uint8_t dst[], int dstStride,
                                int width, int height,
                                const int8_t QPs[], int QPStride,
                                int isColor, PPContext *c);
/**
 * Copy a block from src to dst and fix the black level.
 * levelFix == 0 -> do not touch the brightness & contrast
 */
#undef REAL_SCALED_CPY
#undef SCALED_CPY
/*
 * blockCopy: copy 8 lines from src to dst.
 *
 * dst, dstStride            destination block and its line stride
 * src, srcStride            source block and its line stride
 * levelFix                  non-zero selects the rescaling copy, zero the plain copy
 * packedOffsetAndScale      points at two 64-bit words: the packed offset at byte 0
 *                           and the packed scale at byte 8 (loaded into mm2 / mm3)
 *
 * With TEMPLATE_PP_MMX && HAVE_6REGS both variants are inline assembly that
 * moves two rows per SCALED_CPY / SIMPLE_CPY invocation (4 invocations = 8 rows).
 * Otherwise both variants are the same memcpy loop of BLOCK_SIZE bytes per line,
 * i.e. the non-SIMD path applies no level adjustment even when levelFix is set.
 *
 * NOTE(review): several lines of this function (opening brace, the statements
 * that open the inline-asm sequences, some preprocessor conditionals) are not
 * visible in this view.
 */
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
int levelFix, int64_t *packedOffsetAndScale)
// loop counter, only needed by the memcpy fallback paths below
#if !TEMPLATE_PP_MMX || !HAVE_6REGS
int i;
Michael Niedermayer
committed
#endif
// levelFix != 0: rescaling copy
if(levelFix){
#if TEMPLATE_PP_MMX && HAVE_6REGS
// The "a" register initially holds packedOffsetAndScale (see the "=&a"/"0"
// constraints below); once mm2/mm3 are loaded it is reused as src + srcStride,
// and the "d" register becomes dst + dstStride. mm4 is cleared to zero.
"movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
"movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
"lea (%2,%4), %%"FF_REG_a" \n\t"
"lea (%3,%5), %%"FF_REG_d" \n\t"
"pxor %%mm4, %%mm4 \n\t"
// NOTE(review): REAL_SCALED_CPY is defined twice back to back below; presumably
// the two variants were selected by a preprocessor conditional not visible here.
// Variant 1: unpack each byte with itself, pmulhuw by mm3, subtract mm2, pack
// with unsigned saturation.
// Variant 2: unpack each byte with zero (mm4), subtract mm2, shift left by 6,
// pmulhw by mm3, pack with unsigned saturation.
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq " #src2 ", %%mm6 \n\t"\
"punpcklbw %%mm0, %%mm0 \n\t"\
"punpckhbw %%mm5, %%mm5 \n\t"\
"punpcklbw %%mm1, %%mm1 \n\t"\
"punpckhbw %%mm6, %%mm6 \n\t"\
"pmulhuw %%mm3, %%mm0 \n\t"\
"pmulhuw %%mm3, %%mm5 \n\t"\
"pmulhuw %%mm3, %%mm1 \n\t"\
"pmulhuw %%mm3, %%mm6 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psubw %%mm2, %%mm1 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm0 \n\t"\
"punpckhbw %%mm4, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"psllw $6, %%mm0 \n\t"\
"psllw $6, %%mm5 \n\t"\
"pmulhw %%mm3, %%mm0 \n\t"\
"movq " #src2 ", %%mm6 \n\t"\
"pmulhw %%mm3, %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm1 \n\t"\
"punpckhbw %%mm4, %%mm6 \n\t"\
"psubw %%mm2, %%mm1 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"psllw $6, %%mm1 \n\t"\
"psllw $6, %%mm6 \n\t"\
"pmulhw %%mm3, %%mm1 \n\t"\
"pmulhw %%mm3, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
Aurelien Jacobs
committed
#define SCALED_CPY(src1, src2, dst1, dst2)\
REAL_SCALED_CPY(src1, src2, dst1, dst2)
// Rows 0-1, 2-3, 4-5; then both row pointers advance by 4 strides and the last
// invocation handles the remaining two rows.
SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
"lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
"lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
: "=&a" (packedOffsetAndScale)
: "0" (packedOffsetAndScale),
"r"(src),
"r"(dst),
"r" ((x86_reg)srcStride),
"r" ((x86_reg)dstStride)
: "%"FF_REG_d
);
#else //TEMPLATE_PP_MMX && HAVE_6REGS
// fallback: plain copy of 8 lines, no level adjustment
for(i=0; i<8; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif //TEMPLATE_PP_MMX && HAVE_6REGS
}else{
#if TEMPLATE_PP_MMX && HAVE_6REGS
// levelFix == 0: straight 8-byte moves, two rows per SIMPLE_CPY; the "a"/"d"
// registers hold src + srcStride and dst + dstStride
"lea (%0,%2), %%"FF_REG_a" \n\t"
"lea (%1,%3), %%"FF_REG_d" \n\t"
#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
Aurelien Jacobs
committed
#define SIMPLE_CPY(src1, src2, dst1, dst2)\
REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
"lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
"lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
: : "r" (src),
"r" (dst),
"r" ((x86_reg)srcStride),
"r" ((x86_reg)dstStride)
: "%"FF_REG_a, "%"FF_REG_d
);
#else //TEMPLATE_PP_MMX && HAVE_6REGS
// fallback: plain copy of 8 lines
for(i=0; i<8; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif //TEMPLATE_PP_MMX && HAVE_6REGS
/**
 * Duplicate the given 8 src pixels 5 times upward
 */
/*
 * duplicate: replicate the 8 bytes at src into the 5 rows directly above it
 * (src - stride down to src - 5*stride).
 *
 * NOTE(review): the lines that open the inline-asm statement and that separate
 * the SIMD variant from the plain C variant are not visible in this view.
 */
static inline void RENAME(duplicate)(uint8_t src[], int stride)
{
// SIMD variant: operand %1 is -stride, so positive multiples of %1 address rows
// above src. mm0 holds the 8 source bytes.
"movq (%0), %%mm0 \n\t"
Michael Niedermayer
committed
// row -4 (written before the pointer is moved)
"movq %%mm0, (%0, %1, 4) \n\t"
// %0 now points one row above src
"add %1, %0 \n\t"
// rows -1, -2, -3 and -5 relative to the original src
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %1) \n\t"
"movq %%mm0, (%0, %1, 2) \n\t"
Michael Niedermayer
committed
"movq %%mm0, (%0, %1, 4) \n\t"
: "+r" (src)
: "r" ((x86_reg)-stride)
);
// C variant: walk upward one row at a time, copying the same 8 bytes 5 times
int i;
uint8_t *p=src;
Michael Niedermayer
committed
for(i=0; i<5; i++){
p-= stride;
memcpy(p, src, 8);
}
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
#if ARCH_X86 && TEMPLATE_PP_MMXEXT
/* Emit the x86 `prefetchnta` instruction for the address p. */
static inline void RENAME(prefetchnta)(const void *p)
{
    __asm__ volatile(
        "prefetchnta (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
/* Emit the x86 `prefetcht0` instruction for the address p. */
static inline void RENAME(prefetcht0)(const void *p)
{
    __asm__ volatile(
        "prefetcht0 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
/* Emit the x86 `prefetcht1` instruction for the address p. */
static inline void RENAME(prefetcht1)(const void *p)
{
    __asm__ volatile(
        "prefetcht1 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
/* Emit the x86 `prefetcht2` instruction for the address p. */
static inline void RENAME(prefetcht2)(const void *p)
{
    __asm__ volatile(
        "prefetcht2 (%0)\n\t"
        : /* no outputs */
        : "r" (p)
    );
}
#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
/* Prefetch p for reading via the compiler builtin, with locality hint 0. */
static inline void RENAME(prefetchnta)(const void *p)
{
    enum { ACCESS_READ = 0, LOCALITY_HINT = 0 };

    __builtin_prefetch(p, ACCESS_READ, LOCALITY_HINT);
}
/*
 * Prefetch p for reading into all cache levels (the "T0" hint).
 *
 * The third argument of __builtin_prefetch is the temporal-locality level,
 * where 3 is the highest ("keep in all levels of cache") and 0 the lowest.
 * T0 is the strongest hint, so it maps to 3; the previous value of 1 asked
 * for low locality instead.
 */
static inline void RENAME(prefetcht0)(const void *p)
{
    __builtin_prefetch(p, 0, 3);
}
/* Prefetch p for reading via the compiler builtin, with locality hint 2. */
static inline void RENAME(prefetcht1)(const void *p)
{
    enum { ACCESS_READ = 0, LOCALITY_HINT = 2 };

    __builtin_prefetch(p, ACCESS_READ, LOCALITY_HINT);
}
/*
 * Prefetch p for reading with the weakest temporal hint (the "T2" hint).
 *
 * The third argument of __builtin_prefetch is the temporal-locality level,
 * where 3 is the highest and 0 means no temporal locality. T2 is the lowest
 * of the temporal hints, so it maps to 1; the previous value of 3 requested
 * the strongest hint, which belongs to prefetcht0.
 */
static inline void RENAME(prefetcht2)(const void *p)
{
    __builtin_prefetch(p, 0, 1);
}
#else
static inline void RENAME(prefetchnta)(const void *p)
{
return;
}
static inline void RENAME(prefetcht0)(const void *p)
{
return;
}
static inline void RENAME(prefetcht1)(const void *p)
{
return;
}
static inline void RENAME(prefetcht2)(const void *p)
{
return;
}
#endif
/**
 * Filter array of bytes (Y or U or V values)
 */
/*
 * postProcess: run the configured post-processing filters over one plane.
 *
 * src, srcStride    input plane and its line stride
 * dst, dstStride    output plane and its line stride
 * width, height     plane dimensions in pixels
 * QPs, QPStride     quantiser table and its stride; indexed with pixel
 *                   coordinates shifted right by qpHShift / qpVShift
 * isColor           non-zero for a chroma plane; selects chromMode vs lumMode
 *                   and skips the luma-only histogram / level computation
 * c2                context; copied to a local `c` on entry and written back
 *                   through *c2 at the end
 *
 * Each row of blocks is handled in chunks of up to 32 pixels, and every chunk
 * goes through three passes: copy + deinterlace, vertical deblocking, then
 * horizontal deblocking / dering / temporal noise reduction.
 *
 * NOTE(review): a number of lines of this function (opening brace, some
 * preprocessor conditionals, several closing braces) are not visible in this view.
 */
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
const int8_t QPs[], int QPStride, int isColor, PPContext *c2)
DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
int x,y;
// filter mode: fixed at compile time when TEMPLATE_PP_TIME_MODE is defined,
// otherwise taken from the chroma or luma mode of the context
#ifdef TEMPLATE_PP_TIME_MODE
const int mode= TEMPLATE_PP_TIME_MODE;
Michael Niedermayer
committed
#else
const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
Michael Niedermayer
committed
#endif
int black=0, white=255; // blackest black and whitest white in the picture
int QPCorrecture= 256*256;
int copyAhead;
int i;
Dominik Mierzejewski
committed
#endif
// right-shifts that turn a pixel coordinate into a QP table index; for chroma
// the context's subsampling shift is subtracted
const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
//FIXME remove
uint64_t * const yHistogram= c.yHistogram;
// for a non-positive stride the temp buffer base is moved by -23*stride
uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32;
//const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
if (mode & VISUALIZE){
if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
av_log(c2, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
}
}
// precompute 57 DC offset / threshold entries from baseDcDiff; the multiply by
// 0x0101010101010101 repeats the value in each byte of the 64-bit word
for(i=0; i<57; i++){
int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
int threshold= offset*2 + 1;
c.mmxDcOffset[i]= 0x7F - offset;
c.mmxDcThreshold[i]= 0x7F - threshold;
c.mmxDcOffset[i]*= 0x0101010101010101LL;
c.mmxDcThreshold[i]*= 0x0101010101010101LL;
}
// copyAhead depends on which filters are enabled; 8 is subtracted afterwards
if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
else if( (mode & LINEAR_BLEND_DEINT_FILTER)
|| (mode & FFMPEG_DEINT_FILTER)
|| (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
else if( (mode & V_DEBLOCK)
|| (mode & LINEAR_IPOL_DEINT_FILTER)
|| (mode & MEDIAN_DEINT_FILTER)
|| (mode & V_A_DEBLOCK)) copyAhead=13;
else if(mode & V_X1_FILTER) copyAhead=11;
// else if(mode & V_RK1_FILTER) copyAhead=10;
else if(mode & DERING) copyAhead=9;
else copyAhead=8;
copyAhead-= 8;
// luma only: derive the black / white levels from yHistogram and turn them into
// the packed offset / scale handed to blockCopy
if(!isColor){
uint64_t sum= 0;
int i;
uint64_t maxClipped;
uint64_t clipped;
Michael Niedermayer
committed
AVRational scale;
c.frameNum++;
// first frame is fscked so we ignore it
if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256;
for(i=0; i<256; i++){
sum+= yHistogram[i];
}
/* We always get a completely black picture first. */
maxClipped= av_rescale(sum, c.ppMode.maxClippedThreshold.num, c.ppMode.maxClippedThreshold.den);
// walk down from 255 until the remaining histogram mass drops below maxClipped
clipped= sum;
for(black=255; black>0; black--){
if(clipped < maxClipped) break;
clipped-= yHistogram[black];
}
// same search walking up from 0
clipped= sum;
for(white=0; white<256; white++){
if(clipped < maxClipped) break;
clipped-= yHistogram[white];
}
Michael Niedermayer
committed
scale = (AVRational){c.ppMode.maxAllowedY - c.ppMode.minAllowedY, white - black};
Michael Niedermayer
committed
// NOTE(review): packedYScale / packedYOffset are assigned twice in a row below
// (scale factor 256 vs 1024); presumably the two variants sat in separate
// preprocessor branches that are not visible here.
c.packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
Michael Niedermayer
committed
c.packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
// replicate the 16-bit value into all four words of the 64-bit field
c.packedYOffset|= c.packedYOffset<<32;
c.packedYOffset|= c.packedYOffset<<16;
c.packedYScale|= c.packedYScale<<32;
c.packedYScale|= c.packedYScale<<16;
Michael Niedermayer
committed
if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
else QPCorrecture= 256*256;
}else{
c.packedYScale= 0x0100010001000100LL;
c.packedYOffset= 0;
QPCorrecture= 256*256;
}
/* copy & deinterlace first row of blocks */
y=-BLOCK_SIZE;
{
const uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= tempDst + dstStride;
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE){
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
if(mode & LINEAR_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & MEDIAN_DEINT_FILTER)
RENAME(deInterlaceMedian)(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & LOWPASS5_DEINT_FILTER)
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
dstBlock+=8;
srcBlock+=8;
}
if(width==FFABS(dstStride))
linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
else{
int i;
for(i=0; i<copyAhead; i++){
memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
}
for(y=0; y<height; y+=BLOCK_SIZE){
//1% speedup if these are here instead of the inner loop
const uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
uint8_t *tempBlock1= c.tempBlocks;
uint8_t *tempBlock2= c.tempBlocks + 8;
const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
if not than use a temporary buffer */
if(y+15 >= height){
int i;
/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
blockcopy to dst later */
linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
FFMAX(height-y-copyAhead, 0), srcStride);
/* duplicate last line of src to fill the void up to line (copyAhead+7) */
for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
/* duplicate last line of dst to fill the void up to line (copyAhead) */
for(i=height-y+1; i<=copyAhead; i++)
memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
dstBlock= tempDst + dstStride;
srcBlock= tempSrc;
}
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
Michael Niedermayer
committed
// walk the row in chunks of at most 32 pixels; the three passes below each
// restart from the chunk's first block
for(x=0; x<width; ){
int startx = x;
int endx = FFMIN(width, x+32);
uint8_t *dstBlockStart = dstBlock;
const uint8_t *srcBlockStart = srcBlock;
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
int qp_index = 0;
// gather the QP / nonBQP value for every BLOCK_SIZE-wide block of the chunk
for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
// luma: scale by QPCorrecture (65536 == 1.0, 256*128 is the rounding term) and
// sample one pixel of the block (row 12, column 4) into the histogram
if(!isColor){
QP= (QP* QPCorrecture + 256*128)>>16;
nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
}
c.QP_block[qp_index] = QP;
c.nonBQP_block[qp_index] = nonBQP;
#if TEMPLATE_PP_MMX
__asm__ volatile(
"movd %1, %%mm7 \n\t"
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, %0 \n\t"
: "=m" (c.pQPb_block[qp_index])
: "r" (QP)
);
#endif
}
Michael Niedermayer
committed
// pass 1: copy source lines into dst (level fix if enabled) and deinterlace
for(; x < endx; x+=BLOCK_SIZE){
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
if(mode & LINEAR_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & MEDIAN_DEINT_FILTER)
RENAME(deInterlaceMedian)(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & LOWPASS5_DEINT_FILTER)
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
Michael Niedermayer
committed
*/
dstBlock+=8;
srcBlock+=8;
}
dstBlock = dstBlockStart;
srcBlock = srcBlockStart;
// pass 2: vertical deblocking over the same chunk
for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
const int stride= dstStride;
//temporary while changing QP stuff to make things continue to work
//eventually QP,nonBQP,etc will be arrays and this will be unnecessary
c.QP = c.QP_block[qp_index];
c.nonBQP = c.nonBQP_block[qp_index];
c.pQPb = c.pQPb_block[qp_index];
c.pQPb2 = c.pQPb2_block[qp_index];
/* only deblock if we have 2 blocks */
if(y + 8 < height){
if(mode & V_X1_FILTER)
RENAME(vertX1Filter)(dstBlock, stride, &c);
else if(mode & V_DEBLOCK){
const int t= RENAME(vertClassify)(dstBlock, stride, &c);
if(t==1)
RENAME(doVertLowPass)(dstBlock, stride, &c);
else if(t==2)
RENAME(doVertDefFilter)(dstBlock, stride, &c);
}else if(mode & V_A_DEBLOCK){
RENAME(do_a_deblock)(dstBlock, stride, 1, &c, mode);
}
}
dstBlock+=8;
srcBlock+=8;
}
dstBlock = dstBlockStart;
srcBlock = srcBlockStart;
// pass 3: horizontal deblocking, dering and temporal noise reduction
// NOTE(review): this loop mixes a transposed (tempBlock1/tempBlock2) path, an
// altivec path and a generic path; the preprocessor lines selecting between
// them are only partly visible here.
for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
const int stride= dstStride;
av_unused uint8_t *tmpXchg;
c.QP = c.QP_block[qp_index];
c.nonBQP = c.nonBQP_block[qp_index];
c.pQPb = c.pQPb_block[qp_index];
c.pQPb2 = c.pQPb2_block[qp_index];
RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
/* check if we have a previous block to deblock it with dstBlock */
if(x - 8 >= 0){
if(mode & H_X1_FILTER)
RENAME(vertX1Filter)(tempBlock1, 16, &c);
else if(mode & H_DEBLOCK){
const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
if(t==1)
RENAME(doVertLowPass)(tempBlock1, 16, &c);
else if(t==2)
RENAME(doVertDefFilter)(tempBlock1, 16, &c);
}else if(mode & H_A_DEBLOCK){
RENAME(do_a_deblock)(tempBlock1, 16, 1, &c, mode);
RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
#else
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, c.QP);
else if(mode & H_DEBLOCK){
DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
int t;
transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
t = vertClassify_altivec(tempBlock-48, 16, &c);
if(t==1) {
doVertLowPass_altivec(tempBlock-48, 16, &c);
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
}
else if(t==2) {
doVertDefFilter_altivec(tempBlock-48, 16, &c);
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
}
const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
Michael Niedermayer
committed
if(t==1)
RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
else if(t==2)
RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
}else if(mode & H_A_DEBLOCK){
RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c, mode);
if(mode & DERING){
//FIXME filter first line
if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
}
if(mode & TEMP_NOISE_FILTER)
{
RENAME(tempNoiseReducer)(dstBlock-8, stride,
c.tempBlurred[isColor] + y*dstStride + x,
c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
c.ppMode.maxTmpNoise);
}
}
dstBlock+=8;
srcBlock+=8;
// swap the two transposed scratch blocks for the next iteration
tmpXchg= tempBlock1;
tempBlock1= tempBlock2;
tempBlock2 = tmpXchg;
Michael Niedermayer
committed
#endif
Michael Niedermayer
committed
}
// dering / noise reduction for the block to the left of the final dstBlock position
if(mode & DERING){
if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
}
if((mode & TEMP_NOISE_FILTER)){
RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
c.tempBlurred[isColor] + y*dstStride + x,
c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
c.ppMode.maxTmpNoise);
}
/* did we use a tmp buffer for the last lines*/
if(y+15 >= height){
uint8_t *dstBlock= &(dst[y*dstStride]);
if(width==FFABS(dstStride))
linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
else{
int i;
for(i=0; i<height-y; i++){
memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
// NOTE(review): the code below adds 128 to dst pixels along a curve derived
// from yHistogram and along the `white` / `black` rows; presumably a debug
// overlay - the condition enclosing it is not visible here.
if(!isColor){
int max=1;
int i;
for(i=0; i<256; i++)
if(yHistogram[i] > max) max=yHistogram[i];
for(i=1; i<256; i++){
int x;
int start=yHistogram[i-1]/(max/256+1);
int end=yHistogram[i]/(max/256+1);
int inc= end > start ? 1 : -1;
for(x=start; x!=end+inc; x+=inc)
dst[ i*dstStride + x]+=128;
}
for(i=0; i<100; i+=2){
dst[ (white)*dstStride + i]+=128;
dst[ (black)*dstStride + i]+=128;
*c2= c; //copy local context back
#undef RENAME
#undef TEMPLATE_PP_C
#undef TEMPLATE_PP_ALTIVEC
#undef TEMPLATE_PP_MMX
#undef TEMPLATE_PP_MMXEXT
#undef TEMPLATE_PP_3DNOW