Newer
Older
"pxor %%mm6, %%mm3 \n\t"
"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#endif
#ifdef HAVE_MMX2
"pminsw %%mm2, %%mm0 \n\t"
"pminsw %%mm3, %%mm1 \n\t"
#else
"movq %%mm0, %%mm6 \n\t"
"psubusw %%mm2, %%mm6 \n\t"
"psubw %%mm6, %%mm0 \n\t"
"movq %%mm1, %%mm6 \n\t"
"psubusw %%mm3, %%mm6 \n\t"
"psubw %%mm6, %%mm1 \n\t"
#endif
"movd %2, %%mm2 \n\t" // QP
"punpcklbw %%mm7, %%mm2 \n\t"
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
"pxor %%mm6, %%mm4 \n\t"
"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
"pxor %%mm7, %%mm5 \n\t"
"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
"psllw $3, %%mm2 \n\t" // 8QP
"movq %%mm2, %%mm3 \n\t" // 8QP
"pcmpgtw %%mm4, %%mm2 \n\t"
"pcmpgtw %%mm5, %%mm3 \n\t"
"pand %%mm2, %%mm4 \n\t"
"pand %%mm3, %%mm5 \n\t"
"psubusw %%mm0, %%mm4 \n\t" // hd
"psubusw %%mm1, %%mm5 \n\t" // ld
"movq "MANGLE(w05)", %%mm2 \n\t" // 5
"pmullw %%mm2, %%mm4 \n\t"
"pmullw %%mm2, %%mm5 \n\t"
"movq "MANGLE(w20)", %%mm2 \n\t" // 32
"paddw %%mm2, %%mm4 \n\t"
"paddw %%mm2, %%mm5 \n\t"
"psrlw $6, %%mm4 \n\t"
"psrlw $6, %%mm5 \n\t"
"movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4
"movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
"pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // |L3-L4|
"psubw %%mm3, %%mm1 \n\t" // |H3-H4|
"psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
"psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
"pxor %%mm6, %%mm2 \n\t"
"pxor %%mm7, %%mm3 \n\t"
"pand %%mm2, %%mm4 \n\t"
"pand %%mm3, %%mm5 \n\t"
#ifdef HAVE_MMX2
"pminsw %%mm0, %%mm4 \n\t"
"pminsw %%mm1, %%mm5 \n\t"
#else
"movq %%mm4, %%mm2 \n\t"
"psubusw %%mm0, %%mm2 \n\t"
"psubw %%mm2, %%mm4 \n\t"
"movq %%mm5, %%mm2 \n\t"
"psubusw %%mm1, %%mm2 \n\t"
"psubw %%mm2, %%mm5 \n\t"
#endif
"pxor %%mm6, %%mm4 \n\t"
"pxor %%mm7, %%mm5 \n\t"
"psubw %%mm6, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"packsswb %%mm5, %%mm4 \n\t"
"movq %3, %%mm1 \n\t"
"pandn %%mm4, %%mm1 \n\t"
"movq (%0), %%mm0 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq (%0, %1), %%mm0 \n\t"
"psubb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%0, %1) \n\t"
: "+r" (temp_src)
: "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
: "%"REG_a, "%"REG_c
);
}
/*if(step==16){
STOP_TIMER("step16")
}else{
STOP_TIMER("stepX")
}*/
}
#endif //HAVE_MMX
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
* Copies a block from src to dst and fixes the blacklevel.
* levelFix == 0 -> do not touch the brighness & contrast
#undef SCALED_CPY
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
int levelFix, int64_t *packedOffsetAndScale)
Michael Niedermayer
committed
#ifndef HAVE_MMX
int i;
Michael Niedermayer
committed
#endif
if(levelFix){
asm volatile(
"movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset
"movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale
"lea (%2,%4), %%"REG_a" \n\t"
"lea (%3,%5), %%"REG_d" \n\t"
"pxor %%mm4, %%mm4 \n\t"
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq " #src2 ", %%mm6 \n\t"\
"punpcklbw %%mm0, %%mm0 \n\t"\
"punpckhbw %%mm5, %%mm5 \n\t"\
"punpcklbw %%mm1, %%mm1 \n\t"\
"punpckhbw %%mm6, %%mm6 \n\t"\
"pmulhuw %%mm3, %%mm0 \n\t"\
"pmulhuw %%mm3, %%mm5 \n\t"\
"pmulhuw %%mm3, %%mm1 \n\t"\
"pmulhuw %%mm3, %%mm6 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psubw %%mm2, %%mm1 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm0 \n\t"\
"punpckhbw %%mm4, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"psllw $6, %%mm0 \n\t"\
"psllw $6, %%mm5 \n\t"\
"pmulhw %%mm3, %%mm0 \n\t"\
"movq " #src2 ", %%mm6 \n\t"\
"pmulhw %%mm3, %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm1 \n\t"\
"punpckhbw %%mm4, %%mm6 \n\t"\
"psubw %%mm2, %%mm1 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"psllw $6, %%mm1 \n\t"\
"psllw $6, %%mm6 \n\t"\
"pmulhw %%mm3, %%mm1 \n\t"\
"pmulhw %%mm3, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
#endif //HAVE_MMX2
Aurelien Jacobs
committed
#define SCALED_CPY(src1, src2, dst1, dst2)\
REAL_SCALED_CPY(src1, src2, dst1, dst2)
SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
Aurelien Jacobs
committed
SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
"lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
"lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
Aurelien Jacobs
committed
SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
: "=&a" (packedOffsetAndScale)
: "0" (packedOffsetAndScale),
"r"(src),
"r"(dst),
"r" ((long)srcStride),
"r" ((long)dstStride)
: "%"REG_d
);
#else //HAVE_MMX
for(i=0; i<8; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif //HAVE_MMX
}else{
#ifdef HAVE_MMX
asm volatile(
"lea (%0,%2), %%"REG_a" \n\t"
"lea (%1,%3), %%"REG_d" \n\t"
#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
Aurelien Jacobs
committed
#define SIMPLE_CPY(src1, src2, dst1, dst2)\
REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
Aurelien Jacobs
committed
SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
"lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
"lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
Aurelien Jacobs
committed
SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
: : "r" (src),
"r" (dst),
"r" ((long)srcStride),
"r" ((long)dstStride)
: "%"REG_a, "%"REG_d
);
#else //HAVE_MMX
for(i=0; i<8; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif //HAVE_MMX
/**
* Duplicates the given 8 src pixels ? times upward
*/
static inline void RENAME(duplicate)(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
asm volatile(
"movq (%0), %%mm0 \n\t"
"add %1, %0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %1) \n\t"
"movq %%mm0, (%0, %1, 2) \n\t"
: "+r" (src)
: "r" ((long)-stride)
);
int i;
uint8_t *p=src;
for(i=0; i<3; i++){
p-= stride;
memcpy(p, src, 8);
}
/**
* Filters array of bytes (Y or U or V values)
*/
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
int x,y;
Michael Niedermayer
committed
#ifdef COMPILE_TIME_MODE
const int mode= COMPILE_TIME_MODE;
Michael Niedermayer
committed
#else
const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
Michael Niedermayer
committed
#endif
int black=0, white=255; // blackest black and whitest white in the picture
int QPCorrecture= 256*256;
int copyAhead;
Dominik Mierzejewski
committed
#ifdef HAVE_MMX
int i;
Dominik Mierzejewski
committed
#endif
const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
//FIXME remove
uint64_t * const yHistogram= c.yHistogram;
uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
//const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
for(i=0; i<57; i++){
int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
int threshold= offset*2 + 1;
c.mmxDcOffset[i]= 0x7F - offset;
c.mmxDcThreshold[i]= 0x7F - threshold;
c.mmxDcOffset[i]*= 0x0101010101010101LL;
c.mmxDcThreshold[i]*= 0x0101010101010101LL;
}
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
else if( (mode & LINEAR_BLEND_DEINT_FILTER)
|| (mode & FFMPEG_DEINT_FILTER)
|| (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
else if( (mode & V_DEBLOCK)
|| (mode & LINEAR_IPOL_DEINT_FILTER)
|| (mode & MEDIAN_DEINT_FILTER)
|| (mode & V_A_DEBLOCK)) copyAhead=13;
else if(mode & V_X1_FILTER) copyAhead=11;
// else if(mode & V_RK1_FILTER) copyAhead=10;
else if(mode & DERING) copyAhead=9;
else copyAhead=8;
copyAhead-= 8;
if(!isColor){
uint64_t sum= 0;
int i;
uint64_t maxClipped;
uint64_t clipped;
double scale;
c.frameNum++;
// first frame is fscked so we ignore it
if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
for(i=0; i<256; i++){
sum+= yHistogram[i];
}
/* We always get a completely black picture first. */
maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--){
if(clipped < maxClipped) break;
clipped-= yHistogram[black];
}
clipped= sum;
for(white=0; white<256; white++){
if(clipped < maxClipped) break;
clipped-= yHistogram[white];
}
scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
c.packedYOffset|= c.packedYOffset<<32;
c.packedYOffset|= c.packedYOffset<<16;
c.packedYScale|= c.packedYScale<<32;
c.packedYScale|= c.packedYScale<<16;
if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
else QPCorrecture= 256*256;
}else{
c.packedYScale= 0x0100010001000100LL;
c.packedYOffset= 0;
QPCorrecture= 256*256;
}
/* copy & deinterlace first row of blocks */
y=-BLOCK_SIZE;
{
const uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= tempDst + dstStride;
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(x=0; x<width; x+=BLOCK_SIZE){
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
asm(
"mov %4, %%"REG_a" \n\t"
"shr $2, %%"REG_a" \n\t"
"and $6, %%"REG_a" \n\t"
"add %5, %%"REG_a" \n\t"
"mov %%"REG_a", %%"REG_d" \n\t"
"imul %1, %%"REG_a" \n\t"
"imul %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
"add %1, %%"REG_a" \n\t"
"add %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
:: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
"g" ((long)x), "g" ((long)copyAhead)
: "%"REG_a, "%"REG_d
);
//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
if(mode & LINEAR_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & MEDIAN_DEINT_FILTER)
RENAME(deInterlaceMedian)(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & LOWPASS5_DEINT_FILTER)
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
dstBlock+=8;
srcBlock+=8;
}
if(width==FFABS(dstStride))
linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
else{
int i;
for(i=0; i<copyAhead; i++){
memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
}
for(y=0; y<height; y+=BLOCK_SIZE){
//1% speedup if these are here instead of the inner loop
const uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
uint8_t *tempBlock1= c.tempBlocks;
uint8_t *tempBlock2= c.tempBlocks + 8;
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
int QP=0;
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
if not than use a temporary buffer */
if(y+15 >= height){
int i;
/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
blockcopy to dst later */
linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
FFMAX(height-y-copyAhead, 0), srcStride);
/* duplicate last line of src to fill the void upto line (copyAhead+7) */
for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
/* duplicate last line of dst to fill the void upto line (copyAhead) */
for(i=height-y+1; i<=copyAhead; i++)
memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
dstBlock= tempDst + dstStride;
srcBlock= tempSrc;
}
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
for(x=0; x<width; x+=BLOCK_SIZE){
const int stride= dstStride;
uint8_t *tmpXchg;
if(isColor){
QP= QPptr[x>>qpHShift];
c.nonBQP= nonBQPptr[x>>qpHShift];
}else{
QP= QPptr[x>>4];
QP= (QP* QPCorrecture + 256*128)>>16;
c.nonBQP= nonBQPptr[x>>4];
c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
yHistogram[ srcBlock[srcStride*12 + 4] ]++;
}
c.QP= QP;
asm volatile(
"movd %1, %%mm7 \n\t"
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, %0 \n\t"
: "=m" (c.pQPb)
: "r" (QP)
);
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
asm(
"mov %4, %%"REG_a" \n\t"
"shr $2, %%"REG_a" \n\t"
"and $6, %%"REG_a" \n\t"
"add %5, %%"REG_a" \n\t"
"mov %%"REG_a", %%"REG_d" \n\t"
"imul %1, %%"REG_a" \n\t"
"imul %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
"add %1, %%"REG_a" \n\t"
"add %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
:: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
"g" ((long)x), "g" ((long)copyAhead)
: "%"REG_a, "%"REG_d
);
//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
if(mode & LINEAR_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & MEDIAN_DEINT_FILTER)
RENAME(deInterlaceMedian)(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
else if(mode & FFMPEG_DEINT_FILTER)
RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
else if(mode & LOWPASS5_DEINT_FILTER)
RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
Michael Niedermayer
committed
*/
/* only deblock if we have 2 blocks */
if(y + 8 < height){
if(mode & V_X1_FILTER)
RENAME(vertX1Filter)(dstBlock, stride, &c);
else if(mode & V_DEBLOCK){
const int t= RENAME(vertClassify)(dstBlock, stride, &c);
if(t==1)
RENAME(doVertLowPass)(dstBlock, stride, &c);
else if(t==2)
RENAME(doVertDefFilter)(dstBlock, stride, &c);
}else if(mode & V_A_DEBLOCK){
RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
}
}
#ifdef HAVE_MMX
RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
/* check if we have a previous block to deblock it with dstBlock */
if(x - 8 >= 0){
#ifdef HAVE_MMX
if(mode & H_X1_FILTER)
RENAME(vertX1Filter)(tempBlock1, 16, &c);
else if(mode & H_DEBLOCK){
//START_TIMER
const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
//STOP_TIMER("dc & minmax")
if(t==1)
RENAME(doVertLowPass)(tempBlock1, 16, &c);
else if(t==2)
RENAME(doVertDefFilter)(tempBlock1, 16, &c);
}else if(mode & H_A_DEBLOCK){
RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
}
RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
#else
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else if(mode & H_DEBLOCK){
DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
const int t=vertClassify_altivec(tempBlock-48, 16, &c);
if(t==1) {
doVertLowPass_altivec(tempBlock-48, 16, &c);
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
}
else if(t==2) {
doVertDefFilter_altivec(tempBlock-48, 16, &c);
transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
}
const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
Michael Niedermayer
committed
if(t==1)
RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
else if(t==2)
RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
}else if(mode & H_A_DEBLOCK){
RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
}
#endif //HAVE_MMX
if(mode & DERING){
//FIXME filter first line
if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
}
if(mode & TEMP_NOISE_FILTER)
{
RENAME(tempNoiseReducer)(dstBlock-8, stride,
c.tempBlurred[isColor] + y*dstStride + x,
c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
c.ppMode.maxTmpNoise);
}
}
dstBlock+=8;
srcBlock+=8;
Michael Niedermayer
committed
#ifdef HAVE_MMX
tmpXchg= tempBlock1;
tempBlock1= tempBlock2;
tempBlock2 = tmpXchg;
Michael Niedermayer
committed
#endif
if(mode & DERING){
if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
}
if((mode & TEMP_NOISE_FILTER)){
RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
c.tempBlurred[isColor] + y*dstStride + x,
c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
c.ppMode.maxTmpNoise);
}
/* did we use a tmp buffer for the last lines*/
if(y+15 >= height){
uint8_t *dstBlock= &(dst[y*dstStride]);
if(width==FFABS(dstStride))
linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
else{
int i;
for(i=0; i<height-y; i++){
memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
/*
for(x=0; x<width; x+=32){
volatile int i;
i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
+ dstBlock[x +13*dstStride]
+ dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
}*/
}
asm volatile("femms");
asm volatile("emms");
if(!isColor){
int max=1;
int i;
for(i=0; i<256; i++)
if(yHistogram[i] > max) max=yHistogram[i];
for(i=1; i<256; i++){
int x;
int start=yHistogram[i-1]/(max/256+1);
int end=yHistogram[i]/(max/256+1);
int inc= end > start ? 1 : -1;
for(x=start; x!=end+inc; x+=inc)
dst[ i*dstStride + x]+=128;
}
for(i=0; i<100; i+=2){
dst[ (white)*dstStride + i]+=128;
dst[ (black)*dstStride + i]+=128;
*c2= c; //copy local context back