Newer
Older
printf("\n%X %X %X %X :%d: %d %d %d\n", ppMode.lumMode, ppMode.chromMode, ppMode.oldMode, ppMode.error,
qual, ppMode.maxTmpNoise[0], ppMode.maxTmpNoise[1], ppMode.maxTmpNoise[2]);
postprocess2(src, src_stride, dst, dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, &ppMode);
return;
*/
if(QP_store==NULL)
{
QP_store= zeroArray;
QP_stride= 0;
}
ppMode.lumMode= mode;
mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
ppMode.chromMode= mode;
Michael Niedermayer
committed
ppMode.maxTmpNoise[0]= 700;
ppMode.maxTmpNoise[1]= 1500;
ppMode.maxTmpNoise[2]= 3000;
#ifdef HAVE_ODIVX_POSTPROCESS
// Note: I could make this shit outside of this file, but it would mean one
// more function call...
if(use_old_pp){
odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
return;
}
#endif
postProcess(src[0], src_stride, dst[0], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 0, &ppMode);
horizontal_size >>= 1;
vertical_size >>= 1;
src_stride >>= 1;
dst_stride >>= 1;
if(ppMode.chromMode)
postProcess(src[1], src_stride, dst[1], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode);
postProcess(src[2], src_stride, dst[2], dst_stride,
horizontal_size, vertical_size, QP_store, QP_stride, 2, &ppMode);
else if(src_stride == dst_stride)
{
memcpy(dst[1], src[1], src_stride*vertical_size);
memcpy(dst[2], src[2], src_stride*vertical_size);
}
int y;
for(y=0; y<vertical_size; y++)
{
memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size);
memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size);
}
}
#if 0
memset(dst[1], 128, dst_stride*vertical_size);
memset(dst[2], 128, dst_stride*vertical_size);
#endif
/**
 * Postprocesses a planar picture using an already filled-in PPMode.
 *
 * Plane 0 is filtered as luma at the given size and strides. The picture
 * size and both strides are then halved and planes 1 and 2 are handled as
 * chroma: they are filtered when mode->chromMode is non-zero, otherwise they
 * are copied unchanged from src to dst.
 *
 * @param src             the three source planes
 * @param src_stride      stride of source plane 0 (halved for planes 1 and 2)
 * @param dst             the three destination planes
 * @param dst_stride      stride of destination plane 0 (halved for planes 1 and 2)
 * @param horizontal_size width of plane 0
 * @param vertical_size   height of plane 0
 * @param QP_store        quantizer table, may be NULL (an all-zero table with
 *                        stride 0 is substituted in that case)
 * @param QP_stride       stride of QP_store
 * @param mode            filter settings handed on to postProcess()
 */
void postprocess2(unsigned char * src[], int src_stride,
                  unsigned char * dst[], int dst_stride,
                  int horizontal_size,   int vertical_size,
                  QP_STORE_T *QP_store,  int QP_stride,
                  struct PPMode *mode)
{
	/* fallback quantizer row used when the caller supplies no QP table */
	static QP_STORE_T zeroArray[2048/8];

	if(QP_store==NULL)
	{
		QP_store= zeroArray;
		QP_stride= 0;
	}

#ifdef HAVE_ODIVX_POSTPROCESS
	/* The old postprocessor is dispatched from here (rather than from
	   outside of this file) to save one function call. */
	if(use_old_pp){
		odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,
		mode->oldMode);
		return;
	}
#endif

	/* luma plane */
	postProcess(src[0], src_stride, dst[0], dst_stride,
		horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);

	/* the chroma planes are processed at half size and half stride */
	horizontal_size >>= 1;
	vertical_size   >>= 1;
	src_stride      >>= 1;
	dst_stride      >>= 1;

	if(mode->chromMode)
	{
		postProcess(src[1], src_stride, dst[1], dst_stride,
			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
		postProcess(src[2], src_stride, dst[2], dst_stride,
			horizontal_size, vertical_size, QP_store, QP_stride, 2, mode);
	}
	else if(src_stride == dst_stride)
	{
		/* no chroma filtering and identical layout: copy each plane in one go */
		memcpy(dst[1], src[1], src_stride*vertical_size);
		memcpy(dst[2], src[2], src_stride*vertical_size);
	}
	else
	{
		/* no chroma filtering but different strides: copy line by line */
		int y;
		for(y=0; y<vertical_size; y++)
		{
			memcpy(&(dst[1][y*dst_stride]), &(src[1][y*src_stride]), horizontal_size);
			memcpy(&(dst[2][y*dst_stride]), &(src[2][y*src_stride]), horizontal_size);
		}
	}
}
/**
* gets the mode flags for a given quality (larger values mean slower but better postprocessing)
int getPpModeForQuality(int quality){
int modes[1+GET_PP_QUALITY_MAX]= {
0,
#if 1
// horizontal filters first
LUM_H_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
#else
// vertical filters first
LUM_V_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
#endif
};
#ifdef HAVE_ODIVX_POSTPROCESS
int odivx_modes[1+GET_PP_QUALITY_MAX]= {
0,
PP_DEBLOCK_Y_H,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
};
if(use_old_pp) return odivx_modes[quality];
#endif
return modes[quality];
}
/**
 * Copies a block from src to dst and fixes the blacklevel
 * numLines must be a multiple of 4
 * levelFix == 0 -> don't touch the brightness & contrast
 */
static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
Michael Niedermayer
committed
#ifndef HAVE_MMX
Michael Niedermayer
committed
#endif
if(levelFix)
{
#ifdef HAVE_MMX
asm volatile(
"leal (%0,%2), %%eax \n\t"
"leal (%1,%3), %%ebx \n\t"
"movq packedYOffset, %%mm2 \n\t"
"movq packedYScale, %%mm3 \n\t"
Michael Niedermayer
committed
"pxor %%mm4, %%mm4 \n\t"
#define SCALED_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src1 ", %%mm5 \n\t"\
Michael Niedermayer
committed
"punpcklbw %%mm4, %%mm0 \n\t"\
"punpckhbw %%mm4, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm2, %%mm5 \n\t"\
"psllw $6, %%mm0 \n\t"\
"psllw $6, %%mm5 \n\t"\
Michael Niedermayer
committed
"pmulhw %%mm3, %%mm0 \n\t"\
Michael Niedermayer
committed
"pmulhw %%mm3, %%mm5 \n\t"\
"punpcklbw %%mm4, %%mm1 \n\t"\
"punpckhbw %%mm4, %%mm6 \n\t"\
"psubw %%mm2, %%mm6 \n\t"\
"psllw $6, %%mm6 \n\t"\
Michael Niedermayer
committed
"pmulhw %%mm3, %%mm1 \n\t"\
"pmulhw %%mm3, %%mm6 \n\t"\
"packuswb %%mm5, %%mm0 \n\t"\
"packuswb %%mm6, %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
"leal (%%eax,%2,4), %%eax \n\t"
"leal (%%ebx,%3,4), %%ebx \n\t"
SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
: : "r"(src),
"r"(dst),
"r" (srcStride),
"r" (dstStride)
: "%eax", "%ebx"
);
#else
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
else
{
#ifdef HAVE_MMX
asm volatile(
"leal (%0,%2), %%eax \n\t"
"leal (%1,%3), %%ebx \n\t"
#define SIMPLE_CPY(src1, src2, dst1, dst2) \
"movq " #src1 ", %%mm0 \n\t"\
"movq " #src2 ", %%mm1 \n\t"\
"movq %%mm0, " #dst1 " \n\t"\
"movq %%mm1, " #dst2 " \n\t"\
SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2))
SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4))
"leal (%%eax,%2,4), %%eax \n\t"
"leal (%%ebx,%3,4), %%ebx \n\t"
SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2))
: : "r" (src),
"r" (dst),
"r" (srcStride),
: "%eax", "%ebx"
);
#else
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
/**
* Filters array of bytes (Y or U or V values)
*/
static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode)
int x,y;
const int mode= isColor ? ppMode->chromMode : ppMode->lumMode;
/* we need 64 bits here, otherwise we are going to have a problem
after watching a black picture for 5 hours */
static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture
/* Temporary buffers for handling the last row(s) */
static uint8_t *tempDst= NULL;
static uint8_t *tempSrc= NULL;
/* Temporary buffers for handling the last block */
static uint8_t *tempDstBlock= NULL;
static uint8_t *tempSrcBlock= NULL;
/* Temporal noise reducing buffers */
static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
Michael Niedermayer
committed
#ifdef PP_FUNNY_STRIDE
uint8_t *dstBlockPtrBackup;
uint8_t *srcBlockPtrBackup;
Michael Niedermayer
committed
#endif
Michael Niedermayer
committed
#ifdef MORE_TIMING
long long T0, T1, diffTime=0;
#endif
Michael Niedermayer
committed
long long memcpyTime=0, vertTime=0, horizTime=0, sumTime;
sumTime= rdtsc();
#endif
#ifdef HAVE_MMX
maxTmpNoise[0]= ppMode->maxTmpNoise[0];
maxTmpNoise[1]= ppMode->maxTmpNoise[1];
maxTmpNoise[2]= ppMode->maxTmpNoise[2];
#endif
if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14;
else if( (mode & V_DEBLOCK)
|| (mode & LINEAR_IPOL_DEINT_FILTER)
|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
else if(mode & V_X1_FILTER) copyAhead=11;
else if(mode & V_RK1_FILTER) copyAhead=10;
else if(mode & DERING) copyAhead=9;
else copyAhead=8;
copyAhead-= 8;
if(tempDst==NULL)
{
tempDst= (uint8_t*)memalign(8, 1024*24);
tempSrc= (uint8_t*)memalign(8, 1024*24);
tempDstBlock= (uint8_t*)memalign(8, 1024*24);
tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER))
{
// printf("%d %d %d\n", isColor, dstStride, height);
//FIXME works only as long as the size doesn't increase
//Note: the +17*1024 is just there so I don't have to worry about reading/writing over the end
tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
if(!yHistogram)
{
int i;
yHistogram= (uint64_t*)malloc(8*256);
for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
if(mode & FULL_Y_RANGE)
{
maxAllowedY=255;
minAllowedY=0;
}
}
if(!isColor)
{
uint64_t sum= 0;
int i;
static int framenum= -1;
uint64_t maxClipped;
uint64_t clipped;
double scale;
framenum++;
if(framenum == 1) yHistogram[0]= width*height/64*15/256;
for(i=0; i<256; i++)
{
sum+= yHistogram[i];
// printf("%d ", yHistogram[i]);
}
// printf("\n\n");
/* we always get a completely black picture first */
maxClipped= (uint64_t)(sum * maxClippedThreshold);
clipped= sum;
for(black=255; black>0; black--)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[black];
}
clipped= sum;
for(white=0; white<256; white++)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[white];
}
packedYOffset= (black - minAllowedY) & 0xFFFF;
packedYOffset|= packedYOffset<<32;
packedYOffset|= packedYOffset<<16;
scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
packedYScale= (uint16_t)(scale*1024.0 + 0.5);
packedYScale|= packedYScale<<32;
packedYScale|= packedYScale<<16;
}
else
{
packedYScale= 0x0100010001000100LL;
packedYOffset= 0;
}
if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
else QPCorrecture= 256;
/* copy & deinterlace first row of blocks */
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
y=-BLOCK_SIZE;
{
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
dstBlock= tempDst + dstStride;
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next, otherwise we might have a problem
// with the L1 cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE)
{
#ifdef HAVE_MMX2
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm(
"movl %4, %%eax \n\t"
"shrl $2, %%eax \n\t"
"andl $6, %%eax \n\t"
"movl %%eax, %%ebx \n\t"
"imul %1, %%eax \n\t"
"imul %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
"addl %1, %%eax \n\t"
"addl %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
: "%eax", "%ebx"
);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
#endif
blockCopy(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
if(mode & LINEAR_IPOL_DEINT_FILTER)
deInterlaceInterpolateLinear(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
deInterlaceBlendLinear(dstBlock, dstStride);
else if(mode & MEDIAN_DEINT_FILTER)
deInterlaceMedian(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
deInterlaceInterpolateCubic(dstBlock, dstStride);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
deInterlaceBlendCubic(dstBlock, dstStride);
*/
dstBlock+=8;
srcBlock+=8;
}
memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride );
{
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
#ifdef ARCH_X86
int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
int QPFrac= QPDelta;
uint8_t *tempBlock1= tempBlocks;
uint8_t *tempBlock2= tempBlocks + 8;
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
if not than use a temporary buffer */
/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
srcStride*MAX(height-y-copyAhead, 0) );
/* duplicate last line of src to fill the void upto line (copyAhead+7) */
for(i=MAX(height-y, 8); i<copyAhead+8; i++)
memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
/* duplicate last line of dst to fill the void upto line (copyAhead) */
for(i=height-y+1; i<=copyAhead; i++)
memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next, otherwise we might have a problem
// with the L1 cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE)
const int stride= dstStride;
uint8_t *tmpXchg;
asm volatile(
"addl %2, %1 \n\t"
"sbbl %%eax, %%eax \n\t"
"shll $2, %%eax \n\t"
"subl %%eax, %0 \n\t"
: "+r" (QPptr), "+m" (QPFrac)
: "r" (QPDelta)
: "%eax"
);
#else
QPs[(y>>3)*QPStride + (x>>3)]:
QPs[(y>>4)*QPStride + (x>>4)];
#endif
if(!isColor)
yHistogram[ srcBlock[srcStride*12 + 4] ]++;
asm volatile(
"movd %0, %%mm7 \n\t"
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, pQPb \n\t"
: : "r" (QP)
);
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm(
"movl %4, %%eax \n\t"
"shrl $2, %%eax \n\t"
"andl $6, %%eax \n\t"
"movl %%eax, %%ebx \n\t"
"imul %1, %%eax \n\t"
"imul %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
"addl %1, %%eax \n\t"
"addl %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
#ifdef PP_FUNNY_STRIDE
//can we mess with a 8x16 block, if not use a temp buffer, yes again
if(x+7 >= width)
{
int i;
dstBlockPtrBackup= dstBlock;
srcBlockPtrBackup= srcBlock;
for(i=0;i<BLOCK_SIZE*2; i++)
{
memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
}
dstBlock= tempDstBlock;
srcBlock= tempSrcBlock;
}
blockCopy(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX);
if(mode & LINEAR_IPOL_DEINT_FILTER)
deInterlaceInterpolateLinear(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
deInterlaceBlendLinear(dstBlock, dstStride);
else if(mode & MEDIAN_DEINT_FILTER)
deInterlaceMedian(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
deInterlaceInterpolateCubic(dstBlock, dstStride);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
deInterlaceBlendCubic(dstBlock, dstStride);
Michael Niedermayer
committed
*/
/* only deblock if we have 2 blocks */
if(y + 8 < height)
{
T1= rdtsc();
memcpyTime+= T1-T0;
T0=T1;
#endif
Michael Niedermayer
committed
if(mode & V_RK1_FILTER)
vertRK1Filter(dstBlock, stride, QP);
else if(mode & V_X1_FILTER)
vertX1Filter(dstBlock, stride, QP);
else if(mode & V_DEBLOCK)
Michael Niedermayer
committed
if( isVertDC(dstBlock, stride))
Michael Niedermayer
committed
if(isVertMinMaxOk(dstBlock, stride, QP))
doVertLowPass(dstBlock, stride, QP);
Michael Niedermayer
committed
else
doVertDefFilter(dstBlock, stride, QP);
T1= rdtsc();
vertTime+= T1-T0;
T0=T1;
#endif
}
#ifdef HAVE_MMX
transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
/* check if we have a previous block to deblock it with dstBlock */
T0= rdtsc();
#endif
#ifdef HAVE_MMX
if(mode & H_RK1_FILTER)
vertRK1Filter(tempBlock1, 16, QP);
else if(mode & H_X1_FILTER)
vertX1Filter(tempBlock1, 16, QP);
else if(mode & H_DEBLOCK)
{
{
if(isVertMinMaxOk(tempBlock1, 16, QP))
doVertLowPass(tempBlock1, 16, QP);
}
else
doVertDefFilter(tempBlock1, 16, QP);
}
transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
#else
Michael Niedermayer
committed
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else if(mode & H_DEBLOCK)
if( isHorizDC(dstBlock-4, stride))
if(isHorizMinMaxOk(dstBlock-4, stride, QP))
doHorizLowPass(dstBlock-4, stride, QP);
Michael Niedermayer
committed
else
doHorizDefFilter(dstBlock-4, stride, QP);
#endif
T1= rdtsc();
horizTime+= T1-T0;
T0=T1;
#endif
if(mode & DERING)
{
//FIXME filter first line
if(y>0) dering(dstBlock - stride - 8, stride, QP);
}
if(mode & TEMP_NOISE_FILTER)
{
tempNoiseReducer(dstBlock-8, stride,
tempBlured[isColor] + y*dstStride + x,
ppMode->maxTmpNoise);
}
}
#ifdef PP_FUNNY_STRIDE
/* did we use a tmp-block buffer */
if(x+7 >= width)
{
int i;
dstBlock= dstBlockPtrBackup;
srcBlock= srcBlockPtrBackup;
for(i=0;i<BLOCK_SIZE*2; i++)
{
memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
}
}
dstBlock+=8;
srcBlock+=8;
Michael Niedermayer
committed
#ifdef HAVE_MMX
tmpXchg= tempBlock1;
tempBlock1= tempBlock2;
tempBlock2 = tmpXchg;
Michael Niedermayer
committed
#endif
if(mode & DERING)
{
if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP);
}
if((mode & TEMP_NOISE_FILTER))
{
tempNoiseReducer(dstBlock-8, dstStride,
tempBlured[isColor] + y*dstStride + x,
ppMode->maxTmpNoise);
}
/* did we use a tmp buffer for the last lines*/
{
uint8_t *dstBlock= &(dst[y*dstStride]);
memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
/*
for(x=0; x<width; x+=32)
{
i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
// + dstBlock[x +13*dstStride]
// + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
}*/
}
#ifdef HAVE_3DNOW
asm volatile("femms");
#elif defined (HAVE_MMX)
asm volatile("emms");
#endif
// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
sumTime= rdtsc() - sumTime;
if(!isColor)
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white);
#endif
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
#ifdef DEBUG_BRIGHTNESS
if(!isColor)
{
int max=1;
int i;
for(i=0; i<256; i++)
if(yHistogram[i] > max) max=yHistogram[i];
for(i=1; i<256; i++)
{
int x;
int start=yHistogram[i-1]/(max/256+1);
int end=yHistogram[i]/(max/256+1);
int inc= end > start ? 1 : -1;
for(x=start; x!=end+inc; x+=inc)
dst[ i*dstStride + x]+=128;
}
for(i=0; i<100; i+=2)
{
dst[ (white)*dstStride + i]+=128;
dst[ (black)*dstStride + i]+=128;
}
}
#endif