Newer
Older
{
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
#ifdef ARCH_X86
int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
int QPFrac= QPDelta;
uint8_t *tempBlock1= tempBlocks;
uint8_t *tempBlock2= tempBlocks + 8;
/* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards?
if not, then use a temporary buffer */
/* copy from line 5 to 12 of src, these will be copied with
blockcopy to dst later */
memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
srcStride*MAX(height-y-5, 0) );
/* duplicate last line to fill the void up to line 12 */
if(y+12 >= height)
{
int i;
for(i=height-y; i<=12; i++)
memcpy(tempSrc + srcStride*i,
src + srcStride*(height-1), srcStride);
}
/* copy up to 6 lines of dst */
memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) );
dstBlock= tempDst + dstStride;
// From this point on it is guaranteed that we can read and write 16 lines downward
// finish 1 block before the next, otherwise we might have a problem
// with the L1 cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE)
const int stride= dstStride;
uint8_t *tmpXchg;
#ifdef ARCH_X86
int QP= *QPptr;
asm volatile(
"addl %2, %1 \n\t"
"sbbl %%eax, %%eax \n\t"
"shll $2, %%eax \n\t"
"subl %%eax, %0 \n\t"
: "+r" (QPptr), "+m" (QPFrac)
: "r" (QPDelta)
: "%eax"
);
#else
int QP= isColor ?
QPs[(y>>3)*QPStride + (x>>3)]:
QPs[(y>>4)*QPStride + (x>>4)];
#endif
if(!isColor)
QP= (QP* QPCorrecture)>>8;
yHistogram[ srcBlock[srcStride*4 + 4] ]++;
asm volatile(
"movd %0, %%mm7 \n\t"
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, pQPb \n\t"
: : "r" (QP)
);
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
*/
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm(
"movl %4, %%eax \n\t"
"shrl $2, %%eax \n\t"
"andl $6, %%eax \n\t"
"addl $5, %%eax \n\t"
"movl %%eax, %%ebx \n\t"
"imul %1, %%eax \n\t"
"imul %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
"addl %1, %%eax \n\t"
"addl %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
"m" (x)
: "%eax", "%ebx"
);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on a 3DNow! chip or if it's faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
#ifdef PP_FUNNY_STRIDE
//can we mess with an 8x16 block? if not, use a temp buffer (yes, again)
if(x+7 >= width)
{
int i;
dstBlockPtrBackup= dstBlock;
srcBlockPtrBackup= srcBlock;
for(i=0;i<BLOCK_SIZE*2; i++)
{
memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
}
dstBlock= tempDstBlock;
srcBlock= tempSrcBlock;
}
blockCopy(dstBlock + dstStride*5, dstStride,
srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
if(mode & LINEAR_IPOL_DEINT_FILTER)
deInterlaceInterpolateLinear(dstBlock, dstStride);
else if(mode & LINEAR_BLEND_DEINT_FILTER)
deInterlaceBlendLinear(dstBlock, dstStride);
else if(mode & MEDIAN_DEINT_FILTER)
deInterlaceMedian(dstBlock, dstStride);
else if(mode & CUBIC_IPOL_DEINT_FILTER)
deInterlaceInterpolateCubic(dstBlock, dstStride);
/* else if(mode & CUBIC_BLEND_DEINT_FILTER)
deInterlaceBlendCubic(dstBlock, dstStride);
Michael Niedermayer
committed
*/
/* only deblock if we have 2 blocks */
if(y + 8 < height)
{
T1= rdtsc();
memcpyTime+= T1-T0;
T0=T1;
#endif
Michael Niedermayer
committed
if(mode & V_RK1_FILTER)
vertRK1Filter(dstBlock, stride, QP);
else if(mode & V_X1_FILTER)
vertX1Filter(dstBlock, stride, QP);
else if(mode & V_DEBLOCK)
Michael Niedermayer
committed
if( isVertDC(dstBlock, stride))
Michael Niedermayer
committed
if(isVertMinMaxOk(dstBlock, stride, QP))
doVertLowPass(dstBlock, stride, QP);
Michael Niedermayer
committed
else
doVertDefFilter(dstBlock, stride, QP);
T1= rdtsc();
vertTime+= T1-T0;
T0=T1;
#endif
}
#ifdef HAVE_MMX
transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
/* check if we have a previous block to deblock it with dstBlock */
T0= rdtsc();
#endif
#ifdef HAVE_MMX
if(mode & H_RK1_FILTER)
vertRK1Filter(tempBlock1, 16, QP);
else if(mode & H_X1_FILTER)
vertX1Filter(tempBlock1, 16, QP);
else if(mode & H_DEBLOCK)
{
if( isVertDC(tempBlock1, 16))
{
if(isVertMinMaxOk(tempBlock1, 16, QP))
doVertLowPass(tempBlock1, 16, QP);
}
else
doVertDefFilter(tempBlock1, 16, QP);
}
transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
#else
Michael Niedermayer
committed
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else if(mode & H_DEBLOCK)
if( isHorizDC(dstBlock-4, stride))
if(isHorizMinMaxOk(dstBlock-4, stride, QP))
doHorizLowPass(dstBlock-4, stride, QP);
Michael Niedermayer
committed
else
doHorizDefFilter(dstBlock-4, stride, QP);
#endif
T1= rdtsc();
horizTime+= T1-T0;
T0=T1;
#endif
if(mode & DERING)
{
//FIXME filter first line
if(y>0) dering(dstBlock - stride - 8, stride, QP);
}
}
else if(mode & DERING)
{
//FIXME y+15 is required because of the tempBuffer thing -> bottom right block isn't filtered
if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
#ifdef PP_FUNNY_STRIDE
/* did we use a tmp-block buffer */
if(x+7 >= width)
{
int i;
dstBlock= dstBlockPtrBackup;
srcBlock= srcBlockPtrBackup;
for(i=0;i<BLOCK_SIZE*2; i++)
{
memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
}
}
dstBlock+=8;
srcBlock+=8;
Michael Niedermayer
committed
#ifdef HAVE_MMX
tmpXchg= tempBlock1;
tempBlock1= tempBlock2;
tempBlock2 = tmpXchg;
Michael Niedermayer
committed
#endif
}
/* did we use a tmp buffer */
{
uint8_t *dstBlock= &(dst[y*dstStride]);
memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
#ifdef HAVE_3DNOW
asm volatile("femms");
#elif defined (HAVE_MMX)
asm volatile("emms");
#endif
// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
sumTime= rdtsc() - sumTime;
if(!isColor)
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white);
#endif
}