Newer
Older
for(i=0; i<numLines; i++)
memcpy( &(dst[dstStride*i]),
&(src[srcStride*i]), BLOCK_SIZE);
#endif
}
/**
 * Filters an array of bytes (Y or U or V values).
 *
 * The plane is walked in steps of BLOCK_SIZE. Source data is copied into dst
 * with blockCopy(), and the vertical, horizontal and dering filters chosen by
 * the bits of mode are then applied to dst.
 *
 * When isColor is zero, a brightness histogram gathered on earlier calls is
 * used to pick a black and a white level, from which packedYOffset and
 * packedYScale are derived; otherwise fixed values are stored in them.
 *
 * The function keeps state between calls (static yHistogram and framenum) and
 * writes packedYOffset / packedYScale, which are not declared in this
 * function, so it is not reentrant.
 *
 * NOTE(review): this copy of the function looks like it lost some lines during
 * extraction (for example the opening brace of the body and of the inner
 * x loop, and the #if that belongs to the #elif below) -- confirm against the
 * upstream file before building.
 *
 * @param src       source plane
 * @param srcStride offset between two consecutive source lines
 * @param dst       destination plane
 * @param dstStride offset between two consecutive destination lines
 * @param width     width of the plane
 * @param height    height of the plane
 * @param QPs       quantizer table, indexed with (y>>3, x>>3) when isColor is
 *                  set and with (y>>4, x>>4) otherwise
 * @param QPStride  offset between two consecutive lines of QPs
 * @param isColor   non-zero selects the path without histogram handling
 *                  (presumably the chroma planes -- confirm with the caller)
 * @param mode      bit mask; LEVEL_FIX, V_RK1_FILTER, V_X1_FILTER and
 *                  H_X1_FILTER are tested here
 */
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
QP_STORE_T QPs[], int QPStride, int isColor, int mode)
int x,y;
/* we need 64bit here otherwise we're going to have a problem
after watching a black picture for 5 hours */
static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture
#ifdef TIMEING
long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
sumTime= rdtsc();
#endif
// first call only: allocate the 256 bin histogram and give every bin the same starting count
// NOTE(review): the malloc result is not checked and the buffer is never freed in the visible code
if(!yHistogram)
{
int i;
yHistogram= (uint64_t*)malloc(8*256);
for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
}
if(!isColor)
{
uint64_t sum= 0;
int i;
static int framenum= -1;
uint64_t maxClipped;
uint64_t clipped;
double scale;
// framenum counts the calls that take this branch; on the second one bin 0 is reset
framenum++;
if(framenum == 1) yHistogram[0]= width*height/64*15/256;
// total number of samples currently in the histogram
for(i=0; i<256; i++)
{
sum+= yHistogram[i];
// printf("%d ", yHistogram[i]);
}
// printf("\n\n");
/* we always get a completely black picture first */
// number of samples below which the scans for black and white stop
maxClipped= (uint64_t)(sum * maxClippedThreshold);
// scan from the bright end downwards, removing bins until fewer than maxClipped samples remain
clipped= sum;
for(black=255; black>0; black--)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[black];
}
// same scan from the dark end upwards
clipped= sum;
for(white=0; white<256; white++)
{
if(clipped < maxClipped) break;
clipped-= yHistogram[white];
}
// we can't handle negative corrections
packedYOffset= MAX(black - minAllowedY, 0);
// copy the value into the higher parts of packedYOffset
// (presumably a 64 bit variable holding 8 identical bytes -- confirm at its declaration)
packedYOffset|= packedYOffset<<32;
packedYOffset|= packedYOffset<<16;
packedYOffset|= packedYOffset<<8;
// NOTE(review): white == black would make the divisor zero -- confirm this cannot happen
scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
// NOTE(review): the next two lines look like web page residue, not C -- confirm and remove upstream
Michael Niedermayer
committed
// scale as a rounded 16 bit value (scale*512), then copied into the higher 16 bit words
packedYScale= (uint16_t)(scale*512.0 + 0.5);
packedYScale|= packedYScale<<32;
packedYScale|= packedYScale<<16;
}
else
{
// fixed scale of 0x0100 in every 16 bit word and no offset
packedYScale= 0x0100010001000100LL;
packedYOffset= 0;
}
// copy the blocks at the top of the picture
// (the argument 8 is presumably the number of lines to copy -- confirm with blockCopy)
for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
for(y=0; y<height; y+=BLOCK_SIZE)
{
//1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]);
uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
uint8_t *vertBlock= &(dstBlock[dstStride*3]);
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE)
const int stride= dstStride;
// quantizer of the current block; the QPs table is indexed with a different shift per plane type
int QP= isColor ?
QPs[(y>>3)*QPStride + (x>>3)]:
QPs[(y>>4)*QPStride + (x>>4)];
// with level fixing active the QP is multiplied by the low 16 bits of packedYScale and shifted down by 8
if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
#ifdef HAVE_MMX
// replicate QP into mm7 and store it in pQPb
asm volatile(
"movd %0, %%mm7 \n\t"
"packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
"packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
"movq %%mm7, pQPb \n\t"
: : "r" (QP)
);
#endif
// vertical filtering is only done while more than 12 lines remain below y
if(y + 12 < height)
{
#ifdef MORE_TIMEING
T0= rdtsc();
#endif
// NOTE(review): the #elif below has no visible #if; an opening conditional
// in front of these prefetch calls appears to be missing -- confirm
prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
*/
#endif
// one sample per block (its first byte) feeds the histogram used on later calls
if(!isColor) yHistogram[ srcBlock[0] ]++;
blockCopy(vertBlock + dstStride*2, dstStride,
vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
#ifdef MORE_TIMEING
T1= rdtsc();
memcpyTime+= T1-T0;
T0=T1;
#endif
// vertical filter: RK1 or X1 if requested by mode, otherwise the DC test
// decides between the low pass and the default filter
if(mode & V_RK1_FILTER)
vertRK1Filter(vertBlock, stride, QP);
else if(mode & V_X1_FILTER)
vertX1Filter(vertBlock, stride, QP);
else
{
if( isVertDC(vertBlock, stride))
{
if(isVertMinMaxOk(vertBlock, stride, QP))
doVertLowPass(vertBlock, stride, QP);
}
else
doVertDefFilter(vertBlock, stride, QP);
}
}
#ifdef MORE_TIMEING
T1= rdtsc();
vertTime+= T1-T0;
T0=T1;
#endif
}
else
blockCopy(vertBlock + dstStride*1, dstStride,
vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
// horizontal filter, skipped for the first block of a line; it is applied at dstBlock-4
if(x - 8 >= 0 && x<width)
{
#ifdef MORE_TIMEING
T0= rdtsc();
#endif
if(mode & H_X1_FILTER)
horizX1Filter(dstBlock-4, stride, QP);
else
if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
{
if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
}
else
doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
}
#ifdef MORE_TIMEING
T1= rdtsc();
horizTime+= T1-T0;
T0=T1;
#endif
dering(dstBlock - 9 - stride, stride, QP);
}
else if(y!=0)
dering(dstBlock - stride*9 + width-9, stride, QP);
//FIXME dering filter will not be applied to last block (bottom right)
// advance all block pointers to the next block of this line
dstBlock+=8;
srcBlock+=8;
vertBlock+=8;
vertSrcBlock+=8;
}
}
// leave the MMX state again after the MMX code above
#ifdef HAVE_3DNOW
asm volatile("femms");
#elif defined (HAVE_MMX)
asm volatile("emms");
#endif
#ifdef TIMEING
// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
sumTime= rdtsc() - sumTime;
if(!isColor)
printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r",
(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
, black, white);
#endif
}