postprocess.c

#else
				for(i=0; i<numLines; i++)
					memcpy(	&(dst[dstStride*i]),
						&(src[srcStride*i]), BLOCK_SIZE);
#endif
	}
}


/**
 * Filters one plane of bytes (Y or U or V values) from src into dst.
 *
 * The plane is walked in BLOCK_SIZE steps. Every block is copied with
 * blockCopy() and then, depending on the bits set in mode, run through the
 * vertical deblocking filters (V_DEBLOCK, V_RK1_FILTER, V_X1_FILTER) and the
 * horizontal deblocking filters (H_DEBLOCK, H_X1_FILTER). dering() is called
 * regardless of mode.
 *
 * For luma planes (isColor == 0) the first byte of each processed block is
 * counted in a histogram that is kept across calls; that histogram is used at
 * the start of the call to derive packedYOffset and packedYScale.
 * For chroma planes both are set to fixed values.
 *
 * The function keeps state in static locals and writes packedYOffset,
 * packedYScale (and pQPb when HAVE_MMX is defined), so successive calls are
 * not independent of each other.
 *
 * @param src       source plane
 * @param srcStride offset in bytes between two consecutive source lines
 * @param dst       destination plane
 * @param dstStride offset in bytes between two consecutive destination lines
 * @param width     width of the plane in bytes
 * @param height    height of the plane in lines
 * @param QPs       quantizer table; indexed with (x>>3, y>>3) for chroma and
 *                  (x>>4, y>>4) for luma
 * @param QPStride  number of entries per row of QPs
 * @param isColor   non-zero for a chroma plane, zero for the luma plane
 * @param mode      bit mask selecting the filters (see above) and LEVEL_FIX
 */
void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
	QP_STORE_T QPs[], int QPStride, int isColor, int mode)
{
	int x,y;
	/* we need 64bit here, otherwise we are going to have a problem
	   after watching a black picture for 5 hours.
	   Static storage is used instead of a heap buffer so there is no
	   allocation that could fail and nothing that would need freeing. */
	static uint64_t yHistogram[256];
	static int histogramReady= 0;
	int black=0, white=255; // blackest black and whitest white in the picture

#ifdef TIMEING
	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime;
	sumTime= rdtsc();
#endif

	// seed every bin once, using the dimensions of the very first call
	if(!histogramReady)
	{
		int i;
		for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
		histogramReady= 1;
	}

	if(!isColor)
	{
		uint64_t sum= 0;
		int i;
		static int framenum= -1;
		uint64_t maxClipped;
		uint64_t clipped;
		double scale;

		framenum++;
		// second luma call: put bin 0 back to its seed value, discarding
		// what the first call counted there
		if(framenum == 1) yHistogram[0]= width*height/64*15/256;

		for(i=0; i<256; i++)
		{
			sum+= yHistogram[i];
//			printf("%d ", yHistogram[i]);
		}
//		printf("\n\n");

		/* we always get a completely black picture first */

		maxClipped= (uint64_t)(sum * maxClippedThreshold);

		// black: walk down from 255 until fewer than maxClipped entries
		// remain in the bins 0..black
		clipped= sum;
		for(black=255; black>0; black--)
		{
			if(clipped < maxClipped) break;
			clipped-= yHistogram[black];
		}

		// white: walk up from 0 until fewer than maxClipped entries
		// remain in the bins white..255
		clipped= sum;
		for(white=0; white<256; white++)
		{
			if(clipped < maxClipped) break;
			clipped-= yHistogram[white];
		}

		// we can't handle negative corrections
		packedYOffset= MAX(black - minAllowedY, 0);
		// replicate the low byte into all 8 bytes
		packedYOffset|= packedYOffset<<32;
		packedYOffset|= packedYOffset<<16;
		packedYOffset|= packedYOffset<<8;

		// NOTE(review): white > black is only guaranteed while
		// maxClippedThreshold <= 0.5; for larger values white-black can be
		// zero or negative here -- confirm the range of the threshold
		scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);

		packedYScale= (uint16_t)(scale*512.0 + 0.5);
		// replicate the low 16 bits into all 4 words
		packedYScale|= packedYScale<<32;
		packedYScale|= packedYScale<<16;
	}
	else
	{
		packedYScale= 0x0100010001000100LL;
		packedYOffset= 0;
	}

	// copy the top 8 lines of the plane
	for(x=0; x<width; x+=BLOCK_SIZE)
		blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);

	for(y=0; y<height; y+=BLOCK_SIZE)
	{
		//1% speedup if these are here instead of the inner loop
		uint8_t *srcBlock= &(src[y*srcStride]);
		uint8_t *dstBlock= &(dst[y*dstStride]);
		uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
		uint8_t *vertBlock= &(dstBlock[dstStride*3]);

		// finish 1 block before the next, otherwise we might have a problem
		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
		for(x=0; x<width; x+=BLOCK_SIZE)
		{
			const int stride= dstStride;
			int QP= isColor ?
				QPs[(y>>3)*QPStride + (x>>3)]:
				QPs[(y>>4)*QPStride + (x>>4)];
			// with level fixing on luma, scale QP by the low word of packedYScale
			if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
#ifdef HAVE_MMX
			// broadcast QP into pQPb
			asm volatile(
				"movd %0, %%mm7					\n\t"
				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
				"movq %%mm7, pQPb				\n\t"
				: : "r" (QP)
			);
#endif


			if(y + 12 < height)
			{
#ifdef MORE_TIMEING
				T0= rdtsc();
#endif

#ifdef HAVE_MMX2
				prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
				prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
				prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
				prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
#elif defined(HAVE_3DNOW)
//FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ...
/*				prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
				prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
				prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
				prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
*/
#endif
				// one sample per block feeds the luma histogram used by later calls
				if(!isColor) yHistogram[ srcBlock[0] ]++;

				blockCopy(vertBlock + dstStride*2, dstStride,
					vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);


#ifdef MORE_TIMEING
				T1= rdtsc();
				memcpyTime+= T1-T0;
				T0=T1;
#endif
				if(mode & V_DEBLOCK)
				{
					if(mode & V_RK1_FILTER)
						vertRK1Filter(vertBlock, stride, QP);
					else if(mode & V_X1_FILTER)
						vertX1Filter(vertBlock, stride, QP);
					else
					{
						if( isVertDC(vertBlock, stride))
						{
							if(isVertMinMaxOk(vertBlock, stride, QP))
								doVertLowPass(vertBlock, stride, QP);
						}
						else
							doVertDefFilter(vertBlock, stride, QP);
					}
				}
#ifdef MORE_TIMEING
				T1= rdtsc();
				vertTime+= T1-T0;
				T0=T1;
#endif
			}
			else
				// near the bottom of the plane: only copy 4 lines, no vertical filter
				blockCopy(vertBlock + dstStride*1, dstStride,
					vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);


			// every block except the first of a row (x < width is already
			// guaranteed by the loop condition)
			if(x >= 8)
			{
#ifdef MORE_TIMEING
				T0= rdtsc();
#endif
				if(mode & H_DEBLOCK)
				{
					if(mode & H_X1_FILTER)
						horizX1Filter(dstBlock-4, stride, QP);
					else
					{
						if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
						{
							if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
								doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
						}
						else
							doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
					}
				}
#ifdef MORE_TIMEING
				T1= rdtsc();
				horizTime+= T1-T0;
				T0=T1;
#endif
				dering(dstBlock - 9 - stride, stride, QP);
			}
			else if(y!=0)
				dering(dstBlock - stride*9 + width-9, stride, QP);
			//FIXME dering filter will not be applied to last block (bottom right)


			dstBlock+=8;
			srcBlock+=8;
			vertBlock+=8;
			vertSrcBlock+=8;
		}
	}
	// leave MMX state so that floating point code can run again
#ifdef HAVE_3DNOW
	asm volatile("femms");
#elif defined (HAVE_MMX)
	asm volatile("emms");
#endif

#ifdef TIMEING
	// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
	sumTime= rdtsc() - sumTime;
	if(!isColor)
		printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
			(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
			(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
			, black, white);
#endif
}