Skip to content
Snippets Groups Projects
postprocess.c 61.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • 					memcpy(	&(dst[dstStride*i]),
    						&(src[srcStride*i]), BLOCK_SIZE);
    #endif
    
    }
    
    
    /**
     * Filters an array of bytes (Y, U, or V values)
     */
    void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
    
    	QP_STORE_T QPs[], int QPStride, int isColor, int mode)
    
    	int x,y;
    	/* we need 64 bits here, otherwise we're going to have a problem
    	   after watching a black picture for 5 hours */
    	static uint64_t *yHistogram= NULL;
    	int black=0, white=255; // blackest black and whitest white in the picture
    
    
    #ifdef TIMEING
    	long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
    	sumTime= rdtsc();
    #endif
    
    	if(!yHistogram)
    	{
    
    		int i;
    		yHistogram= (uint64_t*)malloc(8*256);
    		for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
    
    		int i;
    		static int framenum= -1;
    		uint64_t maxClipped;
    		uint64_t clipped;
    		double scale;
    
    		framenum++;
    		if(framenum == 1) yHistogram[0]= width*height/64*15/256;
    
    		for(i=0; i<256; i++)
    		{
    
    //			printf("%d ", yHistogram[i]);
    		}
    //		printf("\n\n");
    
    		/* we always get a completely black picture first */
    
    		maxClipped= (uint64_t)(sum * maxClippedThreshold);
    
    		for(black=255; black>0; black--)
    		{
    			if(clipped < maxClipped) break;
    			clipped-= yHistogram[black];
    		}
    
    		clipped= sum;
    		for(white=0; white<256; white++)
    		{
    			if(clipped < maxClipped) break;
    			clipped-= yHistogram[white];
    		}
    
    		// we can't handle negative corrections
    		packedYOffset= MAX(black - minAllowedY, 0);
    		packedYOffset|= packedYOffset<<32;
    		packedYOffset|= packedYOffset<<16;
    		packedYOffset|= packedYOffset<<8;
    
    
    		scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
    
    		packedYScale|= packedYScale<<32;
    		packedYScale|= packedYScale<<16;
    	}
    	else
    	{
    		packedYScale= 0x0100010001000100LL;
    		packedYOffset= 0;
    	}
    
    
    	for(x=0; x<width; x+=BLOCK_SIZE)
    		blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
    
    	for(y=0; y<height; y+=BLOCK_SIZE)
    
    	{
    		//1% speedup if these are here instead of the inner loop
    		uint8_t *srcBlock= &(src[y*srcStride]);
    		uint8_t *dstBlock= &(dst[y*dstStride]);
    		uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
    		uint8_t *vertBlock= &(dstBlock[dstStride*3]);
    
    		// finish 1 block before the next, otherwise we might have a problem
    		// with the L1 cache of the P4 ... or only a few blocks at a time or something
    
    		for(x=0; x<width; x+=BLOCK_SIZE)
    
    			int QP= isColor ?
    				QPs[(y>>3)*QPStride + (x>>3)]:
    
    				QPs[(y>>4)*QPStride + (x>>4)];
    			if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
    
    #ifdef HAVE_MMX
    		asm volatile(
    			"movd %0, %%mm7					\n\t"
    			"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
    			"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
    			"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
    			"movq %%mm7, pQPb				\n\t"
    			: : "r" (QP)
    		);
    #endif
    
    
    			if(y + 12 < height)
    			{
    #ifdef MORE_TIMEING
    				T0= rdtsc();
    #endif
    
    
    #ifdef HAVE_MMX2
    
    				prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
    				prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
    				prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
    				prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
    
    #elif defined(HAVE_3DNOW)
    //FIXME check if this is faster on a 3DNow! chip, or if it's faster without the prefetch, or ...
    /*				prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
    				prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
    				prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
    				prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
    */
    
    #endif
    				if(!isColor) yHistogram[ srcBlock[0] ]++;
    
    				blockCopy(vertBlock + dstStride*2, dstStride,
    
    					vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
    
    
    
    #ifdef MORE_TIMEING
    				T1= rdtsc();
    				memcpyTime+= T1-T0;
    				T0=T1;
    #endif
    
    				if(mode & V_DEBLOCK)
    
    					if(mode & V_RK1_FILTER)
    						vertRK1Filter(vertBlock, stride, QP);
    					else if(mode & V_X1_FILTER)
    
    						vertX1Filter(vertBlock, stride, QP);
    					else
    					{
    						if( isVertDC(vertBlock, stride))
    						{
    							if(isVertMinMaxOk(vertBlock, stride, QP))
    								doVertLowPass(vertBlock, stride, QP);
    						}
    						else
    							doVertDefFilter(vertBlock, stride, QP);
    					}
    
    				}
    #ifdef MORE_TIMEING
    				T1= rdtsc();
    				vertTime+= T1-T0;
    				T0=T1;
    #endif
    			}
    			else
    
    				blockCopy(vertBlock + dstStride*1, dstStride,
    					vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
    
    
    
    			if(x - 8 >= 0 && x<width)
    			{
    #ifdef MORE_TIMEING
    				T0= rdtsc();
    #endif
    
    				if(mode & H_DEBLOCK)
    
    					if(mode & H_X1_FILTER)
    						horizX1Filter(dstBlock-4, stride, QP);
    					else
    
    						if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
    						{
    							if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
    								doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
    						}
    						else
    							doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
    
    				}
    #ifdef MORE_TIMEING
    				T1= rdtsc();
    				horizTime+= T1-T0;
    				T0=T1;
    #endif
    				dering(dstBlock - 9 - stride, stride, QP);
    			}
    			else if(y!=0)
    				dering(dstBlock - stride*9 + width-9, stride, QP);
    			//FIXME dering filter will not be applied to last block (bottom right)
    
    
    			dstBlock+=8;
    			srcBlock+=8;
    			vertBlock+=8;
    			vertSrcBlock+=8;
    		}
    	}
    
    #ifdef HAVE_3DNOW
    	asm volatile("femms");
    #elif defined (HAVE_MMX)
    
    	asm volatile("emms");
    #endif
    
    #ifdef TIMEING
    	// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
    	sumTime= rdtsc() - sumTime;
    	if(!isColor)
    		printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
    
    			(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
    			(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)