Skip to content
Snippets Groups Projects
postprocess.c 94.4 KiB
Newer Older
  • Learn to ignore specific revisions
  • 	for(y=0; y<height; y+=BLOCK_SIZE)
    
    	{
    		//1% speedup if these are here instead of the inner loop
    		uint8_t *srcBlock= &(src[y*srcStride]);
    		uint8_t *dstBlock= &(dst[y*dstStride]);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef ARCH_X86
    		int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
    		int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
    		int QPFrac= QPDelta;
    
    		uint8_t *tempBlock1= tempBlocks;
    		uint8_t *tempBlock2= tempBlocks + 8;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    		/* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards?
    		   if not, then use a temporary buffer */
    
    		if(y+15 >= height)
    		{
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    			/* copy from line 5 to 12 of src, these will be copied with
    
    			   blockcopy to dst later */
    			memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
    				srcStride*MAX(height-y-5, 0) );
    
    			/* duplicate last line to fill the void up to line 12 */
    			if(y+12 >= height)
    			{
    				int i;
    				for(i=height-y; i<=12; i++)
    					memcpy(tempSrc + srcStride*i,
    						src + srcStride*(height-1), srcStride);
    			}
    
    
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    			/* copy up to 6 lines of dst */
    			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) );
    			dstBlock= tempDst + dstStride;
    
    			srcBlock= tempSrc;
    		}
    
    		// From this point on it is guaranteed that we can read and write 16 lines downward
    
    		// finish 1 block before the next, otherwise we might have a problem
    		// with the L1 cache of the P4 ... or only a few blocks at a time or something
    
    		for(x=0; x<width; x+=BLOCK_SIZE)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef ARCH_X86
    			int QP= *QPptr;
    			asm volatile(
    				"addl %2, %1		\n\t"
    				"sbbl %%eax, %%eax	\n\t"
    				"shll $2, %%eax		\n\t"
    				"subl %%eax, %0		\n\t"
    				: "+r" (QPptr), "+m" (QPFrac)
    				: "r" (QPDelta)
    				: "%eax"
    			);
    #else
    			int QP= isColor ?
                                    QPs[(y>>3)*QPStride + (x>>3)]:
                                    QPs[(y>>4)*QPStride + (x>>4)];
    #endif
    			if(!isColor)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    			{
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    				QP= (QP* QPCorrecture)>>8;
    				yHistogram[ srcBlock[srcStride*4 + 4] ]++;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    			}
    
    			asm volatile(
    				"movd %0, %%mm7					\n\t"
    				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
    				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
    				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
    				"movq %%mm7, pQPb				\n\t"
    				: : "r" (QP)
    			);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef MORE_TIMING
    
    			T0= rdtsc();
    
    #ifdef HAVE_MMX2
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /*
    
    			prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
    			prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
    			prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
    			prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    */
    /*
    			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
    			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
    			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
    			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
    */
    
    			asm(
    				"movl %4, %%eax			\n\t"
    				"shrl $2, %%eax			\n\t"
    				"andl $6, %%eax			\n\t"
    				"addl $5, %%eax			\n\t"
    				"movl %%eax, %%ebx		\n\t"
    				"imul %1, %%eax			\n\t"
    				"imul %3, %%ebx			\n\t"
    				"prefetchnta 32(%%eax, %0)	\n\t"
    				"prefetcht0 32(%%ebx, %2)	\n\t"
    				"addl %1, %%eax			\n\t"
    				"addl %3, %%ebx			\n\t"
    				"prefetchnta 32(%%eax, %0)	\n\t"
    				"prefetcht0 32(%%ebx, %2)	\n\t"
    			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
    			"m" (x)
    			: "%eax", "%ebx"
    			);
    
    
    #elif defined(HAVE_3DNOW)
    //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
    
    /*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
    			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
    			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
    			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
    
    #ifdef PP_FUNNY_STRIDE
    
    			//can we mess with a 8x16 block, if not use a temp buffer, yes again
    			if(x+7 >= width)
    			{
    				int i;
    				dstBlockPtrBackup= dstBlock;
    				srcBlockPtrBackup= srcBlock;
    
    				for(i=0;i<BLOCK_SIZE*2; i++)
    				{
    					memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
    					memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
    				}
    
    				dstBlock= tempDstBlock;
    				srcBlock= tempSrcBlock;
    			}
    
    			blockCopy(dstBlock + dstStride*5, dstStride,
    				srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
    
    			if(mode & LINEAR_IPOL_DEINT_FILTER)
    				deInterlaceInterpolateLinear(dstBlock, dstStride);
    			else if(mode & LINEAR_BLEND_DEINT_FILTER)
    				deInterlaceBlendLinear(dstBlock, dstStride);
    			else if(mode & MEDIAN_DEINT_FILTER)
    				deInterlaceMedian(dstBlock, dstStride);
    			else if(mode & CUBIC_IPOL_DEINT_FILTER)
    				deInterlaceInterpolateCubic(dstBlock, dstStride);
    /*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
    				deInterlaceBlendCubic(dstBlock, dstStride);
    
    			/* only deblock if we have 2 blocks */
    			if(y + 8 < height)
    			{
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef MORE_TIMING
    
    				T1= rdtsc();
    				memcpyTime+= T1-T0;
    				T0=T1;
    #endif
    
    				if(mode & V_RK1_FILTER)
    					vertRK1Filter(dstBlock, stride, QP);
    				else if(mode & V_X1_FILTER)
    					vertX1Filter(dstBlock, stride, QP);
    				else if(mode & V_DEBLOCK)
    
    						if(isVertMinMaxOk(dstBlock, stride, QP))
    							doVertLowPass(dstBlock, stride, QP);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef MORE_TIMING
    
    				T1= rdtsc();
    				vertTime+= T1-T0;
    				T0=T1;
    #endif
    			}
    
    #ifdef HAVE_MMX
    			transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
    #endif
    
    			/* check if we have a previous block to deblock it with dstBlock */
    
    			if(x - 8 >= 0)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef MORE_TIMING
    
    #ifdef HAVE_MMX
    				if(mode & H_RK1_FILTER)
    					vertRK1Filter(tempBlock1, 16, QP);
    				else if(mode & H_X1_FILTER)
    					vertX1Filter(tempBlock1, 16, QP);
    				else if(mode & H_DEBLOCK)
    				{
    					if( isVertDC(tempBlock1, 16))
    					{
    						if(isVertMinMaxOk(tempBlock1, 16, QP))
    							doVertLowPass(tempBlock1, 16, QP);
    					}
    					else
    						doVertDefFilter(tempBlock1, 16, QP);
    				}
    
    				transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
    
    #else
    
    				if(mode & H_X1_FILTER)
    					horizX1Filter(dstBlock-4, stride, QP);
    				else if(mode & H_DEBLOCK)
    
    						if(isHorizMinMaxOk(dstBlock-4, stride, QP))
    							doHorizLowPass(dstBlock-4, stride, QP);
    
    						doHorizDefFilter(dstBlock-4, stride, QP);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef MORE_TIMING
    
    				T1= rdtsc();
    				horizTime+= T1-T0;
    				T0=T1;
    #endif
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    				if(mode & DERING)
    				{
    				//FIXME filter first line
    					if(y>0) dering(dstBlock - stride - 8, stride, QP);
    				}
    			}
    			else if(mode & DERING)
    			{
    			 //FIXME y+15 is required because of the tempBuffer thing -> bottom right block isn't filtered
    					if(y > 8 && y+15 < height) dering(dstBlock - stride*9 + width - 8, stride, QP);
    
    #ifdef PP_FUNNY_STRIDE
    
    			/* did we use a tmp-block buffer */
    			if(x+7 >= width)
    			{
    				int i;
    				dstBlock= dstBlockPtrBackup;
    				srcBlock= srcBlockPtrBackup;
    
    				for(i=0;i<BLOCK_SIZE*2; i++)
    				{
    					memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
    				}
    			}
    
    			tmpXchg= tempBlock1;
    			tempBlock1= tempBlock2;
    			tempBlock2 = tmpXchg;
    
    		}
    
    		/* did we use a tmp buffer */
    
    		if(y+15 >= height)
    
    		{
    			uint8_t *dstBlock= &(dst[y*dstStride]);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    			memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) );
    
    #ifdef HAVE_3DNOW
    	asm volatile("femms");
    #elif defined (HAVE_MMX)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef TIMING
    
    	// FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
    	sumTime= rdtsc() - sumTime;
    	if(!isColor)
    		printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
    
    			(int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
    			(int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)