postprocess_template.c

// 100 opcodes
            "psllw $3, %%mm2                        \n\t" // 8QP
            "movq %%mm2, %%mm3                      \n\t" // 8QP
            "pcmpgtw %%mm4, %%mm2                   \n\t"
            "pcmpgtw %%mm5, %%mm3                   \n\t"
            "pand %%mm2, %%mm4                      \n\t"
            "pand %%mm3, %%mm5                      \n\t"


            "psubusw %%mm0, %%mm4                   \n\t" // hd
            "psubusw %%mm1, %%mm5                   \n\t" // ld


            "movq "MANGLE(w05)", %%mm2              \n\t" // 5
            "pmullw %%mm2, %%mm4                    \n\t"
            "pmullw %%mm2, %%mm5                    \n\t"
            "movq "MANGLE(w20)", %%mm2              \n\t" // 32
            "paddw %%mm2, %%mm4                     \n\t"
            "paddw %%mm2, %%mm5                     \n\t"
            "psrlw $6, %%mm4                        \n\t"
            "psrlw $6, %%mm5                        \n\t"

            "movq 16(%4), %%mm0                     \n\t" // L3 - L4
            "movq 24(%4), %%mm1                     \n\t" // H3 - H4

            "pxor %%mm2, %%mm2                      \n\t"
            "pxor %%mm3, %%mm3                      \n\t"

            "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
            "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
            "pxor %%mm2, %%mm0                      \n\t"
            "pxor %%mm3, %%mm1                      \n\t"
            "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
            "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
            "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
            "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2

            "pxor %%mm6, %%mm2                      \n\t"
            "pxor %%mm7, %%mm3                      \n\t"
            "pand %%mm2, %%mm4                      \n\t"
            "pand %%mm3, %%mm5                      \n\t"

#if TEMPLATE_PP_MMXEXT
            "pminsw %%mm0, %%mm4                    \n\t"
            "pminsw %%mm1, %%mm5                    \n\t"
#else
            "movq %%mm4, %%mm2                      \n\t"
            "psubusw %%mm0, %%mm2                   \n\t"
            "psubw %%mm2, %%mm4                     \n\t"
            "movq %%mm5, %%mm2                      \n\t"
            "psubusw %%mm1, %%mm2                   \n\t"
            "psubw %%mm2, %%mm5                     \n\t"
#endif
            "pxor %%mm6, %%mm4                      \n\t"
            "pxor %%mm7, %%mm5                      \n\t"
            "psubw %%mm6, %%mm4                     \n\t"
            "psubw %%mm7, %%mm5                     \n\t"
            "packsswb %%mm5, %%mm4                  \n\t"
            "movq %3, %%mm1                         \n\t"
            "pandn %%mm4, %%mm1                     \n\t"
            "movq (%0), %%mm0                       \n\t"
            "paddb   %%mm1, %%mm0                   \n\t"
            "movq %%mm0, (%0)                       \n\t"
            "movq (%0, %1), %%mm0                   \n\t"
            "psubb %%mm1, %%mm0                     \n\t"
            "movq %%mm0, (%0, %1)                   \n\t"

            : "+r" (temp_src)
            : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
              NAMED_CONSTRAINTS_ADD(w05,w20)
            : "%"FF_REG_a
        );
    }
/*if(step==16){
    STOP_TIMER("step16")
}else{
    STOP_TIMER("stepX")
}
    } */
}
#endif //TEMPLATE_PP_MMX

// Forward declaration; the definition appears further down in this file.
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                                const int8_t QPs[], int QPStride, int isColor, PPContext *c);

// REAL_SCALED_CPY has a different body depending on TEMPLATE_PP_MMXEXT, so any
// earlier definition is dropped first (presumably because this template is
// included more than once with different settings).
#undef REAL_SCALED_CPY
#undef SCALED_CPY

/**
 * Copy a block of 8 rows from src to dst and optionally fix the black level.
 *
 * levelFix == 0 -> do not touch the brightness & contrast (plain copy).
 * levelFix != 0 -> the MMX implementation rescales every sample with the
 *                  packed offset/scale pair; the C fallback still performs
 *                  a plain copy.
 *
 * The MMX paths move 8 bytes per row (one movq); the C fallback copies
 * BLOCK_SIZE bytes per row.
 *
 * @param dst                  destination of the block
 * @param dstStride            distance in bytes between destination rows
 * @param src                  source of the block
 * @param srcStride            distance in bytes between source rows
 * @param levelFix             non-zero to request the level correction
 * @param packedOffsetAndScale two consecutive 64-bit values, the packed
 *                             offset followed by the packed scale; only
 *                             read by the MMX level-fix path
 */
static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
                                     int levelFix, int64_t *packedOffsetAndScale)
{
#if !TEMPLATE_PP_MMX || !HAVE_6REGS
    int i;
#endif
    if(levelFix){
#if TEMPLATE_PP_MMX && HAVE_6REGS
    // Operands: %0/%1 = packedOffsetAndScale (tied to the "a" register),
    // %2 = src, %3 = dst, %4 = srcStride, %5 = dstStride.
    __asm__ volatile(
        "movq (%%"FF_REG_a"), %%mm2     \n\t" // packedYOffset
        "movq 8(%%"FF_REG_a"), %%mm3    \n\t" // packedYScale
        "lea (%2,%4), %%"FF_REG_a"      \n\t" // a = src + srcStride
        "lea (%3,%5), %%"FF_REG_d"      \n\t" // d = dst + dstStride
        "pxor %%mm4, %%mm4              \n\t" // mm4 = 0, used for zero extension
#if TEMPLATE_PP_MMXEXT
// Two rows per invocation: each byte is unpacked with itself, multiplied
// (high half, unsigned) by the scale, the offset is subtracted and the result
// is packed back to bytes with unsigned saturation.
#define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                                \
        "movq " #src1 ", %%mm0          \n\t"\
        "movq " #src1 ", %%mm5          \n\t"\
        "movq " #src2 ", %%mm1          \n\t"\
        "movq " #src2 ", %%mm6          \n\t"\
        "punpcklbw %%mm0, %%mm0         \n\t"\
        "punpckhbw %%mm5, %%mm5         \n\t"\
        "punpcklbw %%mm1, %%mm1         \n\t"\
        "punpckhbw %%mm6, %%mm6         \n\t"\
        "pmulhuw %%mm3, %%mm0           \n\t"\
        "pmulhuw %%mm3, %%mm5           \n\t"\
        "pmulhuw %%mm3, %%mm1           \n\t"\
        "pmulhuw %%mm3, %%mm6           \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm2, %%mm5             \n\t"\
        "psubw %%mm2, %%mm1             \n\t"\
        "psubw %%mm2, %%mm6             \n\t"\
        "packuswb %%mm5, %%mm0          \n\t"\
        "packuswb %%mm6, %%mm1          \n\t"\
        "movq %%mm0, " #dst1 "          \n\t"\
        "movq %%mm1, " #dst2 "          \n\t"\

#else //TEMPLATE_PP_MMXEXT
// Two rows per invocation: bytes are zero-extended to words, the offset is
// subtracted, the value is shifted left by 6 and multiplied (high half,
// signed) by the scale, then packed back with unsigned saturation.
#define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                        \
        "movq " #src1 ", %%mm0          \n\t"\
        "movq " #src1 ", %%mm5          \n\t"\
        "punpcklbw %%mm4, %%mm0         \n\t"\
        "punpckhbw %%mm4, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0             \n\t"\
        "psubw %%mm2, %%mm5             \n\t"\
        "movq " #src2 ", %%mm1          \n\t"\
        "psllw $6, %%mm0                \n\t"\
        "psllw $6, %%mm5                \n\t"\
        "pmulhw %%mm3, %%mm0            \n\t"\
        "movq " #src2 ", %%mm6          \n\t"\
        "pmulhw %%mm3, %%mm5            \n\t"\
        "punpcklbw %%mm4, %%mm1         \n\t"\
        "punpckhbw %%mm4, %%mm6         \n\t"\
        "psubw %%mm2, %%mm1             \n\t"\
        "psubw %%mm2, %%mm6             \n\t"\
        "psllw $6, %%mm1                \n\t"\
        "psllw $6, %%mm6                \n\t"\
        "pmulhw %%mm3, %%mm1            \n\t"\
        "pmulhw %%mm3, %%mm6            \n\t"\
        "packuswb %%mm5, %%mm0          \n\t"\
        "packuswb %%mm6, %%mm1          \n\t"\
        "movq %%mm0, " #dst1 "          \n\t"\
        "movq %%mm1, " #dst2 "          \n\t"\

#endif //TEMPLATE_PP_MMXEXT
#define SCALED_CPY(src1, src2, dst1, dst2)\
   REAL_SCALED_CPY(src1, src2, dst1, dst2)

// rows 0 and 1
SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
// rows 2 and 3
SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
// rows 4 and 5
SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
        "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a"        \n\t" // a = src + 5*srcStride
        "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d"        \n\t" // d = dst + 5*dstStride
// rows 6 and 7
SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))


        : "=&a" (packedOffsetAndScale)
        : "0" (packedOffsetAndScale),
        "r"(src),
        "r"(dst),
        "r" ((x86_reg)srcStride),
        "r" ((x86_reg)dstStride)
        // "memory": the asm reads src / packedOffsetAndScale and writes dst
        // through plain register operands, which the compiler cannot see.
        : "%"FF_REG_d, "memory"
    );
#else //TEMPLATE_PP_MMX && HAVE_6REGS
    // No level correction in the C fallback: the block is copied unchanged.
    for(i=0; i<8; i++)
        memcpy( &(dst[dstStride*i]),
                &(src[srcStride*i]), BLOCK_SIZE);
#endif //TEMPLATE_PP_MMX && HAVE_6REGS
    }else{
#if TEMPLATE_PP_MMX && HAVE_6REGS
    // Operands: %0 = src, %1 = dst, %2 = srcStride, %3 = dstStride.
    __asm__ volatile(
        "lea (%0,%2), %%"FF_REG_a"      \n\t" // a = src + srcStride
        "lea (%1,%3), %%"FF_REG_d"      \n\t" // d = dst + dstStride

// Plain copy of two rows, 8 bytes each.
#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2)                              \
        "movq " #src1 ", %%mm0          \n\t"\
        "movq " #src2 ", %%mm1          \n\t"\
        "movq %%mm0, " #dst1 "          \n\t"\
        "movq %%mm1, " #dst2 "          \n\t"\

#define SIMPLE_CPY(src1, src2, dst1, dst2)\
   REAL_SIMPLE_CPY(src1, src2, dst1, dst2)

// rows 0 and 1
SIMPLE_CPY((%0)       , (%0, %2)          , (%1)       , (%1, %3))
// rows 2 and 3
SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
// rows 4 and 5
SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
        "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a"        \n\t" // a = src + 5*srcStride
        "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d"        \n\t" // d = dst + 5*dstStride
// rows 6 and 7
SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))

        : : "r" (src),
        "r" (dst),
        "r" ((x86_reg)srcStride),
        "r" ((x86_reg)dstStride)
        // "memory": src is read and dst is written behind the compiler's back.
        : "%"FF_REG_a, "%"FF_REG_d, "memory"
    );
#else //TEMPLATE_PP_MMX && HAVE_6REGS
    for(i=0; i<8; i++)
        memcpy( &(dst[dstStride*i]),
                &(src[srcStride*i]), BLOCK_SIZE);
#endif //TEMPLATE_PP_MMX && HAVE_6REGS
    }
}

/**
 * Duplicate the given 8 src pixels 5 times upward.
 *
 * The 8 bytes at src are written to the five rows directly above it, i.e.
 * to src - stride, src - 2*stride, ... , src - 5*stride.
 *
 * @param src    the 8 bytes to replicate; the 5 rows above them are overwritten
 * @param stride distance in bytes between vertically adjacent rows
 */
static inline void RENAME(duplicate)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_MMX
    // %0 = src (modified by the asm), %1 = -stride.
    __asm__ volatile(
        "movq (%0), %%mm0               \n\t"
        "movq %%mm0, (%0, %1, 4)        \n\t" // row -4
        "add %1, %0                     \n\t" // %0 now points at row -1
        "movq %%mm0, (%0)               \n\t" // row -1
        "movq %%mm0, (%0, %1)           \n\t" // row -2
        "movq %%mm0, (%0, %1, 2)        \n\t" // row -3
        "movq %%mm0, (%0, %1, 4)        \n\t" // row -5
        : "+r" (src)
        : "r" ((x86_reg)-stride)
        // "memory": the stores above go through a register operand, so the
        // compiler has to be told that memory is modified.
        : "memory"
    );
#else
    int i;
    uint8_t *p=src;
    for(i=0; i<5; i++){
        p-= stride;
        memcpy(p, src, 8);
    }
#endif
}

/*
 * Cache prefetch hints, named after the x86 prefetch instructions
 * (nta = non-temporal, t0/t1/t2 = decreasing degree of temporal locality).
 * One of three implementations is selected at compile time:
 *  - ARCH_X86 && TEMPLATE_PP_MMXEXT: the matching instruction via inline asm,
 *  - !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2): __builtin_prefetch(),
 *  - otherwise: no-ops.
 * All of them only hint at a future read of *p and never dereference it in C.
 */
#if ARCH_X86 && TEMPLATE_PP_MMXEXT
static inline void RENAME(prefetchnta)(const void *p)
{
    __asm__ volatile(   "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}

static inline void RENAME(prefetcht0)(const void *p)
{
    __asm__ volatile(   "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}

static inline void RENAME(prefetcht1)(const void *p)
{
    __asm__ volatile(   "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}

static inline void RENAME(prefetcht2)(const void *p)
{
    __asm__ volatile(   "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
/*
 * __builtin_prefetch(addr, rw, locality): rw == 0 requests a read prefetch;
 * locality ranges from 0 (no temporal locality) to 3 (high temporal locality,
 * keep in all cache levels). The mapping therefore is
 * nta -> 0, t2 -> 1, t1 -> 2, t0 -> 3.
 */
static inline void RENAME(prefetchnta)(const void *p)
{
    __builtin_prefetch(p,0,0);
}
static inline void RENAME(prefetcht0)(const void *p)
{
    __builtin_prefetch(p,0,3);
}
static inline void RENAME(prefetcht1)(const void *p)
{
    __builtin_prefetch(p,0,2);
}
static inline void RENAME(prefetcht2)(const void *p)
{
    __builtin_prefetch(p,0,1);
}
#else
/* No prefetch mechanism selected: the hints compile to nothing. */
static inline void RENAME(prefetchnta)(const void *p)
{
    return;
}
static inline void RENAME(prefetcht0)(const void *p)
{
    return;
}
static inline void RENAME(prefetcht1)(const void *p)
{
    return;
}
static inline void RENAME(prefetcht2)(const void *p)
{
    return;
}
#endif
/**
 * Filter array of bytes (Y or U or V values)
 *
 * The plane is processed in bands of BLOCK_SIZE rows. Within a band the
 * blocks are handled in groups spanning up to 32 columns, and each group
 * goes through four passes: collect the per-block QP values, copy (and
 * optionally deinterlace) the source rows into dst, run the vertical
 * deblocking filters, then run the horizontal deblocking, dering and
 * temporal noise filters. Which filters run is decided by the mode bits.
 *
 * The function works on a stack copy of *c2 and writes that copy back to
 * *c2 right before returning.
 *
 * @param src       source plane
 * @param srcStride distance in bytes between source rows; may be negative
 * @param dst       destination plane
 * @param dstStride distance in bytes between destination rows; may be negative
 * @param width     number of bytes per row to process
 * @param height    number of rows to process
 * @param QPs       quantiser table; the entry used for position (x, y) is
 *                  QPs[(y>>qpVShift)*QPStride + (x>>qpHShift)]
 * @param QPStride  row stride of the QPs table
 * @param isColor   non-zero for a chroma plane: selects ppMode.chromMode
 *                  instead of ppMode.lumMode, applies the chroma subsampling
 *                  shifts to the QP lookup and skips the luma histogram /
 *                  level computation
 * @param c2        postprocessing context, read on entry and updated on exit
 */
static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                                const int8_t QPs[], int QPStride, int isColor, PPContext *c2)
{
    DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
    int x,y;
#ifdef TEMPLATE_PP_TIME_MODE
    const int mode= TEMPLATE_PP_TIME_MODE;
#else
    const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
#endif
    int black=0, white=255; // blackest black and whitest white in the picture
    // QP correction factor in 16.16 fixed point; 256*256 means 1.0
    int QPCorrecture= 256*256;

    // Row offset (relative to the current band) at which blockCopy places
    // freshly copied source rows; derived from the enabled filters below.
    int copyAhead;
#if TEMPLATE_PP_MMX
    int i;
#endif

    // Shifts that convert a pixel coordinate into a QPs table index.
    const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
    const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;

    //FIXME remove
    uint64_t * const yHistogram= c.yHistogram;
    // For non-positive strides the scratch pointers are advanced by 23 rows,
    // presumably so that stepping "downward" by the stride stays inside the
    // scratch buffers.
    uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
    uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32;
    //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;

    // Only warn; processing continues regardless.
    if (mode & VISUALIZE){
        if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
            av_log(c2, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
        }
    }

#if TEMPLATE_PP_MMX
    // Precompute the 57 DC offset/threshold entries derived from baseDcDiff.
    // The multiplication by 0x0101010101010101 copies the value into every
    // byte of the 64-bit entry (as long as the value fits into one byte).
    for(i=0; i<57; i++){
        int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
        int threshold= offset*2 + 1;
        c.mmxDcOffset[i]= 0x7F - offset;
        c.mmxDcThreshold[i]= 0x7F - threshold;
        c.mmxDcOffset[i]*= 0x0101010101010101LL;
        c.mmxDcThreshold[i]*= 0x0101010101010101LL;
    }
#endif

    // Pick copyAhead from the first matching filter group; the value is
    // turned into an offset relative to the 8-row band right afterwards.
    if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
    else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
            || (mode & FFMPEG_DEINT_FILTER)
            || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
    else if(   (mode & V_DEBLOCK)
            || (mode & LINEAR_IPOL_DEINT_FILTER)
            || (mode & MEDIAN_DEINT_FILTER)
            || (mode & V_A_DEBLOCK)) copyAhead=13;
    else if(mode & V_X1_FILTER) copyAhead=11;
//    else if(mode & V_RK1_FILTER) copyAhead=10;
    else if(mode & DERING) copyAhead=9;
    else copyAhead=8;

    copyAhead-= 8;

    if(!isColor){
        // Luma plane: derive the black/white levels from the histogram that
        // was accumulated in yHistogram and compute the packed level-fix
        // parameters from them.
        uint64_t sum= 0;
        int i;
        uint64_t maxClipped;
        uint64_t clipped;
        AVRational scale;

        c.frameNum++;
        // first frame is fscked so we ignore it
        if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256;

        for(i=0; i<256; i++){
            sum+= yHistogram[i];
        }

        /* We always get a completely black picture first. */
        maxClipped= av_rescale(sum, c.ppMode.maxClippedThreshold.num, c.ppMode.maxClippedThreshold.den);

        // Walk down from 255, removing one histogram bin per step, until
        // fewer than maxClipped samples remain.
        clipped= sum;
        for(black=255; black>0; black--){
            if(clipped < maxClipped) break;
            clipped-= yHistogram[black];
        }

        // Same walk in the other direction, starting from 0.
        clipped= sum;
        for(white=0; white<256; white++){
            if(clipped < maxClipped) break;
            clipped-= yHistogram[white];
        }

        // Ratio of the allowed output range to the measured input range.
        // NOTE(review): white - black is not checked for being positive here.
        scale = (AVRational){c.ppMode.maxAllowedY - c.ppMode.minAllowedY, white - black};

#if TEMPLATE_PP_MMXEXT
        // Scale with 8 fractional bits; the offset is pre-multiplied by it.
        c.packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
        c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
#else
        // Scale with 10 fractional bits; the offset is applied unscaled.
        c.packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
        c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
#endif

        // Replicate the low 16 bits into all four 16-bit lanes.
        c.packedYOffset|= c.packedYOffset<<32;
        c.packedYOffset|= c.packedYOffset<<16;

        c.packedYScale|= c.packedYScale<<32;
        c.packedYScale|= c.packedYScale<<16;

        if(mode & LEVEL_FIX)        QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
        else                        QPCorrecture= 256*256;
    }else{
        // Chroma plane: fixed scale, zero offset, no QP correction.
        c.packedYScale= 0x0100010001000100LL;
        c.packedYOffset= 0;
        QPCorrecture= 256*256;
    }

    /* copy & deinterlace first row of blocks */
    y=-BLOCK_SIZE;
    {
        // srcBlock points BLOCK_SIZE rows above src; blockCopy below adds 8
        // rows again, so source rows 0..7 land at tempDst + 9*dstStride.
        const uint8_t *srcBlock= &(src[y*srcStride]);
        uint8_t *dstBlock= tempDst + dstStride;

        // From this point on it is guaranteed that we can read and write 16 lines downward
        // finish 1 block before the next otherwise we might have a problem
        // with the L1 Cache of the P4 ... or only a few blocks at a time or something
        for(x=0; x<width; x+=BLOCK_SIZE){
            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);

            RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
                              srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);

            // Replicate the first copied row into the rows above it.
            RENAME(duplicate)(dstBlock + dstStride*8, dstStride);

            if(mode & LINEAR_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
            else if(mode & LINEAR_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & MEDIAN_DEINT_FILTER)
                RENAME(deInterlaceMedian)(dstBlock, dstStride);
            else if(mode & CUBIC_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
            else if(mode & FFMPEG_DEINT_FILTER)
                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & LOWPASS5_DEINT_FILTER)
                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
*/
            dstBlock+=8;
            srcBlock+=8;
        }
        // Move the first copyAhead processed rows from the scratch buffer
        // into dst, either in one call or row by row when rows are padded.
        if(width==FFABS(dstStride))
            linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
        else{
            int i;
            for(i=0; i<copyAhead; i++){
                memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
            }
        }
    }

    // Main loop: one iteration per band of BLOCK_SIZE rows.
    for(y=0; y<height; y+=BLOCK_SIZE){
        //1% speedup if these are here instead of the inner loop
        const uint8_t *srcBlock= &(src[y*srcStride]);
        uint8_t *dstBlock= &(dst[y*dstStride]);
#if TEMPLATE_PP_MMX
        // Two scratch blocks used for the transposed horizontal filtering;
        // they are swapped after every block.
        uint8_t *tempBlock1= c.tempBlocks;
        uint8_t *tempBlock2= c.tempBlocks + 8;
#endif
        const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
        int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
        int QP=0, nonBQP=0;
        /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
           if not then use a temporary buffer */
        if(y+15 >= height){
            int i;
            /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
               blockcopy to dst later */
            linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
                    FFMAX(height-y-copyAhead, 0), srcStride);

            /* duplicate last line of src to fill the void up to line (copyAhead+7) */
            for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
                    memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));

            /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
            linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);

            /* duplicate last line of dst to fill the void up to line (copyAhead) */
            for(i=height-y+1; i<=copyAhead; i++)
                    memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));

            // From here on this band is processed inside the scratch buffers.
            dstBlock= tempDst + dstStride;
            srcBlock= tempSrc;
        }

        // From this point on it is guaranteed that we can read and write 16 lines downward
        // finish 1 block before the next otherwise we might have a problem
        // with the L1 Cache of the P4 ... or only a few blocks at a time or something
        // Each iteration handles one group of blocks covering [startx, endx);
        // x is advanced by the copy pass and re-set by the filter passes.
        for(x=0; x<width; ){
            int startx = x;
            int endx = FFMIN(width, x+32);
            uint8_t *dstBlockStart = dstBlock;
            const uint8_t *srcBlockStart = srcBlock;
            int qp_index = 0;
            // Pass 1: gather QP / nonBQP for every block of the group.
            // NOTE(review): the count uses integer division, so a trailing
            // partial block (endx-startx not a multiple of BLOCK_SIZE) gets
            // no fresh entry here although the later passes still visit it;
            // confirm whether callers guarantee an aligned width.
            for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
                QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
                nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
            if(!isColor){
                // Apply the 16.16 correction factor with rounding and feed
                // one sample per block into the luma histogram.
                QP= (QP* QPCorrecture + 256*128)>>16;
                nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
                yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
            }
            c.QP_block[qp_index] = QP;
            c.nonBQP_block[qp_index] = nonBQP;
#if TEMPLATE_PP_MMX
            // Store QP replicated into all 8 bytes of pQPb_block[qp_index].
            __asm__ volatile(
                "movd %1, %%mm7         \n\t"
                "packuswb %%mm7, %%mm7  \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
                "packuswb %%mm7, %%mm7  \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
                "packuswb %%mm7, %%mm7  \n\t" // QP,..., QP
                "movq %%mm7, %0         \n\t"
                : "=m" (c.pQPb_block[qp_index])
                : "r" (QP)
            );
#endif
            }
          // Pass 2: copy the rows at offset copyAhead from src to dst and
          // apply the selected deinterlacing filter, block by block.
          for(; x < endx; x+=BLOCK_SIZE){
            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);

            RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
                              srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);

            if(mode & LINEAR_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
            else if(mode & LINEAR_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & MEDIAN_DEINT_FILTER)
                RENAME(deInterlaceMedian)(dstBlock, dstStride);
            else if(mode & CUBIC_IPOL_DEINT_FILTER)
                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
            else if(mode & FFMPEG_DEINT_FILTER)
                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
            else if(mode & LOWPASS5_DEINT_FILTER)
                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
*/
            dstBlock+=8;
            srcBlock+=8;
          }

          // Rewind to the start of the group for the next pass.
          dstBlock = dstBlockStart;
          srcBlock = srcBlockStart;

          // Pass 3: vertical deblocking.
          for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
            const int stride= dstStride;
            //temporary while changing QP stuff to make things continue to work
            //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
            c.QP = c.QP_block[qp_index];
            c.nonBQP = c.nonBQP_block[qp_index];
            c.pQPb = c.pQPb_block[qp_index];
            c.pQPb2 = c.pQPb2_block[qp_index];

            /* only deblock if we have 2 blocks */
            if(y + 8 < height){
                if(mode & V_X1_FILTER)
                    RENAME(vertX1Filter)(dstBlock, stride, &c);
                else if(mode & V_DEBLOCK){
                    // vertClassify picks the filter: 1 -> low pass,
                    // 2 -> default filter, anything else -> leave untouched.
                    const int t= RENAME(vertClassify)(dstBlock, stride, &c);

                    if(t==1)
                        RENAME(doVertLowPass)(dstBlock, stride, &c);
                    else if(t==2)
                        RENAME(doVertDefFilter)(dstBlock, stride, &c);
                }else if(mode & V_A_DEBLOCK){
                    RENAME(do_a_deblock)(dstBlock, stride, 1, &c, mode);
                }
            }

            dstBlock+=8;
            srcBlock+=8;
          }

          // Rewind again for the horizontal pass.
          dstBlock = dstBlockStart;
          srcBlock = srcBlockStart;

          // Pass 4: horizontal deblocking, dering and temporal noise
          // reduction. These act on the edge to / the area of the previous
          // block, hence the x - 8 >= 0 check and the dstBlock-4 / dstBlock-8
          // pointers.
          for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
            const int stride= dstStride;
            av_unused uint8_t *tmpXchg;
            c.QP = c.QP_block[qp_index];
            c.nonBQP = c.nonBQP_block[qp_index];
            c.pQPb = c.pQPb_block[qp_index];
            c.pQPb2 = c.pQPb2_block[qp_index];
#if TEMPLATE_PP_MMX
            // MMX builds transpose the block into the scratch blocks so that
            // the vertical filters can be reused for the horizontal edge.
            RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
#endif
            /* check if we have a previous block to deblock it with dstBlock */
            if(x - 8 >= 0){
#if TEMPLATE_PP_MMX
                // The transposed scratch block is filtered with a stride of 16.
                if(mode & H_X1_FILTER)
                        RENAME(vertX1Filter)(tempBlock1, 16, &c);
                else if(mode & H_DEBLOCK){
                    const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
                    if(t==1)
                        RENAME(doVertLowPass)(tempBlock1, 16, &c);
                    else if(t==2)
                        RENAME(doVertDefFilter)(tempBlock1, 16, &c);
                }else if(mode & H_A_DEBLOCK){
                        RENAME(do_a_deblock)(tempBlock1, 16, 1, &c, mode);
                }

                // Write the filtered data back, transposed, around the edge.
                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);

#else
                if(mode & H_X1_FILTER)
                    horizX1Filter(dstBlock-4, stride, c.QP);
                else if(mode & H_DEBLOCK){
#if TEMPLATE_PP_ALTIVEC
                    // AltiVec: transpose into a local buffer, run the vertical
                    // AltiVec filters on it and transpose back only when a
                    // filter was actually applied.
                    DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
                    int t;
                    transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);

                    t = vertClassify_altivec(tempBlock-48, 16, &c);
                    if(t==1) {
                        doVertLowPass_altivec(tempBlock-48, 16, &c);
                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
                    }
                    else if(t==2) {
                        doVertDefFilter_altivec(tempBlock-48, 16, &c);
                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
                    }
#else
                    const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);

                    if(t==1)
                        RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
                    else if(t==2)
                        RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
#endif
                }else if(mode & H_A_DEBLOCK){
                    // Same routine as the vertical case, called with the
                    // stride and step arguments exchanged.
                    RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c, mode);
                }
#endif //TEMPLATE_PP_MMX
                if(mode & DERING){
                //FIXME filter first line
                    if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
                }

                if(mode & TEMP_NOISE_FILTER)
                {
                    RENAME(tempNoiseReducer)(dstBlock-8, stride,
                            c.tempBlurred[isColor] + y*dstStride + x,
                            c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
                            c.ppMode.maxTmpNoise);
                }
            }

            dstBlock+=8;
            srcBlock+=8;

#if TEMPLATE_PP_MMX
            // Swap the two scratch blocks for the next iteration.
            tmpXchg= tempBlock1;
            tempBlock1= tempBlock2;
            tempBlock2 = tmpXchg;
#endif
          }
        }

        // The in-loop dering / noise calls always address the block to the
        // left of the current one, so the last block of the band is handled
        // here, after the loop.
        if(mode & DERING){
            if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
        }

        if((mode & TEMP_NOISE_FILTER)){
            RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
                    c.tempBlurred[isColor] + y*dstStride + x,
                    c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
                    c.ppMode.maxTmpNoise);
        }

        /* did we use a tmp buffer for the last lines*/
        if(y+15 >= height){
            // Copy the remaining height-y rows from the scratch buffer back
            // into the real destination.
            uint8_t *dstBlock= &(dst[y*dstStride]);
            if(width==FFABS(dstStride))
                linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
            else{
                int i;
                for(i=0; i<height-y; i++){
                    memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
                }
            }
        }
    }
    // Leave MMX state so that floating point code can run afterwards.
#if   TEMPLATE_PP_3DNOW
    __asm__ volatile("femms");
#elif TEMPLATE_PP_MMX
    __asm__ volatile("emms");
#endif

#ifdef DEBUG_BRIGHTNESS
    // Debug aid: draw the luma histogram and the detected white/black
    // levels into the output picture by adding 128 to the affected samples.
    if(!isColor){
        int max=1;
        int i;
        for(i=0; i<256; i++)
            if(yHistogram[i] > max) max=yHistogram[i];

        for(i=1; i<256; i++){
            int x;
            int start=yHistogram[i-1]/(max/256+1);
            int end=yHistogram[i]/(max/256+1);
            int inc= end > start ? 1 : -1;
            for(x=start; x!=end+inc; x+=inc)
                dst[ i*dstStride + x]+=128;
        }

        for(i=0; i<100; i+=2){
            dst[ (white)*dstStride + i]+=128;
            dst[ (black)*dstStride + i]+=128;
        }
    }
#endif

    *c2= c; //copy local context back

}

#undef RENAME
#undef TEMPLATE_PP_C
#undef TEMPLATE_PP_ALTIVEC
#undef TEMPLATE_PP_MMX
#undef TEMPLATE_PP_MMXEXT
#undef TEMPLATE_PP_3DNOW
#undef TEMPLATE_PP_SSE2