    /*
     * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
     *
     * This file is part of FFmpeg.
     *
     * FFmpeg is free software; you can redistribute it and/or modify
     * it under the terms of the GNU General Public License as published by
     * the Free Software Foundation; either version 2 of the License, or
     * (at your option) any later version.
     *
     * FFmpeg is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
     * along with FFmpeg; if not, write to the Free Software
     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     */

    /**
     * @file postprocess_template.c
     * mmx/mmx2/3dnow postprocess code.
     */
    
    
    
    /*
     * Names of the general-purpose registers used by the inline assembly in
     * this file, chosen according to ARCH_X86_64.
     *
     *   REGa,  REGc,  REGd   bare tokens, for use inside macro arguments
     *                        (e.g. PAVGB((%%REGa), %%mm2))
     *   REG_a, REG_c, REG_d  the same names as string literals, for splicing
     *                        into asm templates and clobber lists
     *   REG_SP               stack pointer name (string literal)
     *   ALIGN_MASK           immediate operand whose low three bits are clear
     *                        (presumably ANDed with the stack pointer to align
     *                        it; the use is outside this part of the file)
     */
    #if defined(ARCH_X86_64)
    /* 64-bit register names */
    #  define REG_a      "rax"
    #  define REG_c      "rcx"
    #  define REG_d      "rdx"
    #  define REG_SP     "rsp"
    #  define REGa       rax
    #  define REGc       rcx
    #  define REGd       rdx
    #  define ALIGN_MASK "$0xFFFFFFFFFFFFFFF8"
    #else
    /* 32-bit register names */
    #  define REG_a      "eax"
    #  define REG_c      "ecx"
    #  define REG_d      "edx"
    #  define REG_SP     "esp"
    #  define REGa       eax
    #  define REGc       ecx
    #  define REGd       edx
    #  define ALIGN_MASK "$0xFFFFFFF8"
    #endif
    
    
    
    #undef REAL_PAVGB
    #undef PAVGB
    #undef PMINUB
    #undef PMAXUB

    /*
     * PAVGB(a, b): emit the asm text for a packed unsigned byte average of
     * a and b, written to b ("pavgb" with MMX2, "pavgusb" with 3DNow!).
     *
     * REAL_PAVGB stringifies its arguments, and the # operator does not
     * macro-expand them.  PAVGB therefore forwards to REAL_PAVGB so that
     * arguments such as (%%REGa) are expanded to the real register name
     * before they are turned into a string.
     */
    #ifdef HAVE_MMX2
    #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
    #elif defined (HAVE_3DNOW)
    #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
    #endif
    #define PAVGB(a,b)  REAL_PAVGB(a,b)

    /*
     * PMINUB(a, b, t): emit asm text computing b = min(a, b) on packed
     * unsigned bytes.
     *
     * MMX2 has a native instruction and ignores t.  Plain MMX uses t as a
     * scratch register: t = b - a with unsigned saturation (0 where b <= a),
     * then b -= t, which leaves min(a, b).
     */
    #ifdef HAVE_MMX2
    #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
    #elif defined (HAVE_MMX)
    #define PMINUB(a,b,t) \
            "movq " #b ", " #t " \n\t"\
            "psubusb " #a ", " #t " \n\t"\
            "psubb " #t ", " #b " \n\t"
    #endif
    
    /*
     * PMAXUB(a, b): emit asm text computing b = max(a, b) on packed
     * unsigned bytes.
     *
     * MMX2 has a native instruction.  Plain MMX computes b = b - a with
     * unsigned saturation (0 where b <= a) and then adds a back, which
     * leaves max(a, b).
     */
    #ifdef HAVE_MMX2
    #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
    #elif defined (HAVE_MMX)
    #define PMAXUB(a,b) \
            "psubusb " #a ", " #b " \n\t"\
            "paddb " #a ", " #b " \n\t"
    #endif
    
    
    //FIXME? byte differences wrap around, so |255-0| is treated as 1 (shouldn't be a problem ...)
    
    #ifdef HAVE_MMX
    /**
     * Check if the middle 8x8 Block in the given 8x16 block is flat
     *
     * For each of the 8 byte columns, the 7 differences between vertically
     * adjacent rows of the middle 8x8 block are computed with wrapping byte
     * arithmetic, biased by c->mmxDcOffset[c->nonBQP] and compared as signed
     * bytes against c->mmxDcThreshold[c->nonBQP]. numEq is the number of
     * comparisons that succeed. In parallel the per-column minimum and
     * maximum over the 8 rows are tracked.
     *
     * @param src    top of the 8x16 block (the function skips 4 rows itself)
     * @param stride offset between vertically adjacent rows of src
     * @param c      context; reads mmxDcOffset, mmxDcThreshold, nonBQP, pQPb
     *               and ppMode.flatnessThreshold
     * @return 2 if numEq <= c->ppMode.flatnessThreshold; otherwise 1 if in
     *         every column (max - min) <= 2 * (byte value in c->pQPb), else 0
     */
    static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
            int numEq= 0, dcOk;
            src+= stride*4; // src points to begin of the 8x8 Block

            // Load the bias (mm7) and threshold (mm6) used by the asm below.
            // NOTE(review): the next asm statement relies on mm6/mm7 surviving
            // between the two statements; neither declares MMX registers.
            asm volatile(
                    "movq %0, %%mm7                         \n\t"
                    "movq %1, %%mm6                         \n\t"
                    : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
                    );

            asm volatile(
                    "lea (%2, %3), %%"REG_a"                \n\t"
    // row: 0       1       2       3       4       5       6       7
    // REG_a = %2+%3 for rows 1-3, REG_a = %2+5*%3 for rows 5-7
    //      %2      REG_a   REG_a+%3 REG_a+2%3 %2+4%3 REG_a   REG_a+%3 REG_a+2%3

                    "movq (%2), %%mm0                       \n\t"
                    "movq (%%"REG_a"), %%mm1                \n\t"
                    "movq %%mm0, %%mm3                      \n\t" // mm3 = running minimum
                    "movq %%mm0, %%mm4                      \n\t" // mm4 = running maximum

                    PMAXUB(%%mm1, %%mm4)
                    PMINUB(%%mm1, %%mm3, %%mm5)

                    "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
                    "paddb %%mm7, %%mm0                     \n\t"
                    "pcmpgtb %%mm6, %%mm0                   \n\t" // -1 where biased diff > threshold

                    "movq (%%"REG_a",%3), %%mm2             \n\t"

                    PMAXUB(%%mm2, %%mm4)
                    PMINUB(%%mm2, %%mm3, %%mm5)

                    "psubb %%mm2, %%mm1                     \n\t"
                    "paddb %%mm7, %%mm1                     \n\t"
                    "pcmpgtb %%mm6, %%mm1                   \n\t"
                    "paddb %%mm1, %%mm0                     \n\t" // accumulate -count per column

                    "movq (%%"REG_a", %3, 2), %%mm1         \n\t"

                    PMAXUB(%%mm1, %%mm4)
                    PMINUB(%%mm1, %%mm3, %%mm5)

                    "psubb %%mm1, %%mm2                     \n\t"
                    "paddb %%mm7, %%mm2                     \n\t"
                    "pcmpgtb %%mm6, %%mm2                   \n\t"
                    "paddb %%mm2, %%mm0                     \n\t"

                    "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t" // REG_a now addresses row 5

                    "movq (%2, %3, 4), %%mm2                \n\t"

                    PMAXUB(%%mm2, %%mm4)
                    PMINUB(%%mm2, %%mm3, %%mm5)

                    "psubb %%mm2, %%mm1                     \n\t"
                    "paddb %%mm7, %%mm1                     \n\t"
                    "pcmpgtb %%mm6, %%mm1                   \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"

                    "movq (%%"REG_a"), %%mm1                \n\t"

                    PMAXUB(%%mm1, %%mm4)
                    PMINUB(%%mm1, %%mm3, %%mm5)

                    "psubb %%mm1, %%mm2                     \n\t"
                    "paddb %%mm7, %%mm2                     \n\t"
                    "pcmpgtb %%mm6, %%mm2                   \n\t"
                    "paddb %%mm2, %%mm0                     \n\t"

                    "movq (%%"REG_a", %3), %%mm2            \n\t"

                    PMAXUB(%%mm2, %%mm4)
                    PMINUB(%%mm2, %%mm3, %%mm5)

                    "psubb %%mm2, %%mm1                     \n\t"
                    "paddb %%mm7, %%mm1                     \n\t"
                    "pcmpgtb %%mm6, %%mm1                   \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"

                    "movq (%%"REG_a", %3, 2), %%mm1         \n\t"

                    PMAXUB(%%mm1, %%mm4)
                    PMINUB(%%mm1, %%mm3, %%mm5)

                    "psubb %%mm1, %%mm2                     \n\t"
                    "paddb %%mm7, %%mm2                     \n\t"
                    "pcmpgtb %%mm6, %%mm2                   \n\t"
                    "paddb %%mm2, %%mm0                     \n\t"
                    "psubusb %%mm3, %%mm4                   \n\t" // mm4 = max - min per column

                    // Horizontal sum of the 8 per-column counters in mm0;
                    // only the low byte of the result is used by the caller code.
    #ifdef HAVE_MMX2
                    "pxor %%mm7, %%mm7                      \n\t"
                    "psadbw %%mm7, %%mm0                    \n\t"
    #else
                    "movq %%mm0, %%mm1                      \n\t"
                    "psrlw $8, %%mm0                        \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"
                    "movq %%mm0, %%mm1                      \n\t"
                    "psrlq $16, %%mm0                       \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"
                    "movq %%mm0, %%mm1                      \n\t"
                    "psrlq $32, %%mm0                       \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"
    #endif
                    "movq %4, %%mm7                         \n\t" // QP,..., QP
                    "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
                    "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
                    "packssdw %%mm4, %%mm4                  \n\t"
                    "movd %%mm0, %0                         \n\t"
                    "movd %%mm4, %1                         \n\t"

                    : "=r" (numEq), "=r" (dcOk)
                    : "r" (src), "r" ((long)stride), "m" (c->pQPb)
                    : "%"REG_a
                    );

            // the counters were accumulated as negative bytes; recover the count
            numEq= (-numEq) &0xFF;
            if(numEq > c->ppMode.flatnessThreshold){
                // dcOk is non-zero when some column's (max - min) exceeds 2*QP
                if(dcOk) return 0;
                else     return 1;
            }else{
                return 2;
            }
    }
    #endif //HAVE_MMX
    
    /**
     * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
     * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
     */
    
    /*
     * Vertical low pass over an 8x16 block: rows 1..8 (counted after the
     * function has skipped 3 rows of src) are rewritten, rows 0 and 9 are only
     * read.
     *
     * Row 0 / row 9 are used as the outer filter inputs only when they are
     * close to row 1 / row 8 (asm: difference <= QP byte from c->pQPb,
     * C: difference < c->QP); otherwise row 1 / row 8 is used in their place.
     *
     * src    top of the 8x16 block
     * stride offset between vertically adjacent rows of src
     * c      context; the asm path reads c->pQPb, the C path reads c->QP
     */
    static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            src+= stride*3;
            // NOTE(review): %0 is an input-only operand but is advanced by one
            // row ("add %1, %0") and restored at the end ("sub %1, %0").
            asm volatile(
                    "movq %2, %%mm0                         \n\t"  // QP,..., QP
                    "pxor %%mm4, %%mm4                      \n\t"

                    "movq (%0), %%mm6                       \n\t"
                    "movq (%0, %1), %%mm5                   \n\t"
                    "movq %%mm5, %%mm1                      \n\t"
                    "movq %%mm6, %%mm2                      \n\t"
                    "psubusb %%mm6, %%mm5                   \n\t"
                    "psubusb %%mm1, %%mm2                   \n\t"
                    "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
                    "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
                    "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF

                    "pand %%mm2, %%mm6                      \n\t"
                    "pandn %%mm1, %%mm2                     \n\t"
                    "por %%mm2, %%mm6                       \n\t"// First Line to Filter

                    "movq (%0, %1, 8), %%mm5                \n\t"
                    "lea (%0, %1, 4), %%"REG_a"             \n\t"
                    "lea (%0, %1, 8), %%"REG_c"             \n\t"
                    "sub %1, %%"REG_c"                      \n\t"
                    "add %1, %0                             \n\t" // %0 points to line 1 not 0
                    "movq (%0, %1, 8), %%mm7                \n\t"
                    "movq %%mm5, %%mm1                      \n\t"
                    "movq %%mm7, %%mm2                      \n\t"
                    "psubusb %%mm7, %%mm5                   \n\t"
                    "psubusb %%mm1, %%mm2                   \n\t"
                    "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
                    "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
                    "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF

                    "pand %%mm2, %%mm7                      \n\t"
                    "pandn %%mm1, %%mm2                     \n\t"
                    "por %%mm2, %%mm7                       \n\t" // Last Line to Filter


                    //      1       2       3       4       5       6       7       8
                    //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
                    // 6 4 2 2 1 1
                    // 6 4 4 2
                    // 6 8 2

                    "movq (%0, %1), %%mm0                   \n\t" //  1
                    "movq %%mm0, %%mm1                      \n\t" //  1
                    PAVGB(%%mm6, %%mm0)                           //1 1        /2
                    PAVGB(%%mm6, %%mm0)                           //3 1        /4

                    "movq (%0, %1, 4), %%mm2                \n\t" //     1
                    "movq %%mm2, %%mm5                      \n\t" //     1
                    PAVGB((%%REGa), %%mm2)                        //    11        /2
                    PAVGB((%0, %1, 2), %%mm2)                     //   211        /4
                    "movq %%mm2, %%mm3                      \n\t" //   211        /4
                    "movq (%0), %%mm4                       \n\t" // 1
                    PAVGB(%%mm4, %%mm3)                           // 4 211        /8
                    PAVGB(%%mm0, %%mm3)                           //642211        /16
                    "movq %%mm3, (%0)                       \n\t" // X
                    // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
                    "movq %%mm1, %%mm0                      \n\t" //  1
                    PAVGB(%%mm6, %%mm0)                           //1 1        /2
                    "movq %%mm4, %%mm3                      \n\t" // 1
                    PAVGB((%0,%1,2), %%mm3)                       // 1 1        /2
                    PAVGB((%%REGa,%1,2), %%mm5)                   //     11        /2
                    PAVGB((%%REGa), %%mm5)                        //    211 /4
                    PAVGB(%%mm5, %%mm3)                           // 2 2211 /8
                    PAVGB(%%mm0, %%mm3)                           //4242211 /16
                    "movq %%mm3, (%0,%1)                    \n\t" //  X
                    // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
                    PAVGB(%%mm4, %%mm6)                                   //11        /2
                    "movq (%%"REG_c"), %%mm0                \n\t" //       1
                    PAVGB((%%REGa, %1, 2), %%mm0)                 //      11/2
                    "movq %%mm0, %%mm3                      \n\t" //      11/2
                    PAVGB(%%mm1, %%mm0)                           //  2   11/4
                    PAVGB(%%mm6, %%mm0)                           //222   11/8
                    PAVGB(%%mm2, %%mm0)                           //22242211/16
                    "movq (%0, %1, 2), %%mm2                \n\t" //   1
                    "movq %%mm0, (%0, %1, 2)                \n\t" //   X
                    // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
                    "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
                    PAVGB((%%REGc), %%mm0)                        //       11        /2
                    PAVGB(%%mm0, %%mm6)                           //11     11        /4
                    PAVGB(%%mm1, %%mm4)                           // 11                /2
                    PAVGB(%%mm2, %%mm1)                           //  11                /2
                    PAVGB(%%mm1, %%mm6)                           //1122   11        /8
                    PAVGB(%%mm5, %%mm6)                           //112242211        /16
                    "movq (%%"REG_a"), %%mm5                \n\t" //    1
                    "movq %%mm6, (%%"REG_a")                \n\t" //    X
                    // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
                    "movq (%%"REG_a", %1, 4), %%mm6         \n\t" //        1
                    PAVGB(%%mm7, %%mm6)                           //        11        /2
                    PAVGB(%%mm4, %%mm6)                           // 11     11        /4
                    PAVGB(%%mm3, %%mm6)                           // 11   2211        /8
                    PAVGB(%%mm5, %%mm2)                           //   11                /2
                    "movq (%0, %1, 4), %%mm4                \n\t" //     1
                    PAVGB(%%mm4, %%mm2)                           //   112                /4
                    PAVGB(%%mm2, %%mm6)                           // 112242211        /16
                    "movq %%mm6, (%0, %1, 4)                \n\t" //     X
                    // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
                    PAVGB(%%mm7, %%mm1)                           //  11     2        /4
                    PAVGB(%%mm4, %%mm5)                           //    11                /2
                    PAVGB(%%mm5, %%mm0)                           //    11 11        /4
                    "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //      1
                    PAVGB(%%mm6, %%mm1)                           //  11  4  2        /8
                    PAVGB(%%mm0, %%mm1)                           //  11224222        /16
                    "movq %%mm1, (%%"REG_a", %1, 2)         \n\t" //      X
                    // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
                    PAVGB((%%REGc), %%mm2)                        //   112 4        /8
                    "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
                    PAVGB(%%mm0, %%mm6)                           //      1 1        /2
                    PAVGB(%%mm7, %%mm6)                           //      1 12        /4
                    PAVGB(%%mm2, %%mm6)                           //   1122424        /4
                    "movq %%mm6, (%%"REG_c")                \n\t" //       X
                    // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
                    PAVGB(%%mm7, %%mm5)                           //    11   2        /4
                    PAVGB(%%mm7, %%mm5)                           //    11   6        /8

                    PAVGB(%%mm3, %%mm0)                           //      112        /4
                    PAVGB(%%mm0, %%mm5)                           //    112246        /16
                    "movq %%mm5, (%%"REG_a", %1, 4)         \n\t" //        X
                    "sub %1, %0                             \n\t" // restore %0

                    :
                    : "r" (src), "r" ((long)stride), "m" (c->pQPb)
                    : "%"REG_a, "%"REG_c
            );
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            const int l1= stride;
            const int l2= stride + l1;
            const int l3= stride + l2;
            const int l4= stride + l3;
            const int l5= stride + l4;
            const int l6= stride + l5;
            const int l7= stride + l6;
            const int l8= stride + l7;
            const int l9= stride + l8;
            int x;
            src+= stride*3;
            for(x=0; x<BLOCK_SIZE; x++)
            {
                    // outer taps: use row 0 / row 9 only if close to row 1 / row 8
                    const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
                    const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];

                    // sliding 4-row sums (+4 for rounding of the final >>4);
                    // each sums[i] is derived from sums[i-1] by dropping one
                    // row and adding the next
                    int sums[10];
                    sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
                    sums[1] = sums[0] - first  + src[l4];
                    sums[2] = sums[1] - first  + src[l5];
                    sums[3] = sums[2] - first  + src[l6];
                    sums[4] = sums[3] - first  + src[l7];
                    sums[5] = sums[4] - src[l1] + src[l8];
                    sums[6] = sums[5] - src[l2] + last;
                    sums[7] = sums[6] - src[l3] + last;
                    sums[8] = sums[7] - src[l4] + last;
                    sums[9] = sums[8] - src[l5] + last;

                    src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
                    src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
                    src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
                    src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
                    src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
                    src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
                    src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
                    src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;

                    src++;
            }
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    }
    
    /**
     * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
     * values are correctly clipped (MMX2)
     * values are wraparound (C)
     * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
    
            0 8 16 24
            x = 8
            x/2 = 4
            x/8 = 1
            1 12 12 23
    
    static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
    
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    // FIXME rounding
    
            asm volatile(
                    "pxor %%mm7, %%mm7                      \n\t" // 0
                    "movq "MANGLE(b80)", %%mm6              \n\t" // MIN_SIGNED_BYTE
                    "leal (%0, %1), %%"REG_a"               \n\t"
                    "leal (%%"REG_a", %1, 4), %%"REG_c"     \n\t"
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
                    "movq "MANGLE(pQPb)", %%mm0             \n\t" // QP,..., QP
                    "movq %%mm0, %%mm1                      \n\t" // QP,..., QP
                    "paddusb "MANGLE(b02)", %%mm0           \n\t"
                    "psrlw $2, %%mm0                        \n\t"
                    "pand "MANGLE(b3F)", %%mm0              \n\t" // QP/4,..., QP/4
                    "paddusb %%mm1, %%mm0                   \n\t" // QP*1.25 ...
                    "movq (%0, %1, 4), %%mm2                \n\t" // line 4
                    "movq (%%"REG_c"), %%mm3                \n\t" // line 5
                    "movq %%mm2, %%mm4                      \n\t" // line 4
                    "pcmpeqb %%mm5, %%mm5                   \n\t" // -1
                    "pxor %%mm2, %%mm5                      \n\t" // -line 4 - 1
                    PAVGB(%%mm3, %%mm5)
                    "paddb %%mm6, %%mm5                     \n\t" // (l5-l4)/2
                    "psubusb %%mm3, %%mm4                   \n\t"
                    "psubusb %%mm2, %%mm3                   \n\t"
                    "por %%mm3, %%mm4                       \n\t" // |l4 - l5|
                    "psubusb %%mm0, %%mm4                   \n\t"
                    "pcmpeqb %%mm7, %%mm4                   \n\t"
                    "pand %%mm4, %%mm5                      \n\t" // d/2
    
    //                "paddb %%mm6, %%mm2                     \n\t" // line 4 + 0x80
                    "paddb %%mm5, %%mm2                     \n\t"
    //                "psubb %%mm6, %%mm2                     \n\t"
                    "movq %%mm2, (%0,%1, 4)                 \n\t"
    
                    "movq (%%"REG_c"), %%mm2                \n\t"
    //                "paddb %%mm6, %%mm2                     \n\t" // line 5 + 0x80
                    "psubb %%mm5, %%mm2                     \n\t"
    //                "psubb %%mm6, %%mm2                     \n\t"
                    "movq %%mm2, (%%"REG_c")                \n\t"
    
                    "paddb %%mm6, %%mm5                     \n\t"
                    "psrlw $2, %%mm5                        \n\t"
                    "pand "MANGLE(b3F)", %%mm5              \n\t"
                    "psubb "MANGLE(b20)", %%mm5             \n\t" // (l5-l4)/8
    
                    "movq (%%"REG_a", %1, 2), %%mm2         \n\t"
                    "paddb %%mm6, %%mm2                     \n\t" // line 3 + 0x80
                    "paddsb %%mm5, %%mm2                    \n\t"
                    "psubb %%mm6, %%mm2                     \n\t"
                    "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
    
                    "movq (%%"REG_c", %1), %%mm2            \n\t"
                    "paddb %%mm6, %%mm2                     \n\t" // line 6 + 0x80
                    "psubsb %%mm5, %%mm2                    \n\t"
                    "psubb %%mm6, %%mm2                     \n\t"
                    "movq %%mm2, (%%"REG_c", %1)            \n\t"
    
                    :
                    : "r" (src), "r" ((long)stride)
                    : "%"REG_a, "%"REG_c
            );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
             const int l1= stride;
            const int l2= stride + l1;
            const int l3= stride + l2;
            const int l4= stride + l3;
            const int l5= stride + l4;
            const int l6= stride + l5;
    //        const int l7= stride + l6;
    //        const int l8= stride + l7;
    //        const int l9= stride + l8;
            int x;
            const int QP15= QP + (QP>>2);
            src+= stride*3;
            for(x=0; x<BLOCK_SIZE; x++)
            {
                    const int v = (src[x+l5] - src[x+l4]);
    
                    if(FFABS(v) < QP15)
    
                    {
                            src[x+l3] +=v>>3;
                            src[x+l4] +=v>>1;
                            src[x+l5] -=v>>1;
                            src[x+l6] -=v>>3;
    
                    }
            }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    
    /**
     * Experimental Filter 1
    
     * will not damage linear gradients
     * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
    
     * can only smooth blocks at the expected locations (it cant smooth them if they did move)
     * MMX2 version does correct clipping C version doesnt
    
    static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
    
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            src+= stride*3;
    
            asm volatile(
                    "pxor %%mm7, %%mm7                      \n\t" // 0
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
                    "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
                    "movq (%0, %1, 4), %%mm1                \n\t" // line 4
                    "movq %%mm1, %%mm2                      \n\t" // line 4
                    "psubusb %%mm0, %%mm1                   \n\t"
                    "psubusb %%mm2, %%mm0                   \n\t"
                    "por %%mm1, %%mm0                       \n\t" // |l2 - l3|
                    "movq (%%"REG_c"), %%mm3                \n\t" // line 5
                    "movq (%%"REG_c", %1), %%mm4            \n\t" // line 6
                    "movq %%mm3, %%mm5                      \n\t" // line 5
                    "psubusb %%mm4, %%mm3                   \n\t"
                    "psubusb %%mm5, %%mm4                   \n\t"
                    "por %%mm4, %%mm3                       \n\t" // |l5 - l6|
                    PAVGB(%%mm3, %%mm0)                           // (|l2 - l3| + |l5 - l6|)/2
                    "movq %%mm2, %%mm1                      \n\t" // line 4
                    "psubusb %%mm5, %%mm2                   \n\t"
                    "movq %%mm2, %%mm4                      \n\t"
                    "pcmpeqb %%mm7, %%mm2                   \n\t" // (l4 - l5) <= 0 ? -1 : 0
                    "psubusb %%mm1, %%mm5                   \n\t"
                    "por %%mm5, %%mm4                       \n\t" // |l4 - l5|
                    "psubusb %%mm0, %%mm4                   \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
                    "movq %%mm4, %%mm3                      \n\t" // d
                    "movq %2, %%mm0                         \n\t"
                    "paddusb %%mm0, %%mm0                   \n\t"
                    "psubusb %%mm0, %%mm4                   \n\t"
                    "pcmpeqb %%mm7, %%mm4                   \n\t" // d <= QP ? -1 : 0
                    "psubusb "MANGLE(b01)", %%mm3           \n\t"
                    "pand %%mm4, %%mm3                      \n\t" // d <= QP ? d : 0
    
                    PAVGB(%%mm7, %%mm3)                           // d/2
                    "movq %%mm3, %%mm1                      \n\t" // d/2
                    PAVGB(%%mm7, %%mm3)                           // d/4
                    PAVGB(%%mm1, %%mm3)                           // 3*d/8
    
                    "movq (%0, %1, 4), %%mm0                \n\t" // line 4
                    "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
                    "psubusb %%mm3, %%mm0                   \n\t"
                    "pxor %%mm2, %%mm0                      \n\t"
                    "movq %%mm0, (%0, %1, 4)                \n\t" // line 4
    
                    "movq (%%"REG_c"), %%mm0                \n\t" // line 5
                    "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
                    "paddusb %%mm3, %%mm0                   \n\t"
                    "pxor %%mm2, %%mm0                      \n\t"
                    "movq %%mm0, (%%"REG_c")                \n\t" // line 5
    
                    PAVGB(%%mm7, %%mm1)                           // d/4
    
                    "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
                    "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
                    "psubusb %%mm1, %%mm0                   \n\t"
                    "pxor %%mm2, %%mm0                      \n\t"
                    "movq %%mm0, (%%"REG_a", %1, 2)         \n\t" // line 3
    
                    "movq (%%"REG_c", %1), %%mm0            \n\t" // line 6
                    "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
                    "paddusb %%mm1, %%mm0                   \n\t"
                    "pxor %%mm2, %%mm0                      \n\t"
                    "movq %%mm0, (%%"REG_c", %1)            \n\t" // line 6
    
                    PAVGB(%%mm7, %%mm1)                           // d/8
    
                    "movq (%%"REG_a", %1), %%mm0            \n\t" // line 2
                    "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
                    "psubusb %%mm1, %%mm0                   \n\t"
                    "pxor %%mm2, %%mm0                      \n\t"
                    "movq %%mm0, (%%"REG_a", %1)            \n\t" // line 2
    
                    "movq (%%"REG_c", %1, 2), %%mm0         \n\t" // line 7
                    "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
                    "paddusb %%mm1, %%mm0                   \n\t"
                    "pxor %%mm2, %%mm0                      \n\t"
                    "movq %%mm0, (%%"REG_c", %1, 2)         \n\t" // line 7
    
                    :
                    : "r" (src), "r" ((long)stride), "m" (co->pQPb)
                    : "%"REG_a, "%"REG_c
            );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            const int l1= stride;
            const int l2= stride + l1;
            const int l3= stride + l2;
            const int l4= stride + l3;
            const int l5= stride + l4;
            const int l6= stride + l5;
            const int l7= stride + l6;
    //        const int l8= stride + l7;
    //        const int l9= stride + l8;
            int x;
    
            src+= stride*3;
            for(x=0; x<BLOCK_SIZE; x++)
            {
                    int a= src[l3] - src[l4];
                    int b= src[l4] - src[l5];
                    int c= src[l5] - src[l6];
    
    
                    int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
    
    
                            src[l2] +=v>>3;
                            src[l3] +=v>>2;
                            src[l4] +=(3*v)>>3;
                            src[l5] -=(3*v)>>3;
                            src[l6] -=v>>2;
                            src[l7] -=v>>3;
    
                    }
                    src++;
            }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
    
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    /*
    
            uint8_t tmp[16];
            const int l1= stride;
            const int l2= stride + l1;
            const int l3= stride + l2;
            const int l4= (int)tmp - (int)src - stride*3;
            const int l5= (int)tmp - (int)src - stride*3 + 8;
            const int l6= stride*3 + l3;
            const int l7= stride + l6;
            const int l8= stride + l7;
    
            memcpy(tmp, src+stride*7, 8);
            memcpy(tmp+8, src+stride*8, 8);
    
            src+= stride*4;
            asm volatile(
    
    
    #if 0 //slightly more accurate and slightly slower
    
                    "pxor %%mm7, %%mm7                      \n\t" // 0
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
    //      0       1       2       3       4       5       6       7
    //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
    
    
                    "movq (%0, %1, 2), %%mm0                \n\t" // l2
                    "movq (%0), %%mm1                       \n\t" // l0
                    "movq %%mm0, %%mm2                      \n\t" // l2
                    PAVGB(%%mm7, %%mm0)                           // ~l2/2
                    PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
                    PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8
    
                    "movq (%%"REG_a"), %%mm1                \n\t" // l1
                    "movq (%%"REG_a", %1, 2), %%mm3         \n\t" // l3
                    "movq %%mm1, %%mm4                      \n\t" // l1
                    PAVGB(%%mm7, %%mm1)                           // ~l1/2
                    PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
                    PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8
    
                    "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
                    "psubusb %%mm1, %%mm0                   \n\t"
                    "psubusb %%mm4, %%mm1                   \n\t"
                    "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
    
    // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
    
    
                    "movq (%0, %1, 4), %%mm0                \n\t" // l4
                    "movq %%mm0, %%mm4                      \n\t" // l4
                    PAVGB(%%mm7, %%mm0)                           // ~l4/2
                    PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
                    PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8
    
                    "movq (%%"REG_c"), %%mm2                \n\t" // l5
                    "movq %%mm3, %%mm5                      \n\t" // l3
                    PAVGB(%%mm7, %%mm3)                           // ~l3/2
                    PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
                    PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8
    
                    "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
                    "psubusb %%mm3, %%mm0                   \n\t"
                    "psubusb %%mm6, %%mm3                   \n\t"
                    "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
                    "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
    
    // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
    
    
                    "movq (%%"REG_c", %1), %%mm6            \n\t" // l6
                    "movq %%mm6, %%mm5                      \n\t" // l6
                    PAVGB(%%mm7, %%mm6)                           // ~l6/2
                    PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
                    PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8
    
                    "movq (%%"REG_c", %1, 2), %%mm5         \n\t" // l7
                    "movq %%mm2, %%mm4                      \n\t" // l5
                    PAVGB(%%mm7, %%mm2)                           // ~l5/2
                    PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
                    PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8
    
                    "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
                    "psubusb %%mm2, %%mm6                   \n\t"
                    "psubusb %%mm4, %%mm2                   \n\t"
                    "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
    
    // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
    
    
    
                    PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
                    "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
                    "paddusb "MANGLE(b01)", %%mm4           \n\t"
                    "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
                    "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
                    "pand %%mm4, %%mm3                      \n\t"
    
                    "movq %%mm3, %%mm1                      \n\t"
    //                "psubusb "MANGLE(b01)", %%mm3           \n\t"
                    PAVGB(%%mm7, %%mm3)
                    PAVGB(%%mm7, %%mm3)
                    "paddusb %%mm1, %%mm3                   \n\t"
    //                "paddusb "MANGLE(b01)", %%mm3           \n\t"
    
                    "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //l3
                    "movq (%0, %1, 4), %%mm5                \n\t" //l4
                    "movq (%0, %1, 4), %%mm4                \n\t" //l4
                    "psubusb %%mm6, %%mm5                   \n\t"
                    "psubusb %%mm4, %%mm6                   \n\t"
                    "por %%mm6, %%mm5                       \n\t" // |l3-l4|
                    "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
                    "pxor %%mm6, %%mm0                      \n\t"
                    "pand %%mm0, %%mm3                      \n\t"
                    PMINUB(%%mm5, %%mm3, %%mm0)
    
                    "psubusb "MANGLE(b01)", %%mm3           \n\t"
                    PAVGB(%%mm7, %%mm3)
    
                    "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
                    "movq (%0, %1, 4), %%mm2                \n\t"
                    "pxor %%mm6, %%mm0                      \n\t"
                    "pxor %%mm6, %%mm2                      \n\t"
                    "psubb %%mm3, %%mm0                     \n\t"
                    "paddb %%mm3, %%mm2                     \n\t"
                    "pxor %%mm6, %%mm0                      \n\t"
                    "pxor %%mm6, %%mm2                      \n\t"
                    "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
                    "movq %%mm2, (%0, %1, 4)                \n\t"
    
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
    //      0       1       2       3       4       5       6       7
    //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
    
                    "movq (%%"REG_a", %1, 2), %%mm1         \n\t" // l3
                    "movq (%0, %1, 4), %%mm0                \n\t" // l4
                    "pxor %%mm6, %%mm1                      \n\t" // -l3-1
                    PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
    
    // mm1=-l3-1, mm0=128-q
    
    
                    "movq (%%"REG_a", %1, 4), %%mm2         \n\t" // l5
                    "movq (%%"REG_a", %1), %%mm3            \n\t" // l2
                    "pxor %%mm6, %%mm2                      \n\t" // -l5-1
                    "movq %%mm2, %%mm5                      \n\t" // -l5-1
                    "movq "MANGLE(b80)", %%mm4              \n\t" // 128
                    "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
                    PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
                    PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
                    PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
                    PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
    
    // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
    
    
                    "movq (%%"REG_a"), %%mm2                \n\t" // l1
                    "pxor %%mm6, %%mm2                      \n\t" // -l1-1
                    PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
                    PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
                    "movq "MANGLE(b80)", %%mm3              \n\t" // 128
                    PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
                    PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
                    PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
    
    // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
    
    
                    PAVGB((%%REGc, %1), %%mm5)                    // (l6-l5+256)/2
                    "movq (%%"REG_c", %1, 2), %%mm1         \n\t" // l7
                    "pxor %%mm6, %%mm1                      \n\t" // -l7-1
                    PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
                    "movq "MANGLE(b80)", %%mm2              \n\t" // 128
                    PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
                    PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
                    PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
    
    // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
    
    
                    "movq "MANGLE(b00)", %%mm1              \n\t" // 0
                    "movq "MANGLE(b00)", %%mm5              \n\t" // 0
                    "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
                    "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
                    PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
                     PMAXUB(%%mm5, %%mm3)                         // 128 + |lenergy/16|
                    PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16
    
    
    // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
    
    
                    "movq "MANGLE(b00)", %%mm7              \n\t" // 0
                    "movq %2, %%mm2                         \n\t" // QP
                    PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
                    "psubb %%mm6, %%mm2                     \n\t"
    
                    "movq %%mm4, %%mm1                      \n\t"
                    "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
                    "pxor %%mm1, %%mm4                      \n\t"
                    "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
                    "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
                    "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
    
    // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
    
    
                    "movq %%mm4, %%mm3                      \n\t" // d
                    "psubusb "MANGLE(b01)", %%mm4           \n\t"
                    PAVGB(%%mm7, %%mm4)                           // d/32
                    PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
                    "paddb %%mm3, %%mm4                     \n\t" // 5d/64
                    "pand %%mm2, %%mm4                      \n\t"
    
                    "movq "MANGLE(b80)", %%mm5              \n\t" // 128
                    "psubb %%mm0, %%mm5                     \n\t" // q
                    "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
                    "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
                    "pxor %%mm7, %%mm5                      \n\t"
    
                    PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
                    "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)
    
                    "pand %%mm7, %%mm4                      \n\t"
                    "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
                    "movq (%0, %1, 4), %%mm2                \n\t"
                    "pxor %%mm1, %%mm0                      \n\t"
                    "pxor %%mm1, %%mm2                      \n\t"
                    "paddb %%mm4, %%mm0                     \n\t"
                    "psubb %%mm4, %%mm2                     \n\t"
                    "pxor %%mm1, %%mm0                      \n\t"
                    "pxor %%mm1, %%mm2                      \n\t"
                    "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
                    "movq %%mm2, (%0, %1, 4)                \n\t"
    
                    :
                    : "r" (src), "r" ((long)stride), "m" (c->pQPb)
                    : "%"REG_a, "%"REG_c
            );
    
            {
            int x;
            src-= stride;
            for(x=0; x<BLOCK_SIZE; x++)
            {
                    const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
    
                    if(FFABS(middleEnergy)< 8*QP)
    
                    {
                            const int q=(src[l4] - src[l5])/2;
                            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
    
    
                            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
    
    
                            if(q>0)
                            {
                                    d= d<0 ? 0 : d;
                                    d= d>q ? q : d;
                            }
                            else
                            {
                                    d= d>0 ? 0 : d;
                                    d= d<q ? q : d;
                            }
    
                            src[l4]-= d;
                            src[l5]+= d;
                    }
                    src++;
            }
    
            for(x=0; x<8; x++)
            {
                    int y;
                    for(y=4; y<6; y++)
                    {
                            int d= src[x+y*stride] - tmp[x+(y-4)*8];
    
                            int ad= FFABS(d);
    
                            static int max=0;
                            static int sum=0;
                            static int num=0;
                            static int bias=0;
    
                            if(max<ad) max=ad;
                            sum+= ad>3 ? 1 : 0;
                            if(ad>3)
                            {
                                    src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
                            }
                            if(y==4) bias+=d;
                            num++;
                            if(num%1000000 == 0)
                            {
    
                                    av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
    
    }
    */
    #elif defined (HAVE_MMX)
    
            src+= stride*4;
            asm volatile(
                    "pxor %%mm7, %%mm7                      \n\t"
                    "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
                    "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
    //      0       1       2       3       4       5       6       7
    //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 edx+%1  edx+2%1
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1
    
                    "movq (%0), %%mm0                       \n\t"
                    "movq %%mm0, %%mm1                      \n\t"
                    "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
                    "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
    
                    "movq (%0, %1), %%mm2                   \n\t"
                    "lea (%0, %1, 2), %%"REG_a"             \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
                    "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
    
                    "movq (%%"REG_a"), %%mm4                \n\t"
                    "movq %%mm4, %%mm5                      \n\t"
                    "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
                    "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
    
                    "paddw %%mm0, %%mm0                     \n\t" // 2L0
                    "paddw %%mm1, %%mm1                     \n\t" // 2H0
                    "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
                    "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
    
                    "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
                    "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
    
                    "movq (%%"REG_a", %1), %%mm2            \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t" // L3
                    "punpckhbw %%mm7, %%mm3                 \n\t" // H3
    
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
                    "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                    "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
                    "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
                    "movq %%mm0, %%mm1                      \n\t"
                    "punpcklbw %%mm7, %%mm0                 \n\t" // L4
                    "punpckhbw %%mm7, %%mm1                 \n\t" // H4
    
                    "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
                    "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
                    "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
                    "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
                    "paddw %%mm4, %%mm4                     \n\t" // 2L2
                    "paddw %%mm5, %%mm5                     \n\t" // 2H2
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
    
                    "lea (%%"REG_a", %1), %0                \n\t"
                    "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
                    "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
    
                    "movq (%0, %1, 2), %%mm2                \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t" // L5
                    "punpckhbw %%mm7, %%mm3                 \n\t" // H5
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
    
                    "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
                    "punpcklbw %%mm7, %%mm6                 \n\t" // L6
                    "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
                    "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
                    "punpckhbw %%mm7, %%mm6                 \n\t" // H6
                    "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
    
                    "paddw %%mm0, %%mm0                     \n\t" // 2L4
                    "paddw %%mm1, %%mm1                     \n\t" // 2H4
                    "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
                    "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
    
                    "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
                    "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
                    "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
                    "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6