    /*
     * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
     *
     * This file is part of FFmpeg.
     *
     * FFmpeg is free software; you can redistribute it and/or modify
     * it under the terms of the GNU General Public License as published by
     * the Free Software Foundation; either version 2 of the License, or
     * (at your option) any later version.
     *
     * FFmpeg is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
     * along with FFmpeg; if not, write to the Free Software
     * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     */

    /**
     * @file
     * mmx/mmx2/3dnow postprocess code.
     */
    
    
    #include "libavutil/x86/asm.h"
    
    /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
     * included. The following macros will define its dependencies to 1 as well
     * (like MMX2 depending on MMX), and will define to 0 all the others. Every
     * TEMPLATE_PP_* needs to be undefined again at the end.
     *
     * RENAME(a) pastes the suffix of the selected variant onto a, so that each
     * inclusion of the template produces distinctly named functions. */
    
    /* Plain C variant. */
    #ifdef TEMPLATE_PP_C
    #   define RENAME(a) a ## _C
    #else
    #   define TEMPLATE_PP_C 0
    #endif
    
    /* AltiVec variant. */
    #ifdef TEMPLATE_PP_ALTIVEC
    #   define RENAME(a) a ## _altivec
    #else
    #   define TEMPLATE_PP_ALTIVEC 0
    #endif
    
    /* Baseline MMX variant. */
    #ifdef TEMPLATE_PP_MMX
    #   define RENAME(a) a ## _MMX
    #else
    #   define TEMPLATE_PP_MMX 0
    #endif
    
    /* MMXEXT variant: forces TEMPLATE_PP_MMX to 1 as a dependency. */
    #ifdef TEMPLATE_PP_MMXEXT
    #   undef  TEMPLATE_PP_MMX
    #   define TEMPLATE_PP_MMX 1
    #   define RENAME(a) a ## _MMX2
    #else
    #   define TEMPLATE_PP_MMXEXT 0
    #endif
    
    /* 3DNow variant: forces TEMPLATE_PP_MMX to 1 as a dependency. */
    #ifdef TEMPLATE_PP_3DNOW
    #   undef  TEMPLATE_PP_MMX
    #   define TEMPLATE_PP_MMX 1
    #   define RENAME(a) a ## _3DNow
    #else
    #   define TEMPLATE_PP_3DNOW 0
    #endif
    
    
    /* SSE2 variant: forces both TEMPLATE_PP_MMX and TEMPLATE_PP_MMXEXT to 1. */
    #ifdef TEMPLATE_PP_SSE2
    #   undef  TEMPLATE_PP_MMX
    #   define TEMPLATE_PP_MMX 1
    #   undef  TEMPLATE_PP_MMXEXT
    #   define TEMPLATE_PP_MMXEXT 1
    #   define RENAME(a) a ## _SSE2
    #else
    #   define TEMPLATE_PP_SSE2 0
    #endif
    
    
    /* Drop any definitions left over from a previous inclusion of this template
     * with a different TEMPLATE_PP_* selection. */
    #undef REAL_PAVGB
    #undef PAVGB
    #undef PMINUB
    #undef PMAXUB

    /* REAL_PAVGB(a,b) emits the byte-average instruction of the selected
     * instruction set as an asm string fragment; it is left undefined when
     * neither MMXEXT nor 3DNow is selected. */
    #if   TEMPLATE_PP_MMXEXT
    #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
    #elif TEMPLATE_PP_3DNOW
    #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
    #endif
    /* PAVGB goes through REAL_PAVGB so that macro arguments (e.g. register
     * name macros such as FF_REGa) are expanded before being stringified. */
    #define PAVGB(a,b)  REAL_PAVGB(a,b)
    
    /* PMINUB(x, y, t): y = unsigned per-byte minimum of x and y.
     * t is a scratch register that is only used by the plain-MMX fallback. */
    #if   TEMPLATE_PP_MMXEXT
    #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
    #elif TEMPLATE_PP_MMX
    /* Fallback without pminub: t = saturate(a - b); a -= t.
     * The parameter names are swapped (b,a,t) so that the destination is still
     * the second macro argument, as in the MMXEXT form. */
    #define PMINUB(b,a,t) \
        "movq " #a ", " #t " \n\t"\
        "psubusb " #b ", " #t " \n\t"\
        "psubb " #t ", " #a " \n\t"
    #endif
    
    
    /* PMAXUB(a, b): b = unsigned per-byte maximum of a and b. */
    #if   TEMPLATE_PP_MMXEXT
    #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
    #elif TEMPLATE_PP_MMX
    /* Fallback without pmaxub: b = saturate(b - a); b += a. */
    #define PMAXUB(a,b) \
        "psubusb " #a ", " #b " \n\t"\
        "paddb " #a ", " #b " \n\t"
    #endif
    
    
    //FIXME? |255-0| = 1 (should not be a problem ...)
    #if TEMPLATE_PP_MMX
    /**
     * Check if the middle 8x8 Block in the given 8x16 block is flat
     *
     * For each of the 7 vertically adjacent row pairs of the 8x8 block the
     * per-byte difference is biased by c->mmxDcOffset and compared against
     * c->mmxDcThreshold; the comparison results are accumulated into numEq.
     * In parallel the per-column minimum and maximum are tracked and
     * (max - min) is reduced by 2*QP with unsigned saturation into dcOk.
     *
     * @param src    pointer to the top of the 8x16 block
     * @param stride distance in bytes between two rows
     * @param c      context providing mmxDcOffset, mmxDcThreshold, nonBQP,
     *               pQPb and ppMode.flatnessThreshold
     * @return 2 if numEq does not exceed the flatness threshold; otherwise
     *         0 if the saturated (max - min - 2*QP) value is nonzero and
     *         1 if it is zero
     */
    static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
        int numEq= 0, dcOk;
        src+= stride*4; // src points to begin of the 8x8 Block
        __asm__ volatile(
            "movq %0, %%mm7                         \n\t"
            "movq %1, %%mm6                         \n\t"
            : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
            );

        __asm__ volatile(
            "lea (%2, %3), %%"FF_REG_a"             \n\t"
    //      0       1       2       3       4       5       6       7
    //      %2      eax     eax+%3  eax+2%3 %2+4%3  eax'    eax'+%3 eax'+2%3
    //      (eax' = eax after the second lea, i.e. eax + 4*%3)

            "movq (%2), %%mm0                       \n\t" // row 0, also seeds min/max
            "movq (%%"FF_REG_a"), %%mm1             \n\t"
            "movq %%mm0, %%mm3                      \n\t" // mm3 = running minimum
            "movq %%mm0, %%mm4                      \n\t" // mm4 = running maximum
            PMAXUB(%%mm1, %%mm4)
            PMINUB(%%mm1, %%mm3, %%mm5)
            "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
            "paddb %%mm7, %%mm0                     \n\t"
            "pcmpgtb %%mm6, %%mm0                   \n\t"

            "movq (%%"FF_REG_a",%3), %%mm2          \n\t"
            PMAXUB(%%mm2, %%mm4)
            PMINUB(%%mm2, %%mm3, %%mm5)
            "psubb %%mm2, %%mm1                     \n\t"
            "paddb %%mm7, %%mm1                     \n\t"
            "pcmpgtb %%mm6, %%mm1                   \n\t"
            "paddb %%mm1, %%mm0                     \n\t"

            "movq (%%"FF_REG_a", %3, 2), %%mm1      \n\t"
            PMAXUB(%%mm1, %%mm4)
            PMINUB(%%mm1, %%mm3, %%mm5)
            "psubb %%mm1, %%mm2                     \n\t"
            "paddb %%mm7, %%mm2                     \n\t"
            "pcmpgtb %%mm6, %%mm2                   \n\t"
            "paddb %%mm2, %%mm0                     \n\t"

            "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"

            "movq (%2, %3, 4), %%mm2                \n\t"
            PMAXUB(%%mm2, %%mm4)
            PMINUB(%%mm2, %%mm3, %%mm5)
            "psubb %%mm2, %%mm1                     \n\t"
            "paddb %%mm7, %%mm1                     \n\t"
            "pcmpgtb %%mm6, %%mm1                   \n\t"
            "paddb %%mm1, %%mm0                     \n\t"

            "movq (%%"FF_REG_a"), %%mm1             \n\t"
            PMAXUB(%%mm1, %%mm4)
            PMINUB(%%mm1, %%mm3, %%mm5)
            "psubb %%mm1, %%mm2                     \n\t"
            "paddb %%mm7, %%mm2                     \n\t"
            "pcmpgtb %%mm6, %%mm2                   \n\t"
            "paddb %%mm2, %%mm0                     \n\t"

            "movq (%%"FF_REG_a", %3), %%mm2         \n\t"
            PMAXUB(%%mm2, %%mm4)
            PMINUB(%%mm2, %%mm3, %%mm5)
            "psubb %%mm2, %%mm1                     \n\t"
            "paddb %%mm7, %%mm1                     \n\t"
            "pcmpgtb %%mm6, %%mm1                   \n\t"
            "paddb %%mm1, %%mm0                     \n\t"

            "movq (%%"FF_REG_a", %3, 2), %%mm1      \n\t"
            PMAXUB(%%mm1, %%mm4)
            PMINUB(%%mm1, %%mm3, %%mm5)
            "psubb %%mm1, %%mm2                     \n\t"
            "paddb %%mm7, %%mm2                     \n\t"
            "pcmpgtb %%mm6, %%mm2                   \n\t"
            "paddb %%mm2, %%mm0                     \n\t"
            "psubusb %%mm3, %%mm4                   \n\t" // mm4 = max - min per column

            "                                       \n\t"
            // horizontal sum of the 8 per-column counters in mm0
    #if TEMPLATE_PP_MMXEXT
            "pxor %%mm7, %%mm7                      \n\t"
            "psadbw %%mm7, %%mm0                    \n\t"
    #else
            "movq %%mm0, %%mm1                      \n\t"
            "psrlw $8, %%mm0                        \n\t"
            "paddb %%mm1, %%mm0                     \n\t"
            "movq %%mm0, %%mm1                      \n\t"
            "psrlq $16, %%mm0                       \n\t"
            "paddb %%mm1, %%mm0                     \n\t"
            "movq %%mm0, %%mm1                      \n\t"
            "psrlq $32, %%mm0                       \n\t"
            "paddb %%mm1, %%mm0                     \n\t"
    #endif
            "movq %4, %%mm7                         \n\t" // QP,..., QP
            "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
            "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
            "packssdw %%mm4, %%mm4                  \n\t"
            "movd %%mm0, %0                         \n\t"
            "movd %%mm4, %1                         \n\t"

            : "=r" (numEq), "=r" (dcOk)
            : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
            : "%"FF_REG_a // written by both lea instructions
            );

        // each pcmpgtb hit contributed -1, so negate and keep the low byte
        numEq= (-numEq) &0xFF;
        if(numEq > c->ppMode.flatnessThreshold){
            if(dcOk) return 0;
            else     return 1;
        }else{
            return 2;
        }
    }
    #endif //TEMPLATE_PP_MMX
    
     * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
    
    #if !TEMPLATE_PP_ALTIVEC
    
    static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
    
    #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    
        __asm__ volatile(        //"movv %0 %1 %2\n\t"
    
            "movq %2, %%mm0                         \n\t"  // QP,..., QP
            "pxor %%mm4, %%mm4                      \n\t"
    
            "movq (%0), %%mm6                       \n\t"
            "movq (%0, %1), %%mm5                   \n\t"
            "movq %%mm5, %%mm1                      \n\t"
            "movq %%mm6, %%mm2                      \n\t"
            "psubusb %%mm6, %%mm5                   \n\t"
            "psubusb %%mm1, %%mm2                   \n\t"
            "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
            "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
            "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
    
            "pand %%mm2, %%mm6                      \n\t"
            "pandn %%mm1, %%mm2                     \n\t"
            "por %%mm2, %%mm6                       \n\t"// First Line to Filter
    
            "movq (%0, %1, 8), %%mm5                \n\t"
    
            "lea (%0, %1, 4), %%"FF_REG_a"          \n\t"
            "lea (%0, %1, 8), %%"FF_REG_c"          \n\t"
            "sub %1, %%"FF_REG_c"                   \n\t"
    
            "add %1, %0                             \n\t" // %0 points to line 1 not 0
            "movq (%0, %1, 8), %%mm7                \n\t"
            "movq %%mm5, %%mm1                      \n\t"
            "movq %%mm7, %%mm2                      \n\t"
            "psubusb %%mm7, %%mm5                   \n\t"
            "psubusb %%mm1, %%mm2                   \n\t"
            "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
            "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
            "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
    
            "pand %%mm2, %%mm7                      \n\t"
            "pandn %%mm1, %%mm2                     \n\t"
            "por %%mm2, %%mm7                       \n\t" // First Line to Filter
    
    
            //      1       2       3       4       5       6       7       8
            //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
            // 6 4 2 2 1 1
            // 6 4 4 2
            // 6 8 2
    
            "movq (%0, %1), %%mm0                   \n\t" //  1
            "movq %%mm0, %%mm1                      \n\t" //  1
            PAVGB(%%mm6, %%mm0)                           //1 1        /2
            PAVGB(%%mm6, %%mm0)                           //3 1        /4
    
            "movq (%0, %1, 4), %%mm2                \n\t" //     1
            "movq %%mm2, %%mm5                      \n\t" //     1
    
            PAVGB((%%FF_REGa), %%mm2)                     //    11        /2
    
            PAVGB((%0, %1, 2), %%mm2)                     //   211        /4
            "movq %%mm2, %%mm3                      \n\t" //   211        /4
            "movq (%0), %%mm4                       \n\t" // 1
            PAVGB(%%mm4, %%mm3)                           // 4 211        /8
            PAVGB(%%mm0, %%mm3)                           //642211        /16
            "movq %%mm3, (%0)                       \n\t" // X
            // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
            "movq %%mm1, %%mm0                      \n\t" //  1
            PAVGB(%%mm6, %%mm0)                           //1 1        /2
            "movq %%mm4, %%mm3                      \n\t" // 1
            PAVGB((%0,%1,2), %%mm3)                       // 1 1        /2
    
            PAVGB((%%FF_REGa,%1,2), %%mm5)                //     11        /2
            PAVGB((%%FF_REGa), %%mm5)                     //    211 /4
    
            PAVGB(%%mm5, %%mm3)                           // 2 2211 /8
            PAVGB(%%mm0, %%mm3)                           //4242211 /16
            "movq %%mm3, (%0,%1)                    \n\t" //  X
            // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
            PAVGB(%%mm4, %%mm6)                                   //11        /2
    
            "movq (%%"FF_REG_c"), %%mm0             \n\t" //       1
            PAVGB((%%FF_REGa, %1, 2), %%mm0)              //      11/2
    
            "movq %%mm0, %%mm3                      \n\t" //      11/2
            PAVGB(%%mm1, %%mm0)                           //  2   11/4
            PAVGB(%%mm6, %%mm0)                           //222   11/8
            PAVGB(%%mm2, %%mm0)                           //22242211/16
            "movq (%0, %1, 2), %%mm2                \n\t" //   1
            "movq %%mm0, (%0, %1, 2)                \n\t" //   X
            // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
    
            "movq (%%"FF_REG_a", %1, 4), %%mm0      \n\t" //        1
            PAVGB((%%FF_REGc), %%mm0)                     //       11        /2
    
            PAVGB(%%mm0, %%mm6)                           //11     11        /4
            PAVGB(%%mm1, %%mm4)                           // 11                /2
            PAVGB(%%mm2, %%mm1)                           //  11                /2
            PAVGB(%%mm1, %%mm6)                           //1122   11        /8
            PAVGB(%%mm5, %%mm6)                           //112242211        /16
    
            "movq (%%"FF_REG_a"), %%mm5             \n\t" //    1
            "movq %%mm6, (%%"FF_REG_a")             \n\t" //    X
    
            // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
    
            "movq (%%"FF_REG_a", %1, 4), %%mm6      \n\t" //        1
    
            PAVGB(%%mm7, %%mm6)                           //        11        /2
            PAVGB(%%mm4, %%mm6)                           // 11     11        /4
            PAVGB(%%mm3, %%mm6)                           // 11   2211        /8
            PAVGB(%%mm5, %%mm2)                           //   11                /2
            "movq (%0, %1, 4), %%mm4                \n\t" //     1
            PAVGB(%%mm4, %%mm2)                           //   112                /4
            PAVGB(%%mm2, %%mm6)                           // 112242211        /16
            "movq %%mm6, (%0, %1, 4)                \n\t" //     X
            // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
            PAVGB(%%mm7, %%mm1)                           //  11     2        /4
            PAVGB(%%mm4, %%mm5)                           //    11                /2
            PAVGB(%%mm5, %%mm0)                           //    11 11        /4
    
            "movq (%%"FF_REG_a", %1, 2), %%mm6      \n\t" //      1
    
            PAVGB(%%mm6, %%mm1)                           //  11  4  2        /8
            PAVGB(%%mm0, %%mm1)                           //  11224222        /16
    
            "movq %%mm1, (%%"FF_REG_a", %1, 2)      \n\t" //      X
    
            // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
    
            PAVGB((%%FF_REGc), %%mm2)                     //   112 4        /8
            "movq (%%"FF_REG_a", %1, 4), %%mm0      \n\t" //        1
    
            PAVGB(%%mm0, %%mm6)                           //      1 1        /2
            PAVGB(%%mm7, %%mm6)                           //      1 12        /4
            PAVGB(%%mm2, %%mm6)                           //   1122424        /4
    
            "movq %%mm6, (%%"FF_REG_c")             \n\t" //       X
    
            // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
            PAVGB(%%mm7, %%mm5)                           //    11   2        /4
            PAVGB(%%mm7, %%mm5)                           //    11   6        /8
    
            PAVGB(%%mm3, %%mm0)                           //      112        /4
            PAVGB(%%mm0, %%mm5)                           //    112246        /16
    
            "movq %%mm5, (%%"FF_REG_a", %1, 4)      \n\t" //        X
    
            : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
    
            : "%"FF_REG_a, "%"FF_REG_c
    
    #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    
        const int l1= stride;
        const int l2= stride + l1;
        const int l3= stride + l2;
        const int l4= stride + l3;
        const int l5= stride + l4;
        const int l6= stride + l5;
        const int l7= stride + l6;
        const int l8= stride + l7;
        const int l9= stride + l8;
        int x;
        src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++){
            const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
            const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
    
            int sums[10];
            sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
            sums[1] = sums[0] - first  + src[l4];
            sums[2] = sums[1] - first  + src[l5];
            sums[3] = sums[2] - first  + src[l6];
            sums[4] = sums[3] - first  + src[l7];
            sums[5] = sums[4] - src[l1] + src[l8];
            sums[6] = sums[5] - src[l2] + last;
            sums[7] = sums[6] - src[l3] + last;
            sums[8] = sums[7] - src[l4] + last;
            sums[9] = sums[8] - src[l5] + last;
    
            src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
            src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
            src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
            src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
            src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
            src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
            src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
            src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
    
            src++;
        }
    
    #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    
    #endif //TEMPLATE_PP_ALTIVEC
    
    /**
     * Experimental Filter 1
    
     * will not damage linear gradients
    
    Diego Biurrun's avatar
    Diego Biurrun committed
     * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
    
    Diego Biurrun's avatar
    Diego Biurrun committed
     * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
     * MMX2 version does correct clipping C version does not
    
    static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
    
    #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    
        __asm__ volatile(
    
            "lea (%0, %1), %%"FF_REG_a"             \n\t"
            "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
    
            "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t" // line 3
    
            "movq (%0, %1, 4), %%mm1                \n\t" // line 4
            "movq %%mm1, %%mm2                      \n\t" // line 4
            "psubusb %%mm0, %%mm1                   \n\t"
            "psubusb %%mm2, %%mm0                   \n\t"
            "por %%mm1, %%mm0                       \n\t" // |l2 - l3|
    
            "movq (%%"FF_REG_c"), %%mm3             \n\t" // line 5
            "movq (%%"FF_REG_c", %1), %%mm4         \n\t" // line 6
    
            "movq %%mm3, %%mm5                      \n\t" // line 5
            "psubusb %%mm4, %%mm3                   \n\t"
            "psubusb %%mm5, %%mm4                   \n\t"
            "por %%mm4, %%mm3                       \n\t" // |l5 - l6|
            PAVGB(%%mm3, %%mm0)                           // (|l2 - l3| + |l5 - l6|)/2
            "movq %%mm2, %%mm1                      \n\t" // line 4
            "psubusb %%mm5, %%mm2                   \n\t"
            "movq %%mm2, %%mm4                      \n\t"
            "pcmpeqb %%mm7, %%mm2                   \n\t" // (l4 - l5) <= 0 ? -1 : 0
            "psubusb %%mm1, %%mm5                   \n\t"
            "por %%mm5, %%mm4                       \n\t" // |l4 - l5|
            "psubusb %%mm0, %%mm4                   \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
            "movq %%mm4, %%mm3                      \n\t" // d
            "movq %2, %%mm0                         \n\t"
            "paddusb %%mm0, %%mm0                   \n\t"
            "psubusb %%mm0, %%mm4                   \n\t"
            "pcmpeqb %%mm7, %%mm4                   \n\t" // d <= QP ? -1 : 0
            "psubusb "MANGLE(b01)", %%mm3           \n\t"
            "pand %%mm4, %%mm3                      \n\t" // d <= QP ? d : 0
    
            PAVGB(%%mm7, %%mm3)                           // d/2
            "movq %%mm3, %%mm1                      \n\t" // d/2
            PAVGB(%%mm7, %%mm3)                           // d/4
            PAVGB(%%mm1, %%mm3)                           // 3*d/8
    
            "movq (%0, %1, 4), %%mm0                \n\t" // line 4
            "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
            "psubusb %%mm3, %%mm0                   \n\t"
            "pxor %%mm2, %%mm0                      \n\t"
            "movq %%mm0, (%0, %1, 4)                \n\t" // line 4
    
    
            "movq (%%"FF_REG_c"), %%mm0             \n\t" // line 5
    
            "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
            "paddusb %%mm3, %%mm0                   \n\t"
            "pxor %%mm2, %%mm0                      \n\t"
    
            "movq %%mm0, (%%"FF_REG_c")             \n\t" // line 5
    
            "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t" // line 3
    
            "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
            "psubusb %%mm1, %%mm0                   \n\t"
            "pxor %%mm2, %%mm0                      \n\t"
    
            "movq %%mm0, (%%"FF_REG_a", %1, 2)      \n\t" // line 3
    
            "movq (%%"FF_REG_c", %1), %%mm0         \n\t" // line 6
    
            "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
            "paddusb %%mm1, %%mm0                   \n\t"
            "pxor %%mm2, %%mm0                      \n\t"
    
            "movq %%mm0, (%%"FF_REG_c", %1)         \n\t" // line 6
    
            "movq (%%"FF_REG_a", %1), %%mm0         \n\t" // line 2
    
            "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
            "psubusb %%mm1, %%mm0                   \n\t"
            "pxor %%mm2, %%mm0                      \n\t"
    
            "movq %%mm0, (%%"FF_REG_a", %1)         \n\t" // line 2
    
            "movq (%%"FF_REG_c", %1, 2), %%mm0      \n\t" // line 7
    
            "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
            "paddusb %%mm1, %%mm0                   \n\t"
            "pxor %%mm2, %%mm0                      \n\t"
    
            "movq %%mm0, (%%"FF_REG_c", %1, 2)      \n\t" // line 7
    
            : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
    
            : "%"FF_REG_a, "%"FF_REG_c
    
    #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    
        const int l1= stride;
        const int l2= stride + l1;
        const int l3= stride + l2;
        const int l4= stride + l3;
        const int l5= stride + l4;
        const int l6= stride + l5;
        const int l7= stride + l6;
    //    const int l8= stride + l7;
    //    const int l9= stride + l8;
        int x;
    
        src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++){
            int a= src[l3] - src[l4];
            int b= src[l4] - src[l5];
            int c= src[l5] - src[l6];
    
            int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
            d= FFMAX(d, 0);
    
            if(d < co->QP*2){
                int v = d * FFSIGN(-b);
    
                src[l2] +=v>>3;
                src[l3] +=v>>2;
                src[l4] +=(3*v)>>3;
                src[l5] -=(3*v)>>3;
                src[l6] -=v>>2;
                src[l7] -=v>>3;
    
    #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    
    #if !TEMPLATE_PP_ALTIVEC
    
    static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
    
    #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
    
        uint8_t tmp[16];
        const int l1= stride;
        const int l2= stride + l1;
        const int l3= stride + l2;
        const int l4= (int)tmp - (int)src - stride*3;
        const int l5= (int)tmp - (int)src - stride*3 + 8;
        const int l6= stride*3 + l3;
        const int l7= stride + l6;
        const int l8= stride + l7;
    
        memcpy(tmp, src+stride*7, 8);
        memcpy(tmp+8, src+stride*8, 8);
    
        __asm__ volatile(
    
    Diego Biurrun's avatar
    Diego Biurrun committed
    #if 0 //slightly more accurate and slightly slower
    
            "lea (%0, %1), %%"FF_REG_a"             \n\t"
            "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
    
    //      0       1       2       3       4       5       6       7
    //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
    
    
    
            "movq (%0, %1, 2), %%mm0                \n\t" // l2
            "movq (%0), %%mm1                       \n\t" // l0
            "movq %%mm0, %%mm2                      \n\t" // l2
            PAVGB(%%mm7, %%mm0)                           // ~l2/2
            PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
            PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8
    
    
            "movq (%%"FF_REG_a"), %%mm1             \n\t" // l1
            "movq (%%"FF_REG_a", %1, 2), %%mm3      \n\t" // l3
    
            "movq %%mm1, %%mm4                      \n\t" // l1
            PAVGB(%%mm7, %%mm1)                           // ~l1/2
            PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
            PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8
    
            "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
            "psubusb %%mm1, %%mm0                   \n\t"
            "psubusb %%mm4, %%mm1                   \n\t"
            "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
    
    // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
    
    
            "movq (%0, %1, 4), %%mm0                \n\t" // l4
            "movq %%mm0, %%mm4                      \n\t" // l4
            PAVGB(%%mm7, %%mm0)                           // ~l4/2
            PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
            PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8
    
    
            "movq (%%"FF_REG_c"), %%mm2             \n\t" // l5
    
            "movq %%mm3, %%mm5                      \n\t" // l3
            PAVGB(%%mm7, %%mm3)                           // ~l3/2
            PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
            PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8
    
            "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
            "psubusb %%mm3, %%mm0                   \n\t"
            "psubusb %%mm6, %%mm3                   \n\t"
            "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
            "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
    
    // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
    
    
            "movq (%%"FF_REG_c", %1), %%mm6         \n\t" // l6
    
            "movq %%mm6, %%mm5                      \n\t" // l6
            PAVGB(%%mm7, %%mm6)                           // ~l6/2
            PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
            PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8
    
    
            "movq (%%"FF_REG_c", %1, 2), %%mm5      \n\t" // l7
    
            "movq %%mm2, %%mm4                      \n\t" // l5
            PAVGB(%%mm7, %%mm2)                           // ~l5/2
            PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
            PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8
    
            "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
            "psubusb %%mm2, %%mm6                   \n\t"
            "psubusb %%mm4, %%mm2                   \n\t"
            "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
    
    // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
    
    
    
            PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
            "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
            "paddusb "MANGLE(b01)", %%mm4           \n\t"
            "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
            "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
            "pand %%mm4, %%mm3                      \n\t"
    
            "movq %%mm3, %%mm1                      \n\t"
    //        "psubusb "MANGLE(b01)", %%mm3           \n\t"
            PAVGB(%%mm7, %%mm3)
            PAVGB(%%mm7, %%mm3)
            "paddusb %%mm1, %%mm3                   \n\t"
    //        "paddusb "MANGLE(b01)", %%mm3           \n\t"
    
    
            "movq (%%"FF_REG_a", %1, 2), %%mm6      \n\t" //l3
    
            "movq (%0, %1, 4), %%mm5                \n\t" //l4
            "movq (%0, %1, 4), %%mm4                \n\t" //l4
            "psubusb %%mm6, %%mm5                   \n\t"
            "psubusb %%mm4, %%mm6                   \n\t"
            "por %%mm6, %%mm5                       \n\t" // |l3-l4|
            "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
            "pxor %%mm6, %%mm0                      \n\t"
            "pand %%mm0, %%mm3                      \n\t"
            PMINUB(%%mm5, %%mm3, %%mm0)
    
            "psubusb "MANGLE(b01)", %%mm3           \n\t"
            PAVGB(%%mm7, %%mm3)
    
    
            "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t"
    
            "movq (%0, %1, 4), %%mm2                \n\t"
            "pxor %%mm6, %%mm0                      \n\t"
            "pxor %%mm6, %%mm2                      \n\t"
            "psubb %%mm3, %%mm0                     \n\t"
            "paddb %%mm3, %%mm2                     \n\t"
            "pxor %%mm6, %%mm0                      \n\t"
            "pxor %%mm6, %%mm2                      \n\t"
    
            "movq %%mm0, (%%"FF_REG_a", %1, 2)      \n\t"
    
            "lea (%0, %1), %%"FF_REG_a"             \n\t"
    
            "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
    
    //      0       1       2       3       4       5       6       7
    //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
    
            "movq (%%"FF_REG_a", %1, 2), %%mm1      \n\t" // l3
    
            "movq (%0, %1, 4), %%mm0                \n\t" // l4
            "pxor %%mm6, %%mm1                      \n\t" // -l3-1
            PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
    
    // mm1=-l3-1, mm0=128-q
    
    
            "movq (%%"FF_REG_a", %1, 4), %%mm2      \n\t" // l5
            "movq (%%"FF_REG_a", %1), %%mm3         \n\t" // l2
    
            "pxor %%mm6, %%mm2                      \n\t" // -l5-1
            "movq %%mm2, %%mm5                      \n\t" // -l5-1
            "movq "MANGLE(b80)", %%mm4              \n\t" // 128
    
            "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
    
            PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
            PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
            PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
            PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
    
    // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
    
    
            "movq (%%"FF_REG_a"), %%mm2             \n\t" // l1
    
            "pxor %%mm6, %%mm2                      \n\t" // -l1-1
            PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
            PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
            "movq "MANGLE(b80)", %%mm3              \n\t" // 128
            PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
            PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
            PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
    
    // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
    
    
            PAVGB((%%FF_REGc, %1), %%mm5)                 // (l6-l5+256)/2
            "movq (%%"FF_REG_c", %1, 2), %%mm1      \n\t" // l7
    
            "pxor %%mm6, %%mm1                      \n\t" // -l7-1
            PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
            "movq "MANGLE(b80)", %%mm2              \n\t" // 128
            PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
            PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
            PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
    
    // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
    
    
            "movq "MANGLE(b00)", %%mm1              \n\t" // 0
            "movq "MANGLE(b00)", %%mm5              \n\t" // 0
            "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
            "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
            PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
            PMAXUB(%%mm5, %%mm3)                          // 128 + |lenergy/16|
            PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16
    
    
    // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
    
    
            "movq "MANGLE(b00)", %%mm7              \n\t" // 0
            "movq %2, %%mm2                         \n\t" // QP
            PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
            "psubb %%mm6, %%mm2                     \n\t"
    
            "movq %%mm4, %%mm1                      \n\t"
            "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
            "pxor %%mm1, %%mm4                      \n\t"
            "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
            "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
            "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
    
    // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
    
    
            "movq %%mm4, %%mm3                      \n\t" // d
            "psubusb "MANGLE(b01)", %%mm4           \n\t"
            PAVGB(%%mm7, %%mm4)                           // d/32
            PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
            "paddb %%mm3, %%mm4                     \n\t" // 5d/64
            "pand %%mm2, %%mm4                      \n\t"
    
            "movq "MANGLE(b80)", %%mm5              \n\t" // 128
            "psubb %%mm0, %%mm5                     \n\t" // q
            "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
            "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
            "pxor %%mm7, %%mm5                      \n\t"
    
            PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
            "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)
    
            "pand %%mm7, %%mm4                      \n\t"
    
            "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t"
    
            "movq (%0, %1, 4), %%mm2                \n\t"
            "pxor %%mm1, %%mm0                      \n\t"
            "pxor %%mm1, %%mm2                      \n\t"
            "paddb %%mm4, %%mm0                     \n\t"
            "psubb %%mm4, %%mm2                     \n\t"
            "pxor %%mm1, %%mm0                      \n\t"
            "pxor %%mm1, %%mm2                      \n\t"
    
            "movq %%mm0, (%%"FF_REG_a", %1, 2)      \n\t"
    
            : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
    
            : "%"FF_REG_a, "%"FF_REG_c
    
        {
        int x;
        src-= stride;
        for(x=0; x<BLOCK_SIZE; x++){
            const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
            if(FFABS(middleEnergy)< 8*QP){
                const int q=(src[l4] - src[l5])/2;
                const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
    
                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
                d= FFMAX(d, 0);
    
                d= (5*d + 32) >> 6;
                d*= FFSIGN(-middleEnergy);
    
                if(q>0){
                    d= d<0 ? 0 : d;
                    d= d>q ? q : d;
                }else{
                    d= d>0 ? 0 : d;
                    d= d<q ? q : d;
                }
    
                src[l4]-= d;
                src[l5]+= d;
    
            src++;
        }
        src-=8;
        for(x=0; x<8; x++){
            int y;
            for(y=4; y<6; y++){
                int d= src[x+y*stride] - tmp[x+(y-4)*8];
                int ad= FFABS(d);
                static int max=0;
                static int sum=0;
                static int num=0;
                static int bias=0;
    
                if(max<ad) max=ad;
                sum+= ad>3 ? 1 : 0;
                if(ad>3){
                    src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
                }
                if(y==4) bias+=d;
                num++;
                if(num%1000000 == 0){
                    av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
                }
    
    #elif TEMPLATE_PP_MMX
    
        DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
    
        __asm__ volatile(
    
    //      0       1       2       3       4       5       6       7
    //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 edx+%1  edx+2%1
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1
    
    
            "movq (%0), %%mm0                       \n\t"
            "movq %%mm0, %%mm1                      \n\t"
            "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
            "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
    
            "movq (%0, %1), %%mm2                   \n\t"
    
            "lea (%0, %1, 2), %%"FF_REG_a"          \n\t"
    
            "movq %%mm2, %%mm3                      \n\t"
            "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
            "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
    
    
            "movq (%%"FF_REG_a"), %%mm4             \n\t"
    
            "movq %%mm4, %%mm5                      \n\t"
            "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
            "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
    
            "paddw %%mm0, %%mm0                     \n\t" // 2L0
            "paddw %%mm1, %%mm1                     \n\t" // 2H0
            "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
            "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
    
            "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
            "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
    
    
            "movq (%%"FF_REG_a", %1), %%mm2         \n\t"
    
            "movq %%mm2, %%mm3                      \n\t"
            "punpcklbw %%mm7, %%mm2                 \n\t" // L3
            "punpckhbw %%mm7, %%mm3                 \n\t" // H3
    
            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
            "movq %%mm0, (%3)                       \n\t" // 2L0 - 5L1 + 5L2 - 2L3
            "movq %%mm1, 8(%3)                      \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
            "movq (%%"FF_REG_a", %1, 2), %%mm0      \n\t"
    
            "movq %%mm0, %%mm1                      \n\t"
            "punpcklbw %%mm7, %%mm0                 \n\t" // L4
            "punpckhbw %%mm7, %%mm1                 \n\t" // H4
    
            "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
            "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
    
            "movq %%mm2, 16(%3)                     \n\t" // L3 - L4
            "movq %%mm3, 24(%3)                     \n\t" // H3 - H4
    
            "paddw %%mm4, %%mm4                     \n\t" // 2L2
            "paddw %%mm5, %%mm5                     \n\t" // 2H2
            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
    
    
            "lea (%%"FF_REG_a", %1), %0             \n\t"
    
            "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
            "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
    
            "movq (%0, %1, 2), %%mm2                \n\t"
            "movq %%mm2, %%mm3                      \n\t"
            "punpcklbw %%mm7, %%mm2                 \n\t" // L5
            "punpckhbw %%mm7, %%mm3                 \n\t" // H5
            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
    
    
            "movq (%%"FF_REG_a", %1, 4), %%mm6      \n\t"
    
            "punpcklbw %%mm7, %%mm6                 \n\t" // L6
            "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
    
            "movq (%%"FF_REG_a", %1, 4), %%mm6      \n\t"
    
            "punpckhbw %%mm7, %%mm6                 \n\t" // H6
            "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
    
            "paddw %%mm0, %%mm0                     \n\t" // 2L4
            "paddw %%mm1, %%mm1                     \n\t" // 2H4
            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
    
            "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
            "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
    
            "movq (%0, %1, 4), %%mm2                \n\t"
            "movq %%mm2, %%mm3                      \n\t"
            "punpcklbw %%mm7, %%mm2                 \n\t" // L7
            "punpckhbw %%mm7, %%mm3                 \n\t" // H7
    
            "paddw %%mm2, %%mm2                     \n\t" // 2L7
            "paddw %%mm3, %%mm3                     \n\t" // 2H7
            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
    
    
            "movq (%3), %%mm2                       \n\t" // 2L0 - 5L1 + 5L2 - 2L3
            "movq 8(%3), %%mm3                      \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
    
    
    #if TEMPLATE_PP_MMXEXT
    
            "movq %%mm7, %%mm6                      \n\t" // 0
            "psubw %%mm0, %%mm6                     \n\t"
            "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "psubw %%mm1, %%mm6                     \n\t"
            "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "psubw %%mm2, %%mm6                     \n\t"
            "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "psubw %%mm3, %%mm6                     \n\t"
            "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
    
    #else
    
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm0, %%mm6                   \n\t"
            "pxor %%mm6, %%mm0                      \n\t"
            "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm1, %%mm6                   \n\t"
            "pxor %%mm6, %%mm1                      \n\t"
            "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm2, %%mm6                   \n\t"
            "pxor %%mm6, %%mm2                      \n\t"
            "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm3, %%mm6                   \n\t"
            "pxor %%mm6, %%mm3                      \n\t"
            "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
    
    #endif
    
    #if TEMPLATE_PP_MMXEXT
    
            "pminsw %%mm2, %%mm0                    \n\t"
            "pminsw %%mm3, %%mm1                    \n\t"
    
            "movq %%mm0, %%mm6                      \n\t"
            "psubusw %%mm2, %%mm6                   \n\t"
            "psubw %%mm6, %%mm0                     \n\t"
            "movq %%mm1, %%mm6                      \n\t"
            "psubusw %%mm3, %%mm6                   \n\t"
            "psubw %%mm6, %%mm1                     \n\t"
    
            "movd %2, %%mm2                         \n\t" // QP
            "punpcklbw %%mm7, %%mm2                 \n\t"
    
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
            "pxor %%mm6, %%mm4                      \n\t"
            "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
            "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
            "pxor %%mm7, %%mm5                      \n\t"
            "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
    
            "psllw $3, %%mm2                        \n\t" // 8QP
            "movq %%mm2, %%mm3                      \n\t" // 8QP
            "pcmpgtw %%mm4, %%mm2                   \n\t"
            "pcmpgtw %%mm5, %%mm3                   \n\t"
            "pand %%mm2, %%mm4                      \n\t"
            "pand %%mm3, %%mm5                      \n\t"
    
    
            "psubusw %%mm0, %%mm4                   \n\t" // hd
            "psubusw %%mm1, %%mm5                   \n\t" // ld
    
    
            "movq "MANGLE(w05)", %%mm2              \n\t" // 5
            "pmullw %%mm2, %%mm4                    \n\t"
            "pmullw %%mm2, %%mm5                    \n\t"
            "movq "MANGLE(w20)", %%mm2              \n\t" // 32
            "paddw %%mm2, %%mm4                     \n\t"
            "paddw %%mm2, %%mm5                     \n\t"
            "psrlw $6, %%mm4                        \n\t"
            "psrlw $6, %%mm5                        \n\t"