Skip to content
Snippets Groups Projects
postprocess_template.c 186 KiB
Newer Older
  • Learn to ignore specific revisions
  •                 "movq %%mm1, %%mm4                      \n\t"\
                    "movq %%mm2, %%mm5                      \n\t"\
                    "psubusb %%mm1, %%mm3                   \n\t"\
                    "psubusb %%mm2, %%mm4                   \n\t"\
                    "psubusb %%mm0, %%mm5                   \n\t"\
                    "pcmpeqb %%mm7, %%mm3                   \n\t"\
                    "pcmpeqb %%mm7, %%mm4                   \n\t"\
                    "pcmpeqb %%mm7, %%mm5                   \n\t"\
                    "movq %%mm3, %%mm6                      \n\t"\
                    "pxor %%mm4, %%mm3                      \n\t"\
                    "pxor %%mm5, %%mm4                      \n\t"\
                    "pxor %%mm6, %%mm5                      \n\t"\
                    "por %%mm3, %%mm1                       \n\t"\
                    "por %%mm4, %%mm2                       \n\t"\
                    "por %%mm5, %%mm0                       \n\t"\
                    "pand %%mm2, %%mm0                      \n\t"\
                    "pand %%mm1, %%mm0                      \n\t"\
                    "movq %%mm0, " #b "                     \n\t"
    
    #define MEDIAN(a,b,c)  REAL_MEDIAN(a,b,c)
    
    MEDIAN((%0)        , (%%REGa)       , (%%REGa, %1))
    
    MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
    
    MEDIAN((%0, %1, 4) , (%%REGd)       , (%%REGd, %1))
    
    MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
    
                    : : "r" (src), "r" ((long)stride)
                    : "%"REG_a, "%"REG_d
            );
    
            int x, y;
            src+= 4*stride;
            // FIXME - there should be a way to do a few columns in parallel like w/mmx
            for(x=0; x<8; x++)
            {
                    uint8_t *colsrc = src;
                    for (y=0; y<4; y++)
                    {
                            int a, b, c, d, e, f;
                            a = colsrc[0       ];
                            b = colsrc[stride  ];
                            c = colsrc[stride*2];
                            d = (a-b)>>31;
                            e = (b-c)>>31;
                            f = (c-a)>>31;
                            colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
                            colsrc += stride*2;
                    }
                    src++;
            }
    
    /**
     * transposes and shift the given 8x8 Block into dst1 and dst2
     */
    
    static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
    
            asm(
                    "lea (%0, %1), %%"REG_a"                \n\t"
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
                    "movq (%0), %%mm0                       \n\t" // 12345678
                    "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
                    "movq %%mm0, %%mm2                      \n\t" // 12345678
                    "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
                    "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
                    "movq (%%"REG_a", %1), %%mm1            \n\t"
                    "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
                    "movq %%mm1, %%mm4                      \n\t"
                    "punpcklbw %%mm3, %%mm1                 \n\t"
                    "punpckhbw %%mm3, %%mm4                 \n\t"
    
                    "movq %%mm0, %%mm3                      \n\t"
                    "punpcklwd %%mm1, %%mm0                 \n\t"
                    "punpckhwd %%mm1, %%mm3                 \n\t"
                    "movq %%mm2, %%mm1                      \n\t"
                    "punpcklwd %%mm4, %%mm2                 \n\t"
                    "punpckhwd %%mm4, %%mm1                 \n\t"
    
                    "movd %%mm0, 128(%2)                    \n\t"
                    "psrlq $32, %%mm0                       \n\t"
                    "movd %%mm0, 144(%2)                    \n\t"
                    "movd %%mm3, 160(%2)                    \n\t"
                    "psrlq $32, %%mm3                       \n\t"
                    "movd %%mm3, 176(%2)                    \n\t"
                    "movd %%mm3, 48(%3)                     \n\t"
                    "movd %%mm2, 192(%2)                    \n\t"
                    "movd %%mm2, 64(%3)                     \n\t"
                    "psrlq $32, %%mm2                       \n\t"
                    "movd %%mm2, 80(%3)                     \n\t"
                    "movd %%mm1, 96(%3)                     \n\t"
                    "psrlq $32, %%mm1                       \n\t"
                    "movd %%mm1, 112(%3)                    \n\t"
    
                    "lea (%%"REG_a", %1, 4), %%"REG_a"      \n\t"
    
                    "movq (%0, %1, 4), %%mm0                \n\t" // 12345678
                    "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
                    "movq %%mm0, %%mm2                      \n\t" // 12345678
                    "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
                    "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
                    "movq (%%"REG_a", %1), %%mm1            \n\t"
                    "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
                    "movq %%mm1, %%mm4                      \n\t"
                    "punpcklbw %%mm3, %%mm1                 \n\t"
                    "punpckhbw %%mm3, %%mm4                 \n\t"
    
                    "movq %%mm0, %%mm3                      \n\t"
                    "punpcklwd %%mm1, %%mm0                 \n\t"
                    "punpckhwd %%mm1, %%mm3                 \n\t"
                    "movq %%mm2, %%mm1                      \n\t"
                    "punpcklwd %%mm4, %%mm2                 \n\t"
                    "punpckhwd %%mm4, %%mm1                 \n\t"
    
                    "movd %%mm0, 132(%2)                    \n\t"
                    "psrlq $32, %%mm0                       \n\t"
                    "movd %%mm0, 148(%2)                    \n\t"
                    "movd %%mm3, 164(%2)                    \n\t"
                    "psrlq $32, %%mm3                       \n\t"
                    "movd %%mm3, 180(%2)                    \n\t"
                    "movd %%mm3, 52(%3)                     \n\t"
                    "movd %%mm2, 196(%2)                    \n\t"
                    "movd %%mm2, 68(%3)                     \n\t"
                    "psrlq $32, %%mm2                       \n\t"
                    "movd %%mm2, 84(%3)                     \n\t"
                    "movd %%mm1, 100(%3)                    \n\t"
                    "psrlq $32, %%mm1                       \n\t"
                    "movd %%mm1, 116(%3)                    \n\t"
    
    
            :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
            : "%"REG_a
            );
    
    static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
    
            asm(
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a",%1,4), %%"REG_d"        \n\t"
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
                    "movq (%2), %%mm0                       \n\t" // 12345678
                    "movq 16(%2), %%mm1                     \n\t" // abcdefgh
                    "movq %%mm0, %%mm2                      \n\t" // 12345678
                    "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
                    "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
                    "movq 32(%2), %%mm1                     \n\t"
                    "movq 48(%2), %%mm3                     \n\t"
                    "movq %%mm1, %%mm4                      \n\t"
                    "punpcklbw %%mm3, %%mm1                 \n\t"
                    "punpckhbw %%mm3, %%mm4                 \n\t"
    
                    "movq %%mm0, %%mm3                      \n\t"
                    "punpcklwd %%mm1, %%mm0                 \n\t"
                    "punpckhwd %%mm1, %%mm3                 \n\t"
                    "movq %%mm2, %%mm1                      \n\t"
                    "punpcklwd %%mm4, %%mm2                 \n\t"
                    "punpckhwd %%mm4, %%mm1                 \n\t"
    
                    "movd %%mm0, (%0)                       \n\t"
                    "psrlq $32, %%mm0                       \n\t"
                    "movd %%mm0, (%%"REG_a")                \n\t"
                    "movd %%mm3, (%%"REG_a", %1)            \n\t"
                    "psrlq $32, %%mm3                       \n\t"
                    "movd %%mm3, (%%"REG_a", %1, 2)         \n\t"
                    "movd %%mm2, (%0, %1, 4)                \n\t"
                    "psrlq $32, %%mm2                       \n\t"
                    "movd %%mm2, (%%"REG_d")                \n\t"
                    "movd %%mm1, (%%"REG_d", %1)            \n\t"
                    "psrlq $32, %%mm1                       \n\t"
                    "movd %%mm1, (%%"REG_d", %1, 2)         \n\t"
    
    
                    "movq 64(%2), %%mm0                     \n\t" // 12345678
                    "movq 80(%2), %%mm1                     \n\t" // abcdefgh
                    "movq %%mm0, %%mm2                      \n\t" // 12345678
                    "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
                    "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
                    "movq 96(%2), %%mm1                     \n\t"
                    "movq 112(%2), %%mm3                    \n\t"
                    "movq %%mm1, %%mm4                      \n\t"
                    "punpcklbw %%mm3, %%mm1                 \n\t"
                    "punpckhbw %%mm3, %%mm4                 \n\t"
    
                    "movq %%mm0, %%mm3                      \n\t"
                    "punpcklwd %%mm1, %%mm0                 \n\t"
                    "punpckhwd %%mm1, %%mm3                 \n\t"
                    "movq %%mm2, %%mm1                      \n\t"
                    "punpcklwd %%mm4, %%mm2                 \n\t"
                    "punpckhwd %%mm4, %%mm1                 \n\t"
    
                    "movd %%mm0, 4(%0)                      \n\t"
                    "psrlq $32, %%mm0                       \n\t"
                    "movd %%mm0, 4(%%"REG_a")               \n\t"
                    "movd %%mm3, 4(%%"REG_a", %1)           \n\t"
                    "psrlq $32, %%mm3                       \n\t"
                    "movd %%mm3, 4(%%"REG_a", %1, 2)        \n\t"
                    "movd %%mm2, 4(%0, %1, 4)               \n\t"
                    "psrlq $32, %%mm2                       \n\t"
                    "movd %%mm2, 4(%%"REG_d")               \n\t"
                    "movd %%mm1, 4(%%"REG_d", %1)           \n\t"
                    "psrlq $32, %%mm1                       \n\t"
                    "movd %%mm1, 4(%%"REG_d", %1, 2)        \n\t"
    
            :: "r" (dst), "r" ((long)dstStride), "r" (src)
            : "%"REG_a, "%"REG_d
            );
    
    #ifndef HAVE_ALTIVEC
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /**
     * Temporal noise reducer for one 8x8 block.
     *
     * Measures how much the current block (src) differs from the temporally
     * blurred reference (tempBlured), combines that measure with the values
     * stored at tempBluredPast[-256], [-1], [+1] and [+256], and, depending on
     * how the result compares with maxNoise[0..2], either copies src into
     * tempBlured or blends the two and writes the blend to both buffers.
     *
     * @param src            current 8x8 block; overwritten in the blending cases
     * @param stride         distance between two rows, used for src and tempBlured
     * @param tempBlured     temporally blurred reference block; written in every case
     * @param tempBluredPast difference history; entry [0] is updated and entries
     *                       [127..129] are overwritten with maxNoise[0..2]
     * @param maxNoise       three thresholds selecting the blend strength
     */
    static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,

                                        uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)

            // to save a register (FIXME do this outside of the loops)
            // The thresholds are parked next to the history pointer so that the asm
            // below can reach them at byte offsets 508, 512 and 516 from tempBluredPast.
            tempBluredPast[127]= maxNoise[0];
            tempBluredPast[128]= maxNoise[1];
            tempBluredPast[129]= maxNoise[2];

    #define FAST_L2_DIFF
    //#define L1_DIFF //u should change the thresholds too if u try that one
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)

            // Operands: %0 = src, %1 = tempBlured, %2 = stride, %3 = tempBluredPast (memory).
            asm volatile(
                    "lea (%2, %2, 2), %%"REG_a"             \n\t" // 3*stride
                    "lea (%2, %2, 4), %%"REG_d"             \n\t" // 5*stride
                    "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
    //      0       1       2       3       4       5       6       7       8       9
    //      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2

    //FIXME reorder?
    // NOTE(review): three difference variants follow (sum of absolute differences,
    // an approximate squared difference and an exact squared difference). The
    // preprocessor lines that should separate them, and the define lines that
    // should open the two REAL_L2_DIFF_CORE macro bodies, are not present in this
    // view -- confirm against the complete file.
    #ifdef L1_DIFF //needs mmx2

                    "movq (%0), %%mm0                       \n\t" // L0
                    "psadbw (%1), %%mm0                     \n\t" // |L0-R0|
                    "movq (%0, %2), %%mm1                   \n\t" // L1
                    "psadbw (%1, %2), %%mm1                 \n\t" // |L1-R1|
                    "movq (%0, %2, 2), %%mm2                \n\t" // L2
                    "psadbw (%1, %2, 2), %%mm2              \n\t" // |L2-R2|
                    "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
                    "psadbw (%1, %%"REG_a"), %%mm3          \n\t" // |L3-R3|

                    "movq (%0, %2, 4), %%mm4                \n\t" // L4
                    "paddw %%mm1, %%mm0                     \n\t"
                    "psadbw (%1, %2, 4), %%mm4              \n\t" // |L4-R4|
                    "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
                    "paddw %%mm2, %%mm0                     \n\t"
                    "psadbw (%1, %%"REG_d"), %%mm5          \n\t" // |L5-R5|
                    "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
                    "paddw %%mm3, %%mm0                     \n\t"
                    "psadbw (%1, %%"REG_a", 2), %%mm6       \n\t" // |L6-R6|
                    "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
                    "paddw %%mm4, %%mm0                     \n\t"
                    "psadbw (%1, %%"REG_c"), %%mm7          \n\t" // |L7-R7|
                    "paddw %%mm5, %%mm6                     \n\t"
                    "paddw %%mm7, %%mm6                     \n\t"
                    "paddw %%mm6, %%mm0                     \n\t"

                    "pcmpeqb %%mm7, %%mm7                   \n\t"
                    "movq "MANGLE(b80)", %%mm6              \n\t"
                    "pxor %%mm0, %%mm0                      \n\t"

                    "movq " #a ", %%mm5                     \n\t"\
                    "movq " #b ", %%mm2                     \n\t"\
                    "pxor %%mm7, %%mm2                      \n\t"\
                    PAVGB(%%mm2, %%mm5)\
                    "paddb %%mm6, %%mm5                     \n\t"\
                    "movq %%mm5, %%mm2                      \n\t"\
                    "psllw $8, %%mm5                        \n\t"\
                    "pmaddwd %%mm5, %%mm5                   \n\t"\
                    "pmaddwd %%mm2, %%mm2                   \n\t"\
                    "paddd %%mm2, %%mm5                     \n\t"\
                    "psrld $14, %%mm5                       \n\t"\
                    "paddd %%mm5, %%mm0                     \n\t"

                    "pxor %%mm7, %%mm7                      \n\t"
                    "pxor %%mm0, %%mm0                      \n\t"

                    "movq " #a ", %%mm5                     \n\t"\
                    "movq " #b ", %%mm2                     \n\t"\
                    "movq %%mm5, %%mm1                      \n\t"\
                    "movq %%mm2, %%mm3                      \n\t"\
                    "punpcklbw %%mm7, %%mm5                 \n\t"\
                    "punpckhbw %%mm7, %%mm1                 \n\t"\
                    "punpcklbw %%mm7, %%mm2                 \n\t"\
                    "punpckhbw %%mm7, %%mm3                 \n\t"\
                    "psubw %%mm2, %%mm5                     \n\t"\
                    "psubw %%mm3, %%mm1                     \n\t"\
                    "pmaddwd %%mm5, %%mm5                   \n\t"\
                    "pmaddwd %%mm1, %%mm1                   \n\t"\
                    "paddd %%mm1, %%mm5                     \n\t"\
                    "paddd %%mm5, %%mm0                     \n\t"


    // Extra expansion level so that macro arguments such as REGa are expanded
    // before they are stringified with # inside REAL_L2_DIFF_CORE.
    #define L2_DIFF_CORE(a, b)  REAL_L2_DIFF_CORE(a, b)


    // One invocation per row of the 8x8 block, accumulating into mm0.
    L2_DIFF_CORE((%0)          , (%1))
    L2_DIFF_CORE((%0, %2)      , (%1, %2))
    L2_DIFF_CORE((%0, %2, 2)   , (%1, %2, 2))
    L2_DIFF_CORE((%0, %%REGa)  , (%1, %%REGa))
    L2_DIFF_CORE((%0, %2, 4)   , (%1, %2, 4))
    L2_DIFF_CORE((%0, %%REGd)  , (%1, %%REGd))

    L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))

    L2_DIFF_CORE((%0, %%REGc)  , (%1, %%REGc))

                    // Add the two dwords of mm0, move the sum to ecx and scale it by 4.
                    "movq %%mm0, %%mm4                      \n\t"
                    "psrlq $32, %%mm0                       \n\t"
                    "paddd %%mm0, %%mm4                     \n\t"
                    "movd %%mm4, %%ecx                      \n\t"
                    "shll $2, %%ecx                         \n\t"
                    // REG_d = tempBluredPast. Add the entries at index -1, +1, -256
                    // and +256 (4 bytes each) plus 4 for rounding, then divide by 8.
                    "mov %3, %%"REG_d"                      \n\t"
                    "addl -4(%%"REG_d"), %%ecx              \n\t"
                    "addl 4(%%"REG_d"), %%ecx               \n\t"
                    "addl -1024(%%"REG_d"), %%ecx           \n\t"
                    "addl $4, %%ecx                         \n\t"
                    "addl 1024(%%"REG_d"), %%ecx            \n\t"
                    "shrl $3, %%ecx                         \n\t"
                    // The combined value is stored back as the new history entry.
                    "movl %%ecx, (%%"REG_d")                \n\t"

    //                "mov %3, %%"REG_c"                      \n\t"
    //                "mov %%"REG_c", test                    \n\t"
    //                "jmp 4f                                 \n\t"
                    // Offsets 508/512/516 are tempBluredPast[127..129], i.e. the
                    // maxNoise[0..2] copies written before the asm statement.
                    "cmpl 512(%%"REG_d"), %%ecx             \n\t"
                    " jb 2f                                 \n\t"
                    "cmpl 516(%%"REG_d"), %%ecx             \n\t"
                    " jb 1f                                 \n\t"

                    // Not below maxNoise[1] and not below maxNoise[2]:
                    // copy the 8 rows of src into tempBlured, src is left as is.
                    // REG_d and ecx were reused above, so 5*stride and 7*stride
                    // are rebuilt at the start of every branch.
                    "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
                    "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
                    "movq (%0), %%mm0                       \n\t" // L0
                    "movq (%0, %2), %%mm1                   \n\t" // L1
                    "movq (%0, %2, 2), %%mm2                \n\t" // L2
                    "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
                    "movq (%0, %2, 4), %%mm4                \n\t" // L4
                    "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
                    "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
                    "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
                    "movq %%mm0, (%1)                       \n\t" // L0
                    "movq %%mm1, (%1, %2)                   \n\t" // L1
                    "movq %%mm2, (%1, %2, 2)                \n\t" // L2
                    "movq %%mm3, (%1, %%"REG_a")            \n\t" // L3
                    "movq %%mm4, (%1, %2, 4)                \n\t" // L4
                    "movq %%mm5, (%1, %%"REG_d")            \n\t" // L5
                    "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // L6
                    "movq %%mm7, (%1, %%"REG_c")            \n\t" // L7
                    "jmp 4f                                 \n\t"

                    // Label 1: not below maxNoise[1] but below maxNoise[2].
                    // One PAVGB of every src row with the tempBlured row, result
                    // stored to both buffers. PAVGB is defined outside this view;
                    // presumably a byte-wise rounded average into its second
                    // argument -- TODO confirm.
                    "1:                                     \n\t"
                    "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
                    "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
                    "movq (%0), %%mm0                       \n\t" // L0
                    PAVGB((%1), %%mm0)                            // L0
                    "movq (%0, %2), %%mm1                   \n\t" // L1
                    PAVGB((%1, %2), %%mm1)                        // L1
                    "movq (%0, %2, 2), %%mm2                \n\t" // L2
                    PAVGB((%1, %2, 2), %%mm2)                     // L2
                    "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
                    PAVGB((%1, %%REGa), %%mm3)                    // L3
                    "movq (%0, %2, 4), %%mm4                \n\t" // L4
                    PAVGB((%1, %2, 4), %%mm4)                     // L4
                    "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
                    PAVGB((%1, %%REGd), %%mm5)                    // L5
                    "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
                    PAVGB((%1, %%REGa, 2), %%mm6)                 // L6
                    "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
                    PAVGB((%1, %%REGc), %%mm7)                    // L7
                    "movq %%mm0, (%1)                       \n\t" // R0
                    "movq %%mm1, (%1, %2)                   \n\t" // R1
                    "movq %%mm2, (%1, %2, 2)                \n\t" // R2
                    "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
                    "movq %%mm4, (%1, %2, 4)                \n\t" // R4
                    "movq %%mm5, (%1, %%"REG_d")            \n\t" // R5
                    "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // R6
                    "movq %%mm7, (%1, %%"REG_c")            \n\t" // R7
                    "movq %%mm0, (%0)                       \n\t" // L0
                    "movq %%mm1, (%0, %2)                   \n\t" // L1
                    "movq %%mm2, (%0, %2, 2)                \n\t" // L2
                    "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
                    "movq %%mm4, (%0, %2, 4)                \n\t" // L4
                    "movq %%mm5, (%0, %%"REG_d")            \n\t" // L5
                    "movq %%mm6, (%0, %%"REG_a", 2)         \n\t" // L6
                    "movq %%mm7, (%0, %%"REG_c")            \n\t" // L7
                    "jmp 4f                                 \n\t"

                    // Label 2: below maxNoise[1]. REG_d still holds tempBluredPast
                    // here, so offset 508 is the maxNoise[0] copy.
                    "2:                                     \n\t"
                    "cmpl 508(%%"REG_d"), %%ecx             \n\t"
                    " jb 3f                                 \n\t"

                    // Not below maxNoise[0]: PAVGB applied twice with the same
                    // tempBlured row, four rows at a time, stored to both buffers.
                    "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
                    "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
                    "movq (%0), %%mm0                       \n\t" // L0
                    "movq (%0, %2), %%mm1                   \n\t" // L1
                    "movq (%0, %2, 2), %%mm2                \n\t" // L2
                    "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
                    "movq (%1), %%mm4                       \n\t" // R0
                    "movq (%1, %2), %%mm5                   \n\t" // R1
                    "movq (%1, %2, 2), %%mm6                \n\t" // R2
                    "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    "movq %%mm0, (%1)                       \n\t" // R0
                    "movq %%mm1, (%1, %2)                   \n\t" // R1
                    "movq %%mm2, (%1, %2, 2)                \n\t" // R2
                    "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
                    "movq %%mm0, (%0)                       \n\t" // L0
                    "movq %%mm1, (%0, %2)                   \n\t" // L1
                    "movq %%mm2, (%0, %2, 2)                \n\t" // L2
                    "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3

                    "movq (%0, %2, 4), %%mm0                \n\t" // L4
                    "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
                    "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
                    "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
                    "movq (%1, %2, 4), %%mm4                \n\t" // R4
                    "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
                    "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
                    "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    "movq %%mm0, (%1, %2, 4)                \n\t" // R4
                    "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
                    "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
                    "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
                    "movq %%mm0, (%0, %2, 4)                \n\t" // L4
                    "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
                    "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
                    "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
                    "jmp 4f                                 \n\t"

                    // Label 3: below maxNoise[0]. Same as the previous branch but
                    // PAVGB is applied three times with the same tempBlured row.
                    "3:                                     \n\t"
                    "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
                    "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
                    "movq (%0), %%mm0                       \n\t" // L0
                    "movq (%0, %2), %%mm1                   \n\t" // L1
                    "movq (%0, %2, 2), %%mm2                \n\t" // L2
                    "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
                    "movq (%1), %%mm4                       \n\t" // R0
                    "movq (%1, %2), %%mm5                   \n\t" // R1
                    "movq (%1, %2, 2), %%mm6                \n\t" // R2
                    "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    "movq %%mm0, (%1)                       \n\t" // R0
                    "movq %%mm1, (%1, %2)                   \n\t" // R1
                    "movq %%mm2, (%1, %2, 2)                \n\t" // R2
                    "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
                    "movq %%mm0, (%0)                       \n\t" // L0
                    "movq %%mm1, (%0, %2)                   \n\t" // L1
                    "movq %%mm2, (%0, %2, 2)                \n\t" // L2
                    "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3

                    "movq (%0, %2, 4), %%mm0                \n\t" // L4
                    "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
                    "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
                    "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
                    "movq (%1, %2, 4), %%mm4                \n\t" // R4
                    "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
                    "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
                    "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    PAVGB(%%mm4, %%mm0)
                    PAVGB(%%mm5, %%mm1)
                    PAVGB(%%mm6, %%mm2)
                    PAVGB(%%mm7, %%mm3)
                    "movq %%mm0, (%1, %2, 4)                \n\t" // R4
                    "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
                    "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
                    "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
                    "movq %%mm0, (%0, %2, 4)                \n\t" // L4
                    "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
                    "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
                    "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7

                    // Label 4: common exit of all branches.
                    "4:                                     \n\t"

                    :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
                    : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
                    );

    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)

            // Plain C version of the same decision and blending logic.
            int y;
            int d=0;
    //        int sysd=0;
            int i;

            // d = sum of squared differences between reference and current block
            for(y=0; y<8; y++)
            {
                    int x;
                    for(x=0; x<8; x++)
                    {
                            int ref= tempBlured[ x + y*stride ];
                            int cur= src[ x + y*stride ];
                            int d1=ref - cur;
    //                        if(x==0 || x==7) d1+= d1>>1;
    //                        if(y==0 || y==7) d1+= d1>>1;

    //                        d+= FFABS(d1);

                            d+= d1*d1;
    //                        sysd+= d1;
                    }
            }
            // Keep the raw sum in i, then combine 4*d with the history entries at
            // index -256, -1, +1 and +256, round and divide by 8.
            i=d;
            d=         (
                    4*d
                    +(*(tempBluredPast-256))
                    +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
                    +(*(tempBluredPast+256))
                    +4)>>3;
            // The raw sum i is what gets stored as the new history entry.
            // NOTE(review): the asm version above stores the combined value
            // instead -- confirm which of the two is intended.
            *tempBluredPast=i;
    //        ((*tempBluredPast)*3 + d + 2)>>2;

    /*
    Switch between
     1  0  0  0  0  0  0  (0)
    64 32 16  8  4  2  1  (1)
    64 48 36 27 20 15 11 (33) (approx)
    64 56 49 43 37 33 29 (200) (approx)
    */

            if(d > maxNoise[1])
            {
                    if(d < maxNoise[2])
                    {
                            // equal-weight rounded average, written to both buffers
                            for(y=0; y<8; y++)
                            {
                                    int x;
                                    for(x=0; x<8; x++)
                                    {
                                            int ref= tempBlured[ x + y*stride ];
                                            int cur= src[ x + y*stride ];
                                            tempBlured[ x + y*stride ]=
                                            src[ x + y*stride ]=
                                                    (ref + cur + 1)>>1;
                                    }
                            }
                    }
                    else
                    {
                            // largest difference: the reference is replaced by the
                            // current block, src itself is not modified
                            for(y=0; y<8; y++)
                            {
                                    int x;
                                    for(x=0; x<8; x++)
                                    {
                                            tempBlured[ x + y*stride ]= src[ x + y*stride ];
                                    }
                            }
                    }
            }
            else
            {
                    if(d < maxNoise[0])
                    {
                            // smallest difference: 7 parts reference, 1 part current
                            for(y=0; y<8; y++)
                            {
                                    int x;
                                    for(x=0; x<8; x++)
                                    {
                                            int ref= tempBlured[ x + y*stride ];
                                            int cur= src[ x + y*stride ];
                                            tempBlured[ x + y*stride ]=
                                            src[ x + y*stride ]=
                                                    (ref*7 + cur + 4)>>3;
                                    }
                            }
                    }
                    else
                    {
                            // 3 parts reference, 1 part current
                            for(y=0; y<8; y++)
                            {
                                    int x;
                                    for(x=0; x<8; x++)
                                    {
                                            int ref= tempBlured[ x + y*stride ];
                                            int cur= src[ x + y*stride ];
                                            tempBlured[ x + y*stride ]=
                                            src[ x + y*stride ]=
                                                    (ref*3 + cur + 2)>>2;
                                    }
                            }
                    }
            }

    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    #endif //HAVE_ALTIVEC
    
    #ifdef HAVE_MMX
    /**
     * accurate deblock filter
     */
    
    static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
    
            int64_t dc_mask, eq_mask, both_masks;
            int64_t sums[10*8*2];
            src+= step*3; // src points to begin of the 8x8 Block
    
                    "movq %0, %%mm7                         \n\t"
                    "movq %1, %%mm6                         \n\t"
    
                    : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
                    );
    
                    "lea (%2, %3), %%"REG_a"                \n\t"
    //      0       1       2       3       4       5       6       7       8       9
    //      %2      a       a+%3    a+2%3   %2+4%3  a'      a'+%3   a'+2%3  %2+8%3  a'+4%3
    //      (a = REG_a = %2+%3; a' = REG_a after it is advanced by 4*%3 further down)
    
                    "movq (%2), %%mm0                       \n\t"
                    "movq (%%"REG_a"), %%mm1                \n\t"
                    "movq %%mm1, %%mm3                      \n\t"
                    "movq %%mm1, %%mm4                      \n\t"
                    "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
                    "paddb %%mm7, %%mm0                     \n\t"
                    "pcmpgtb %%mm6, %%mm0                   \n\t"
    
                    "movq (%%"REG_a",%3), %%mm2             \n\t"
    
                    PMAXUB(%%mm2, %%mm4)
                    PMINUB(%%mm2, %%mm3, %%mm5)
    
                    "psubb %%mm2, %%mm1                     \n\t"
                    "paddb %%mm7, %%mm1                     \n\t"
                    "pcmpgtb %%mm6, %%mm1                   \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"
    
                    "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
    
                    PMAXUB(%%mm1, %%mm4)
                    PMINUB(%%mm1, %%mm3, %%mm5)
    
                    "psubb %%mm1, %%mm2                     \n\t"
                    "paddb %%mm7, %%mm2                     \n\t"
                    "pcmpgtb %%mm6, %%mm2                   \n\t"
                    "paddb %%mm2, %%mm0                     \n\t"
    
                    "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
    
                    "movq (%2, %3, 4), %%mm2                \n\t"
    
                    PMAXUB(%%mm2, %%mm4)
                    PMINUB(%%mm2, %%mm3, %%mm5)
    
                    "psubb %%mm2, %%mm1                     \n\t"
                    "paddb %%mm7, %%mm1                     \n\t"
                    "pcmpgtb %%mm6, %%mm1                   \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"
    
                    "movq (%%"REG_a"), %%mm1                \n\t"
    
                    PMAXUB(%%mm1, %%mm4)
                    PMINUB(%%mm1, %%mm3, %%mm5)
    
                    "psubb %%mm1, %%mm2                     \n\t"
                    "paddb %%mm7, %%mm2                     \n\t"
                    "pcmpgtb %%mm6, %%mm2                   \n\t"
                    "paddb %%mm2, %%mm0                     \n\t"
    
                    "movq (%%"REG_a", %3), %%mm2            \n\t"
    
                    PMAXUB(%%mm2, %%mm4)
                    PMINUB(%%mm2, %%mm3, %%mm5)
    
                    "psubb %%mm2, %%mm1                     \n\t"
                    "paddb %%mm7, %%mm1                     \n\t"
                    "pcmpgtb %%mm6, %%mm1                   \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"
    
                    "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
    
                    PMAXUB(%%mm1, %%mm4)
                    PMINUB(%%mm1, %%mm3, %%mm5)
    
                    "psubb %%mm1, %%mm2                     \n\t"
                    "paddb %%mm7, %%mm2                     \n\t"
                    "pcmpgtb %%mm6, %%mm2                   \n\t"
                    "paddb %%mm2, %%mm0                     \n\t"
    
                    "movq (%2, %3, 8), %%mm2                \n\t"
    
                    PMAXUB(%%mm2, %%mm4)
                    PMINUB(%%mm2, %%mm3, %%mm5)
    
                    "psubb %%mm2, %%mm1                     \n\t"
                    "paddb %%mm7, %%mm1                     \n\t"
                    "pcmpgtb %%mm6, %%mm1                   \n\t"
                    "paddb %%mm1, %%mm0                     \n\t"
    
                    "movq (%%"REG_a", %3, 4), %%mm1         \n\t"
                    "psubb %%mm1, %%mm2                     \n\t"
                    "paddb %%mm7, %%mm2                     \n\t"
                    "pcmpgtb %%mm6, %%mm2                   \n\t"
                    "paddb %%mm2, %%mm0                     \n\t"
                    "psubusb %%mm3, %%mm4                   \n\t"
    
                    "pxor %%mm6, %%mm6                      \n\t"
                    "movq %4, %%mm7                         \n\t" // QP,..., QP
                    "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
                    "psubusb %%mm4, %%mm7                   \n\t" // Diff >=2QP -> 0
                    "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
                    "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> FF
                    "movq %%mm7, %1                         \n\t"
    
                    "movq %5, %%mm7                         \n\t"
                    "punpcklbw %%mm7, %%mm7                 \n\t"
                    "punpcklbw %%mm7, %%mm7                 \n\t"
                    "punpcklbw %%mm7, %%mm7                 \n\t"
                    "psubb %%mm0, %%mm6                     \n\t"
                    "pcmpgtb %%mm7, %%mm6                   \n\t"
                    "movq %%mm6, %0                         \n\t"
    
                    : "=m" (eq_mask), "=m" (dc_mask)
                    : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
                    : "%"REG_a
                    );
    
            both_masks = dc_mask & eq_mask;
    
            if(both_masks){
                    long offset= -8*step;
                    int64_t *temp_sums= sums;
    
                    asm volatile(
                    "movq %2, %%mm0                         \n\t"  // QP,..., QP
                    "pxor %%mm4, %%mm4                      \n\t"
    
                    "movq (%0), %%mm6                       \n\t"
                    "movq (%0, %1), %%mm5                   \n\t"
                    "movq %%mm5, %%mm1                      \n\t"
                    "movq %%mm6, %%mm2                      \n\t"
                    "psubusb %%mm6, %%mm5                   \n\t"
                    "psubusb %%mm1, %%mm2                   \n\t"
                    "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
                    "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
                    "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
    
                    "pxor %%mm6, %%mm1                      \n\t"
                    "pand %%mm0, %%mm1                      \n\t"
                    "pxor %%mm1, %%mm6                      \n\t"
                    // 0:QP  6:First
    
                    "movq (%0, %1, 8), %%mm5                \n\t"
                    "add %1, %0                             \n\t" // %0 points to line 1 not 0
                    "movq (%0, %1, 8), %%mm7                \n\t"
                    "movq %%mm5, %%mm1                      \n\t"
                    "movq %%mm7, %%mm2                      \n\t"
                    "psubusb %%mm7, %%mm5                   \n\t"
                    "psubusb %%mm1, %%mm2                   \n\t"
                    "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
                    "movq %2, %%mm0                         \n\t"  // QP,..., QP
                    "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
                    "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
    
                    "pxor %%mm7, %%mm1                      \n\t"
                    "pand %%mm0, %%mm1                      \n\t"
                    "pxor %%mm1, %%mm7                      \n\t"
    
                    "movq %%mm6, %%mm5                      \n\t"
                    "punpckhbw %%mm4, %%mm6                 \n\t"
                    "punpcklbw %%mm4, %%mm5                 \n\t"
                    // 4:0 5/6:First 7:Last
    
                    "movq %%mm5, %%mm0                      \n\t"
                    "movq %%mm6, %%mm1                      \n\t"
                    "psllw $2, %%mm0                        \n\t"
                    "psllw $2, %%mm1                        \n\t"
                    "paddw "MANGLE(w04)", %%mm0             \n\t"
                    "paddw "MANGLE(w04)", %%mm1             \n\t"
    
                    "movq (%0), %%mm2                       \n\t"\
                    "movq (%0), %%mm3                       \n\t"\
                    "add %1, %0                             \n\t"\
                    "punpcklbw %%mm4, %%mm2                 \n\t"\
                    "punpckhbw %%mm4, %%mm3                 \n\t"\
                    "paddw %%mm2, %%mm0                     \n\t"\
                    "paddw %%mm3, %%mm1                     \n\t"
    
                    "movq (%0), %%mm2                       \n\t"\
                    "movq (%0), %%mm3                       \n\t"\
                    "add %1, %0                             \n\t"\
                    "punpcklbw %%mm4, %%mm2                 \n\t"\
                    "punpckhbw %%mm4, %%mm3                 \n\t"\
                    "psubw %%mm2, %%mm0                     \n\t"\
                    "psubw %%mm3, %%mm1                     \n\t"
    
    
                    NEXT //0
                    NEXT //1
                    NEXT //2
                    "movq %%mm0, (%3)                       \n\t"
                    "movq %%mm1, 8(%3)                      \n\t"
    
                    NEXT //3
                    "psubw %%mm5, %%mm0                     \n\t"
                    "psubw %%mm6, %%mm1                     \n\t"
                    "movq %%mm0, 16(%3)                     \n\t"
                    "movq %%mm1, 24(%3)                     \n\t"
    
                    NEXT //4
                    "psubw %%mm5, %%mm0                     \n\t"
                    "psubw %%mm6, %%mm1                     \n\t"
                    "movq %%mm0, 32(%3)                     \n\t"
                    "movq %%mm1, 40(%3)                     \n\t"
    
                    NEXT //5
                    "psubw %%mm5, %%mm0                     \n\t"
                    "psubw %%mm6, %%mm1                     \n\t"
                    "movq %%mm0, 48(%3)                     \n\t"
                    "movq %%mm1, 56(%3)                     \n\t"
    
                    NEXT //6
                    "psubw %%mm5, %%mm0                     \n\t"
                    "psubw %%mm6, %%mm1                     \n\t"
                    "movq %%mm0, 64(%3)                     \n\t"
                    "movq %%mm1, 72(%3)                     \n\t"
    
                    "movq %%mm7, %%mm6                      \n\t"
                    "punpckhbw %%mm4, %%mm7                 \n\t"
                    "punpcklbw %%mm4, %%mm6                 \n\t"
    
                    NEXT //7
                    "mov %4, %0                             \n\t"
                    "add %1, %0                             \n\t"
                    PREV //0
                    "movq %%mm0, 80(%3)                     \n\t"
                    "movq %%mm1, 88(%3)                     \n\t"
    
                    PREV //1
                    "paddw %%mm6, %%mm0                     \n\t"
                    "paddw %%mm7, %%mm1                     \n\t"
                    "movq %%mm0, 96(%3)                     \n\t"
                    "movq %%mm1, 104(%3)                    \n\t"
    
                    PREV //2
                    "paddw %%mm6, %%mm0                     \n\t"
                    "paddw %%mm7, %%mm1                     \n\t"
                    "movq %%mm0, 112(%3)                    \n\t"
                    "movq %%mm1, 120(%3)                    \n\t"
    
                    PREV //3
                    "paddw %%mm6, %%mm0                     \n\t"
                    "paddw %%mm7, %%mm1                     \n\t"
                    "movq %%mm0, 128(%3)                    \n\t"
                    "movq %%mm1, 136(%3)                    \n\t"
    
                    PREV //4
                    "paddw %%mm6, %%mm0                     \n\t"
                    "paddw %%mm7, %%mm1                     \n\t"
                    "movq %%mm0, 144(%3)                    \n\t"
                    "movq %%mm1, 152(%3)                    \n\t"
    
                    "mov %4, %0                             \n\t" //FIXME
    
                    : "+&r"(src)
                    : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
                    );
    
                    src+= step; // src points to begin of the 8x8 Block
    
                    asm volatile(
                    "movq %4, %%mm6                         \n\t"
                    "pcmpeqb %%mm5, %%mm5                   \n\t"
                    "pxor %%mm6, %%mm5                      \n\t"
                    "pxor %%mm7, %%mm7                      \n\t"
    
                    "1:                                     \n\t"
                    "movq (%1), %%mm0                       \n\t"
                    "movq 8(%1), %%mm1                      \n\t"
                    "paddw 32(%1), %%mm0                    \n\t"
                    "paddw 40(%1), %%mm1                    \n\t"
                    "movq (%0, %3), %%mm2                   \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "movq %%mm2, %%mm4                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t"
                    "punpckhbw %%mm7, %%mm3                 \n\t"
                    "paddw %%mm2, %%mm0                     \n\t"
                    "paddw %%mm3, %%mm1                     \n\t"
                    "paddw %%mm2, %%mm0                     \n\t"
                    "paddw %%mm3, %%mm1                     \n\t"
                    "psrlw $4, %%mm0                        \n\t"
                    "psrlw $4, %%mm1                        \n\t"
                    "packuswb %%mm1, %%mm0                  \n\t"
                    "pand %%mm6, %%mm0                      \n\t"
                    "pand %%mm5, %%mm4                      \n\t"
                    "por %%mm4, %%mm0                       \n\t"
                    "movq %%mm0, (%0, %3)                   \n\t"
                    "add $16, %1                            \n\t"
                    "add %2, %0                             \n\t"
                    " js 1b                                 \n\t"
    
                    : "+r"(offset), "+r"(temp_sums)
                    : "r" ((long)step), "r"(src - offset), "m"(both_masks)
                    );
            }else
                    src+= step; // src points to begin of the 8x8 Block
    
            if(eq_mask != -1LL){
                    uint8_t *temp_src= src;
                    asm volatile(
                    "pxor %%mm7, %%mm7                      \n\t"
                    "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
                    "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %1+8%1  ecx+4%1
    
                    "movq (%0), %%mm0                       \n\t"
                    "movq %%mm0, %%mm1                      \n\t"
                    "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
                    "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
    
                    "movq (%0, %1), %%mm2                   \n\t"
                    "lea (%0, %1, 2), %%"REG_a"             \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
                    "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
    
                    "movq (%%"REG_a"), %%mm4                \n\t"
                    "movq %%mm4, %%mm5                      \n\t"
                    "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
                    "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
    
                    "paddw %%mm0, %%mm0                     \n\t" // 2L0
                    "paddw %%mm1, %%mm1                     \n\t" // 2H0
                    "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
                    "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
    
                    "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
                    "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
    
                    "movq (%%"REG_a", %1), %%mm2            \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t" // L3
                    "punpckhbw %%mm7, %%mm3                 \n\t" // H3
    
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
                    "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                    "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
                    "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                    "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
                    "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
                    "movq %%mm0, %%mm1                      \n\t"
                    "punpcklbw %%mm7, %%mm0                 \n\t" // L4
                    "punpckhbw %%mm7, %%mm1                 \n\t" // H4
    
                    "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
                    "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
                    "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
                    "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
                    "paddw %%mm4, %%mm4                     \n\t" // 2L2
                    "paddw %%mm5, %%mm5                     \n\t" // 2H2
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
    
                    "lea (%%"REG_a", %1), %0                \n\t"
                    "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
                    "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
    
                    "movq (%0, %1, 2), %%mm2                \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t" // L5
                    "punpckhbw %%mm7, %%mm3                 \n\t" // H5
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
                    "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
                    "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5