Skip to content
Snippets Groups Projects
postprocess_template.c 162 KiB
Newer Older
  • Learn to ignore specific revisions
  •             a = colsrc[0       ];
                b = colsrc[stride  ];
                c = colsrc[stride*2];
                d = (a-b)>>31;
                e = (b-c)>>31;
                f = (c-a)>>31;
                colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
                colsrc += stride*2;
    
    /**
     * Transposes and shifts the given 8x8 block into dst1 and dst2.
     *
     * Inline-asm operands: %0 = src, %1 = srcStride (widened to long),
     * %2 = dst1, %3 = dst2. REG_a is used as a scratch row pointer and is
     * listed as clobbered.
     *
     * The block is handled in two passes of four source rows each. In every
     * pass the four 8-byte rows are interleaved bytewise and then wordwise, so
     * that each 32-bit half of mm0/mm3/mm2/mm1 holds one source column (4 bytes,
     * one per row). Those columns are stored with movd at 16-byte steps:
     *   columns 0..4 -> dst1 + 128, 144, 160, 176, 192
     *   columns 3..7 -> dst2 +  48,  64,  80,  96, 112
     * (columns 3 and 4 are written to both destinations). The second pass
     * writes to the same locations plus 4 bytes.
     *
     * NOTE(review): REG_a is presumably src + srcStride when the first load
     * executes (that is what the layout comment below implies) - confirm.
     * NOTE(review): the clobber list names only REG_a; the stores through
     * dst1/dst2 are not declared as outputs or as a "memory" clobber - confirm
     * that this is safe with the compilers in use.
     */
    
    static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
    
            // first pass: rows at %0, REG_a, REG_a+%1 and REG_a+2*%1
            "movq (%0), %%mm0                       \n\t" // 12345678
            "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
            "movq %%mm0, %%mm2                      \n\t" // 12345678
            "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
            "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
            "movq (%%"REG_a", %1), %%mm1            \n\t"
            "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
            "movq %%mm1, %%mm4                      \n\t"
            "punpcklbw %%mm3, %%mm1                 \n\t"
            "punpckhbw %%mm3, %%mm4                 \n\t"
    
            // combine the byte pairs of both row pairs into 4-byte columns
            "movq %%mm0, %%mm3                      \n\t"
            "punpcklwd %%mm1, %%mm0                 \n\t"
            "punpckhwd %%mm1, %%mm3                 \n\t"
            "movq %%mm2, %%mm1                      \n\t"
            "punpcklwd %%mm4, %%mm2                 \n\t"
            "punpckhwd %%mm4, %%mm1                 \n\t"
    
            // store one column per movd; %2 = dst1, %3 = dst2
            "movd %%mm0, 128(%2)                    \n\t"
            "psrlq $32, %%mm0                       \n\t"
            "movd %%mm0, 144(%2)                    \n\t"
            "movd %%mm3, 160(%2)                    \n\t"
            "psrlq $32, %%mm3                       \n\t"
            "movd %%mm3, 176(%2)                    \n\t"
            "movd %%mm3, 48(%3)                     \n\t"
            "movd %%mm2, 192(%2)                    \n\t"
            "movd %%mm2, 64(%3)                     \n\t"
            "psrlq $32, %%mm2                       \n\t"
            "movd %%mm2, 80(%3)                     \n\t"
            "movd %%mm1, 96(%3)                     \n\t"
            "psrlq $32, %%mm1                       \n\t"
            "movd %%mm1, 112(%3)                    \n\t"
    
            // second pass: move REG_a down by 4*srcStride and repeat
            "lea (%%"REG_a", %1, 4), %%"REG_a"      \n\t"
    
            "movq (%0, %1, 4), %%mm0                \n\t" // 12345678
            "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
            "movq %%mm0, %%mm2                      \n\t" // 12345678
            "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
            "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
            "movq (%%"REG_a", %1), %%mm1            \n\t"
            "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
            "movq %%mm1, %%mm4                      \n\t"
            "punpcklbw %%mm3, %%mm1                 \n\t"
            "punpckhbw %%mm3, %%mm4                 \n\t"
    
            "movq %%mm0, %%mm3                      \n\t"
            "punpcklwd %%mm1, %%mm0                 \n\t"
            "punpckhwd %%mm1, %%mm3                 \n\t"
            "movq %%mm2, %%mm1                      \n\t"
            "punpcklwd %%mm4, %%mm2                 \n\t"
            "punpckhwd %%mm4, %%mm1                 \n\t"
    
            // same destinations as the first pass, 4 bytes further on
            "movd %%mm0, 132(%2)                    \n\t"
            "psrlq $32, %%mm0                       \n\t"
            "movd %%mm0, 148(%2)                    \n\t"
            "movd %%mm3, 164(%2)                    \n\t"
            "psrlq $32, %%mm3                       \n\t"
            "movd %%mm3, 180(%2)                    \n\t"
            "movd %%mm3, 52(%3)                     \n\t"
            "movd %%mm2, 196(%2)                    \n\t"
            "movd %%mm2, 68(%3)                     \n\t"
            "psrlq $32, %%mm2                       \n\t"
            "movd %%mm2, 84(%3)                     \n\t"
            "movd %%mm1, 100(%3)                    \n\t"
            "psrlq $32, %%mm1                       \n\t"
            "movd %%mm1, 116(%3)                    \n\t"
    
    
    
            :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
            : "%"REG_a
    
    /**
     * Transposes an 8x8 byte block from src into dst.
     *
     * src is read as eight 8-byte rows at a fixed 16-byte pitch (byte offsets
     * 0, 16, ..., 112). dst is written as eight rows that are dstStride bytes
     * apart; byte c of dst row r is byte r of src row c.
     *
     * Inline-asm operands: %0 = dst, %1 = dstStride (widened to long),
     * %2 = src. REG_a is set to dst + dstStride and REG_d to dst + 5*dstStride;
     * both are listed as clobbered.
     *
     * NOTE(review): the stores through dst are not declared as outputs or as a
     * "memory" clobber - confirm that this is safe with the compilers in use.
     */
    static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
    
        asm(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a",%1,4), %%"REG_d"        \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
    
            // first pass: src rows 0..3 become bytes 0..3 of all eight dst rows
            "movq (%2), %%mm0                       \n\t" // 12345678
            "movq 16(%2), %%mm1                     \n\t" // abcdefgh
            "movq %%mm0, %%mm2                      \n\t" // 12345678
            "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
            "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
            "movq 32(%2), %%mm1                     \n\t"
            "movq 48(%2), %%mm3                     \n\t"
            "movq %%mm1, %%mm4                      \n\t"
            "punpcklbw %%mm3, %%mm1                 \n\t"
            "punpckhbw %%mm3, %%mm4                 \n\t"
    
            "movq %%mm0, %%mm3                      \n\t"
            "punpcklwd %%mm1, %%mm0                 \n\t"
            "punpckhwd %%mm1, %%mm3                 \n\t"
            "movq %%mm2, %%mm1                      \n\t"
            "punpcklwd %%mm4, %%mm2                 \n\t"
            "punpckhwd %%mm4, %%mm1                 \n\t"
    
            // each movd writes 4 bytes (one src column) to one dst row
            "movd %%mm0, (%0)                       \n\t"
            "psrlq $32, %%mm0                       \n\t"
            "movd %%mm0, (%%"REG_a")                \n\t"
            "movd %%mm3, (%%"REG_a", %1)            \n\t"
            "psrlq $32, %%mm3                       \n\t"
            "movd %%mm3, (%%"REG_a", %1, 2)         \n\t"
            "movd %%mm2, (%0, %1, 4)                \n\t"
            "psrlq $32, %%mm2                       \n\t"
            "movd %%mm2, (%%"REG_d")                \n\t"
            "movd %%mm1, (%%"REG_d", %1)            \n\t"
            "psrlq $32, %%mm1                       \n\t"
            "movd %%mm1, (%%"REG_d", %1, 2)         \n\t"
    
    
            // second pass: src rows 4..7 become bytes 4..7 of all eight dst rows
            "movq 64(%2), %%mm0                     \n\t" // 12345678
            "movq 80(%2), %%mm1                     \n\t" // abcdefgh
            "movq %%mm0, %%mm2                      \n\t" // 12345678
            "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
            "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
    
            "movq 96(%2), %%mm1                     \n\t"
            "movq 112(%2), %%mm3                    \n\t"
            "movq %%mm1, %%mm4                      \n\t"
            "punpcklbw %%mm3, %%mm1                 \n\t"
            "punpckhbw %%mm3, %%mm4                 \n\t"
    
            "movq %%mm0, %%mm3                      \n\t"
            "punpcklwd %%mm1, %%mm0                 \n\t"
            "punpckhwd %%mm1, %%mm3                 \n\t"
            "movq %%mm2, %%mm1                      \n\t"
            "punpcklwd %%mm4, %%mm2                 \n\t"
            "punpckhwd %%mm4, %%mm1                 \n\t"
    
            "movd %%mm0, 4(%0)                      \n\t"
            "psrlq $32, %%mm0                       \n\t"
            "movd %%mm0, 4(%%"REG_a")               \n\t"
            "movd %%mm3, 4(%%"REG_a", %1)           \n\t"
            "psrlq $32, %%mm3                       \n\t"
            "movd %%mm3, 4(%%"REG_a", %1, 2)        \n\t"
            "movd %%mm2, 4(%0, %1, 4)               \n\t"
            "psrlq $32, %%mm2                       \n\t"
            "movd %%mm2, 4(%%"REG_d")               \n\t"
            "movd %%mm1, 4(%%"REG_d", %1)           \n\t"
            "psrlq $32, %%mm1                       \n\t"
            "movd %%mm1, 4(%%"REG_d", %1, 2)        \n\t"
    
    
            :: "r" (dst), "r" ((long)dstStride), "r" (src)
            : "%"REG_a, "%"REG_d
    
    #ifndef HAVE_ALTIVEC
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /**
     * Noise reducer working on one 8x8 block.
     *
     * The block at src is compared with the block at tempBlurred (both use the
     * same stride) to obtain a difference measure d. d is smoothed with the
     * entries one position left/right and 256 positions before/after
     * *tempBlurredPast, and the smoothed value is stored back to
     * *tempBlurredPast. Depending on how that value compares with the three
     * thresholds in maxNoise, src is either copied into tempBlurred, or src and
     * tempBlurred are blended and the blend is written to both.
     *
     * @param src             current block; overwritten in every case except the plain copy
     * @param stride          distance in bytes between consecutive rows of src and tempBlurred
     * @param tempBlurred     reference block; always overwritten
     * @param tempBlurredPast entry for this block inside a larger uint32_t table;
     *                        entries [127], [128] and [129] are overwritten with
     *                        maxNoise[0], maxNoise[1] and maxNoise[2]
     * @param maxNoise        three thresholds, read as maxNoise[0..2]
     *
     * NOTE(review): at equality with maxNoise[1] the asm path (jb not taken)
     * takes the "large difference" side, while the C path tests
     * d > maxNoise[1] - confirm whether the difference is intended.
     */
    static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
    
                                        uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
    
        // to save a register (FIXME do this outside of the loops)
        // the asm below reads these back at byte offsets 508, 512 and 516
    
        tempBlurredPast[127]= maxNoise[0];
        tempBlurredPast[128]= maxNoise[1];
        tempBlurredPast[129]= maxNoise[2];
    
    #define FAST_L2_DIFF
    //#define L1_DIFF //you should change the thresholds too if you try that one
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        // operands: %0 = src, %1 = tempBlurred, %2 = stride (as long),
        // %3 = tempBlurredPast (memory operand)
        asm volatile(
            "lea (%2, %2, 2), %%"REG_a"             \n\t" // 3*stride
            "lea (%2, %2, 4), %%"REG_d"             \n\t" // 5*stride
            "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2
    
    //FIXME reorder?
    #ifdef L1_DIFF //needs mmx2
    
            "movq (%0), %%mm0                       \n\t" // L0
            "psadbw (%1), %%mm0                     \n\t" // |L0-R0|
            "movq (%0, %2), %%mm1                   \n\t" // L1
            "psadbw (%1, %2), %%mm1                 \n\t" // |L1-R1|
            "movq (%0, %2, 2), %%mm2                \n\t" // L2
            "psadbw (%1, %2, 2), %%mm2              \n\t" // |L2-R2|
            "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
            "psadbw (%1, %%"REG_a"), %%mm3          \n\t" // |L3-R3|
    
            "movq (%0, %2, 4), %%mm4                \n\t" // L4
            "paddw %%mm1, %%mm0                     \n\t"
            "psadbw (%1, %2, 4), %%mm4              \n\t" // |L4-R4|
            "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
            "paddw %%mm2, %%mm0                     \n\t"
            "psadbw (%1, %%"REG_d"), %%mm5          \n\t" // |L5-R5|
            "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
            "paddw %%mm3, %%mm0                     \n\t"
            "psadbw (%1, %%"REG_a", 2), %%mm6       \n\t" // |L6-R6|
            "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
            "paddw %%mm4, %%mm0                     \n\t"
            "psadbw (%1, %%"REG_c"), %%mm7          \n\t" // |L7-R7|
            "paddw %%mm5, %%mm6                     \n\t"
            "paddw %%mm7, %%mm6                     \n\t"
            "paddw %%mm6, %%mm0                     \n\t"
    
            "pcmpeqb %%mm7, %%mm7                   \n\t"
            "movq "MANGLE(b80)", %%mm6              \n\t"
            "pxor %%mm0, %%mm0                      \n\t"
    
            "movq " #a ", %%mm5                     \n\t"\
            "movq " #b ", %%mm2                     \n\t"\
            "pxor %%mm7, %%mm2                      \n\t"\
            PAVGB(%%mm2, %%mm5)\
            "paddb %%mm6, %%mm5                     \n\t"\
            "movq %%mm5, %%mm2                      \n\t"\
            "psllw $8, %%mm5                        \n\t"\
            "pmaddwd %%mm5, %%mm5                   \n\t"\
            "pmaddwd %%mm2, %%mm2                   \n\t"\
            "paddd %%mm2, %%mm5                     \n\t"\
            "psrld $14, %%mm5                       \n\t"\
            "paddd %%mm5, %%mm0                     \n\t"
    
            "pxor %%mm7, %%mm7                      \n\t"
            "pxor %%mm0, %%mm0                      \n\t"
    
            "movq " #a ", %%mm5                     \n\t"\
            "movq " #b ", %%mm2                     \n\t"\
            "movq %%mm5, %%mm1                      \n\t"\
            "movq %%mm2, %%mm3                      \n\t"\
            "punpcklbw %%mm7, %%mm5                 \n\t"\
            "punpckhbw %%mm7, %%mm1                 \n\t"\
            "punpcklbw %%mm7, %%mm2                 \n\t"\
            "punpckhbw %%mm7, %%mm3                 \n\t"\
            "psubw %%mm2, %%mm5                     \n\t"\
            "psubw %%mm3, %%mm1                     \n\t"\
            "pmaddwd %%mm5, %%mm5                   \n\t"\
            "pmaddwd %%mm1, %%mm1                   \n\t"\
            "paddd %%mm1, %%mm5                     \n\t"\
            "paddd %%mm5, %%mm0                     \n\t"
    
    
    #define L2_DIFF_CORE(a, b)  REAL_L2_DIFF_CORE(a, b)
    
    
    // accumulate the difference of all eight row pairs (src row, tempBlurred row) in mm0
    L2_DIFF_CORE((%0)          , (%1))
    L2_DIFF_CORE((%0, %2)      , (%1, %2))
    L2_DIFF_CORE((%0, %2, 2)   , (%1, %2, 2))
    L2_DIFF_CORE((%0, %%REGa)  , (%1, %%REGa))
    L2_DIFF_CORE((%0, %2, 4)   , (%1, %2, 4))
    L2_DIFF_CORE((%0, %%REGd)  , (%1, %%REGd))
    
    L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
    
    L2_DIFF_CORE((%0, %%REGc)  , (%1, %%REGc))
    
            // add the two 32-bit halves of mm0 and move the sum to ecx
            "movq %%mm0, %%mm4                      \n\t"
            "psrlq $32, %%mm0                       \n\t"
            "paddd %%mm0, %%mm4                     \n\t"
            "movd %%mm4, %%ecx                      \n\t"
            // ecx = (4*ecx + past[-1] + past[+1] + past[-256] + past[+256] + 4) >> 3,
            // where past = tempBlurredPast (4-byte entries); result is stored to past[0]
            "shll $2, %%ecx                         \n\t"
            "mov %3, %%"REG_d"                      \n\t"
            "addl -4(%%"REG_d"), %%ecx              \n\t"
            "addl 4(%%"REG_d"), %%ecx               \n\t"
            "addl -1024(%%"REG_d"), %%ecx           \n\t"
            "addl $4, %%ecx                         \n\t"
            "addl 1024(%%"REG_d"), %%ecx            \n\t"
            "shrl $3, %%ecx                         \n\t"
            "movl %%ecx, (%%"REG_d")                \n\t"
    
    //        "mov %3, %%"REG_c"                      \n\t"
    //        "mov %%"REG_c", test                    \n\t"
    //        "jmp 4f                                 \n\t"
            // 512(REG_d) and 516(REG_d) are past[128] and past[129],
            // i.e. the copies of maxNoise[1] and maxNoise[2] stored above
            "cmpl 512(%%"REG_d"), %%ecx             \n\t"
            " jb 2f                                 \n\t"
            "cmpl 516(%%"REG_d"), %%ecx             \n\t"
            " jb 1f                                 \n\t"
    
            // ecx is not below past[129]: copy the src block into tempBlurred.
            // REG_d and ecx were overwritten above, so 5*stride and 7*stride are rebuilt.
            "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
            "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
            "movq (%0), %%mm0                       \n\t" // L0
            "movq (%0, %2), %%mm1                   \n\t" // L1
            "movq (%0, %2, 2), %%mm2                \n\t" // L2
            "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
            "movq (%0, %2, 4), %%mm4                \n\t" // L4
            "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
            "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
            "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
            "movq %%mm0, (%1)                       \n\t" // L0
            "movq %%mm1, (%1, %2)                   \n\t" // L1
            "movq %%mm2, (%1, %2, 2)                \n\t" // L2
            "movq %%mm3, (%1, %%"REG_a")            \n\t" // L3
            "movq %%mm4, (%1, %2, 4)                \n\t" // L4
            "movq %%mm5, (%1, %%"REG_d")            \n\t" // L5
            "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // L6
            "movq %%mm7, (%1, %%"REG_c")            \n\t" // L7
            "jmp 4f                                 \n\t"
    
            // label 1: past[128] <= ecx < past[129]; one PAVGB of each src row with the
            // matching tempBlurred row, result written to both blocks
            // (PAVGB is defined elsewhere; presumably a byte-wise average - confirm)
            "1:                                     \n\t"
            "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
            "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
            "movq (%0), %%mm0                       \n\t" // L0
            PAVGB((%1), %%mm0)                            // L0
            "movq (%0, %2), %%mm1                   \n\t" // L1
            PAVGB((%1, %2), %%mm1)                        // L1
            "movq (%0, %2, 2), %%mm2                \n\t" // L2
            PAVGB((%1, %2, 2), %%mm2)                     // L2
            "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
            PAVGB((%1, %%REGa), %%mm3)                    // L3
            "movq (%0, %2, 4), %%mm4                \n\t" // L4
            PAVGB((%1, %2, 4), %%mm4)                     // L4
            "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
            PAVGB((%1, %%REGd), %%mm5)                    // L5
            "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
            PAVGB((%1, %%REGa, 2), %%mm6)                 // L6
            "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
            PAVGB((%1, %%REGc), %%mm7)                    // L7
            "movq %%mm0, (%1)                       \n\t" // R0
            "movq %%mm1, (%1, %2)                   \n\t" // R1
            "movq %%mm2, (%1, %2, 2)                \n\t" // R2
            "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
            "movq %%mm4, (%1, %2, 4)                \n\t" // R4
            "movq %%mm5, (%1, %%"REG_d")            \n\t" // R5
            "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // R6
            "movq %%mm7, (%1, %%"REG_c")            \n\t" // R7
            "movq %%mm0, (%0)                       \n\t" // L0
            "movq %%mm1, (%0, %2)                   \n\t" // L1
            "movq %%mm2, (%0, %2, 2)                \n\t" // L2
            "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
            "movq %%mm4, (%0, %2, 4)                \n\t" // L4
            "movq %%mm5, (%0, %%"REG_d")            \n\t" // L5
            "movq %%mm6, (%0, %%"REG_a", 2)         \n\t" // L6
            "movq %%mm7, (%0, %%"REG_c")            \n\t" // L7
            "jmp 4f                                 \n\t"
    
            // label 2: ecx < past[128]; 508(REG_d) is past[127], the copy of maxNoise[0].
            // If ecx is not below it, PAVGB with the tempBlurred row is applied twice
            // per row (rows 0..3 first, then rows 4..7) and written to both blocks.
            "2:                                     \n\t"
            "cmpl 508(%%"REG_d"), %%ecx             \n\t"
            " jb 3f                                 \n\t"
    
            "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
            "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
            "movq (%0), %%mm0                       \n\t" // L0
            "movq (%0, %2), %%mm1                   \n\t" // L1
            "movq (%0, %2, 2), %%mm2                \n\t" // L2
            "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
            "movq (%1), %%mm4                       \n\t" // R0
            "movq (%1, %2), %%mm5                   \n\t" // R1
            "movq (%1, %2, 2), %%mm6                \n\t" // R2
            "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            "movq %%mm0, (%1)                       \n\t" // R0
            "movq %%mm1, (%1, %2)                   \n\t" // R1
            "movq %%mm2, (%1, %2, 2)                \n\t" // R2
            "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
            "movq %%mm0, (%0)                       \n\t" // L0
            "movq %%mm1, (%0, %2)                   \n\t" // L1
            "movq %%mm2, (%0, %2, 2)                \n\t" // L2
            "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
    
            "movq (%0, %2, 4), %%mm0                \n\t" // L4
            "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
            "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
            "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
            "movq (%1, %2, 4), %%mm4                \n\t" // R4
            "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
            "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
            "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            "movq %%mm0, (%1, %2, 4)                \n\t" // R4
            "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
            "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
            "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
            "movq %%mm0, (%0, %2, 4)                \n\t" // L4
            "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
            "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
            "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
            "jmp 4f                                 \n\t"
    
            // label 3: ecx < past[127]; same as the previous case but with PAVGB
            // applied three times per row
            "3:                                     \n\t"
            "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
            "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
            "movq (%0), %%mm0                       \n\t" // L0
            "movq (%0, %2), %%mm1                   \n\t" // L1
            "movq (%0, %2, 2), %%mm2                \n\t" // L2
            "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
            "movq (%1), %%mm4                       \n\t" // R0
            "movq (%1, %2), %%mm5                   \n\t" // R1
            "movq (%1, %2, 2), %%mm6                \n\t" // R2
            "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            "movq %%mm0, (%1)                       \n\t" // R0
            "movq %%mm1, (%1, %2)                   \n\t" // R1
            "movq %%mm2, (%1, %2, 2)                \n\t" // R2
            "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
            "movq %%mm0, (%0)                       \n\t" // L0
            "movq %%mm1, (%0, %2)                   \n\t" // L1
            "movq %%mm2, (%0, %2, 2)                \n\t" // L2
            "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
    
            "movq (%0, %2, 4), %%mm0                \n\t" // L4
            "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
            "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
            "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
            "movq (%1, %2, 4), %%mm4                \n\t" // R4
            "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
            "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
            "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            PAVGB(%%mm4, %%mm0)
            PAVGB(%%mm5, %%mm1)
            PAVGB(%%mm6, %%mm2)
            PAVGB(%%mm7, %%mm3)
            "movq %%mm0, (%1, %2, 4)                \n\t" // R4
            "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
            "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
            "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
            "movq %%mm0, (%0, %2, 4)                \n\t" // L4
            "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
            "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
            "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
    
            // label 4: common exit of all four cases
            "4:                                     \n\t"
    
    
            :: "r" (src), "r" (tempBlurred), "r"((long)stride), "m" (tempBlurredPast)
    
            : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
        );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        // plain C path: d accumulates the squared difference of every pixel pair
        for(y=0; y<8; y++){
            int x;
            for(x=0; x<8; x++){
    
                int ref= tempBlurred[ x + y*stride ];
    
                int cur= src[ x + y*stride ];
                int d1=ref - cur;
    //            if(x==0 || x==7) d1+= d1>>1;
    //            if(y==0 || y==7) d1+= d1>>1;
    //            d+= FFABS(d1);
                d+= d1*d1;
    //            sysd+= d1;
    
            +(*(tempBlurredPast-256))
            +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
            +(*(tempBlurredPast+256))
    
        *tempBlurredPast=i;
    //    ((*tempBlurredPast)*3 + d + 2)>>2;
    
    /*
    Switch between
     1  0  0  0  0  0  0  (0)
    64 32 16  8  4  2  1  (1)
    64 48 36 27 20 15 11 (33) (approx)
    64 56 49 43 37 33 29 (200) (approx)
    */
    
        if(d > maxNoise[1]){
            if(d < maxNoise[2]){
                // blend reference and current pixel 1:1
                for(y=0; y<8; y++){
                    int x;
                    for(x=0; x<8; x++){
    
                        int ref= tempBlurred[ x + y*stride ];
    
                        tempBlurred[ x + y*stride ]=
    
                        src[ x + y*stride ]=
                            (ref + cur + 1)>>1;
    
                }
            }else{
                // no blending: the current block replaces the reference
                for(y=0; y<8; y++){
                    int x;
                    for(x=0; x<8; x++){
    
                        tempBlurred[ x + y*stride ]= src[ x + y*stride ];
    
        }else{
            if(d < maxNoise[0]){
                // blend reference and current pixel 7:1
                for(y=0; y<8; y++){
                    int x;
                    for(x=0; x<8; x++){
    
                        int ref= tempBlurred[ x + y*stride ];
    
                        tempBlurred[ x + y*stride ]=
    
                        src[ x + y*stride ]=
                            (ref*7 + cur + 4)>>3;
    
                }
            }else{
                // blend reference and current pixel 3:1
                for(y=0; y<8; y++){
                    int x;
                    for(x=0; x<8; x++){
    
                        int ref= tempBlurred[ x + y*stride ];
    
                        tempBlurred[ x + y*stride ]=
    
                        src[ x + y*stride ]=
                            (ref*3 + cur + 2)>>2;
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    #endif //HAVE_ALTIVEC
    
    #ifdef HAVE_MMX
    /**
     * accurate deblock filter
     */
    
    static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
    
        int64_t dc_mask, eq_mask, both_masks;
        int64_t sums[10*8*2];
        src+= step*3; // src points to begin of the 8x8 Block
    
        asm volatile(
            "movq %0, %%mm7                         \n\t"
            "movq %1, %%mm6                         \n\t"
            : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
            );
    
        asm volatile(
            "lea (%2, %3), %%"REG_a"                \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2
    
    
            "movq (%2), %%mm0                       \n\t"
            "movq (%%"REG_a"), %%mm1                \n\t"
            "movq %%mm1, %%mm3                      \n\t"
            "movq %%mm1, %%mm4                      \n\t"
            "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
            "paddb %%mm7, %%mm0                     \n\t"
            "pcmpgtb %%mm6, %%mm0                   \n\t"
    
            "movq (%%"REG_a",%3), %%mm2             \n\t"
            PMAXUB(%%mm2, %%mm4)
            PMINUB(%%mm2, %%mm3, %%mm5)
            "psubb %%mm2, %%mm1                     \n\t"
            "paddb %%mm7, %%mm1                     \n\t"
            "pcmpgtb %%mm6, %%mm1                   \n\t"
            "paddb %%mm1, %%mm0                     \n\t"
    
            "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
            PMAXUB(%%mm1, %%mm4)
            PMINUB(%%mm1, %%mm3, %%mm5)
            "psubb %%mm1, %%mm2                     \n\t"
            "paddb %%mm7, %%mm2                     \n\t"
            "pcmpgtb %%mm6, %%mm2                   \n\t"
            "paddb %%mm2, %%mm0                     \n\t"
    
            "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
    
            "movq (%2, %3, 4), %%mm2                \n\t"
            PMAXUB(%%mm2, %%mm4)
            PMINUB(%%mm2, %%mm3, %%mm5)
            "psubb %%mm2, %%mm1                     \n\t"
            "paddb %%mm7, %%mm1                     \n\t"
            "pcmpgtb %%mm6, %%mm1                   \n\t"
            "paddb %%mm1, %%mm0                     \n\t"
    
            "movq (%%"REG_a"), %%mm1                \n\t"
            PMAXUB(%%mm1, %%mm4)
            PMINUB(%%mm1, %%mm3, %%mm5)
            "psubb %%mm1, %%mm2                     \n\t"
            "paddb %%mm7, %%mm2                     \n\t"
            "pcmpgtb %%mm6, %%mm2                   \n\t"
            "paddb %%mm2, %%mm0                     \n\t"
    
            "movq (%%"REG_a", %3), %%mm2            \n\t"
            PMAXUB(%%mm2, %%mm4)
            PMINUB(%%mm2, %%mm3, %%mm5)
            "psubb %%mm2, %%mm1                     \n\t"
            "paddb %%mm7, %%mm1                     \n\t"
            "pcmpgtb %%mm6, %%mm1                   \n\t"
            "paddb %%mm1, %%mm0                     \n\t"
    
            "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
            PMAXUB(%%mm1, %%mm4)
            PMINUB(%%mm1, %%mm3, %%mm5)
            "psubb %%mm1, %%mm2                     \n\t"
            "paddb %%mm7, %%mm2                     \n\t"
            "pcmpgtb %%mm6, %%mm2                   \n\t"
            "paddb %%mm2, %%mm0                     \n\t"
    
            "movq (%2, %3, 8), %%mm2                \n\t"
            PMAXUB(%%mm2, %%mm4)
            PMINUB(%%mm2, %%mm3, %%mm5)
            "psubb %%mm2, %%mm1                     \n\t"
            "paddb %%mm7, %%mm1                     \n\t"
            "pcmpgtb %%mm6, %%mm1                   \n\t"
            "paddb %%mm1, %%mm0                     \n\t"
    
            "movq (%%"REG_a", %3, 4), %%mm1         \n\t"
            "psubb %%mm1, %%mm2                     \n\t"
            "paddb %%mm7, %%mm2                     \n\t"
            "pcmpgtb %%mm6, %%mm2                   \n\t"
            "paddb %%mm2, %%mm0                     \n\t"
            "psubusb %%mm3, %%mm4                   \n\t"
    
            "pxor %%mm6, %%mm6                      \n\t"
            "movq %4, %%mm7                         \n\t" // QP,..., QP
            "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
            "psubusb %%mm4, %%mm7                   \n\t" // Diff >=2QP -> 0
            "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
            "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
            "movq %%mm7, %1                         \n\t"
    
            "movq %5, %%mm7                         \n\t"
            "punpcklbw %%mm7, %%mm7                 \n\t"
            "punpcklbw %%mm7, %%mm7                 \n\t"
            "punpcklbw %%mm7, %%mm7                 \n\t"
            "psubb %%mm0, %%mm6                     \n\t"
            "pcmpgtb %%mm7, %%mm6                   \n\t"
            "movq %%mm6, %0                         \n\t"
    
            : "=m" (eq_mask), "=m" (dc_mask)
            : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
            : "%"REG_a
        );
    
        both_masks = dc_mask & eq_mask;
    
        if(both_masks){
            long offset= -8*step;
            int64_t *temp_sums= sums;
    
            asm volatile(
                "movq %2, %%mm0                         \n\t"  // QP,..., QP
                "pxor %%mm4, %%mm4                      \n\t"
    
                "movq (%0), %%mm6                       \n\t"
                "movq (%0, %1), %%mm5                   \n\t"
                "movq %%mm5, %%mm1                      \n\t"
                "movq %%mm6, %%mm2                      \n\t"
                "psubusb %%mm6, %%mm5                   \n\t"
                "psubusb %%mm1, %%mm2                   \n\t"
                "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
                "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
                "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
    
                "pxor %%mm6, %%mm1                      \n\t"
                "pand %%mm0, %%mm1                      \n\t"
                "pxor %%mm1, %%mm6                      \n\t"
                // 0:QP  6:First
    
                "movq (%0, %1, 8), %%mm5                \n\t"
                "add %1, %0                             \n\t" // %0 points to line 1 not 0
                "movq (%0, %1, 8), %%mm7                \n\t"
                "movq %%mm5, %%mm1                      \n\t"
                "movq %%mm7, %%mm2                      \n\t"
                "psubusb %%mm7, %%mm5                   \n\t"
                "psubusb %%mm1, %%mm2                   \n\t"
                "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
                "movq %2, %%mm0                         \n\t"  // QP,..., QP
                "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
                "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
    
                "pxor %%mm7, %%mm1                      \n\t"
                "pand %%mm0, %%mm1                      \n\t"
                "pxor %%mm1, %%mm7                      \n\t"
    
                "movq %%mm6, %%mm5                      \n\t"
                "punpckhbw %%mm4, %%mm6                 \n\t"
                "punpcklbw %%mm4, %%mm5                 \n\t"
                // 4:0 5/6:First 7:Last
    
                "movq %%mm5, %%mm0                      \n\t"
                "movq %%mm6, %%mm1                      \n\t"
                "psllw $2, %%mm0                        \n\t"
                "psllw $2, %%mm1                        \n\t"
                "paddw "MANGLE(w04)", %%mm0             \n\t"
                "paddw "MANGLE(w04)", %%mm1             \n\t"
    
                "movq (%0), %%mm2                       \n\t"\
                "movq (%0), %%mm3                       \n\t"\
                "add %1, %0                             \n\t"\
                "punpcklbw %%mm4, %%mm2                 \n\t"\
                "punpckhbw %%mm4, %%mm3                 \n\t"\
                "paddw %%mm2, %%mm0                     \n\t"\
                "paddw %%mm3, %%mm1                     \n\t"
    
                "movq (%0), %%mm2                       \n\t"\
                "movq (%0), %%mm3                       \n\t"\
                "add %1, %0                             \n\t"\
                "punpcklbw %%mm4, %%mm2                 \n\t"\
                "punpckhbw %%mm4, %%mm3                 \n\t"\
                "psubw %%mm2, %%mm0                     \n\t"\
                "psubw %%mm3, %%mm1                     \n\t"
    
    
                NEXT //0
                NEXT //1
                NEXT //2
                "movq %%mm0, (%3)                       \n\t"
                "movq %%mm1, 8(%3)                      \n\t"
    
                NEXT //3
                "psubw %%mm5, %%mm0                     \n\t"
                "psubw %%mm6, %%mm1                     \n\t"
                "movq %%mm0, 16(%3)                     \n\t"
                "movq %%mm1, 24(%3)                     \n\t"
    
                NEXT //4
                "psubw %%mm5, %%mm0                     \n\t"
                "psubw %%mm6, %%mm1                     \n\t"
                "movq %%mm0, 32(%3)                     \n\t"
                "movq %%mm1, 40(%3)                     \n\t"
    
                NEXT //5
                "psubw %%mm5, %%mm0                     \n\t"
                "psubw %%mm6, %%mm1                     \n\t"
                "movq %%mm0, 48(%3)                     \n\t"
                "movq %%mm1, 56(%3)                     \n\t"
    
                NEXT //6
                "psubw %%mm5, %%mm0                     \n\t"
                "psubw %%mm6, %%mm1                     \n\t"
                "movq %%mm0, 64(%3)                     \n\t"
                "movq %%mm1, 72(%3)                     \n\t"
    
                "movq %%mm7, %%mm6                      \n\t"
                "punpckhbw %%mm4, %%mm7                 \n\t"
                "punpcklbw %%mm4, %%mm6                 \n\t"
    
                NEXT //7
                "mov %4, %0                             \n\t"
                "add %1, %0                             \n\t"
                PREV //0
                "movq %%mm0, 80(%3)                     \n\t"
                "movq %%mm1, 88(%3)                     \n\t"
    
                PREV //1
                "paddw %%mm6, %%mm0                     \n\t"
                "paddw %%mm7, %%mm1                     \n\t"
                "movq %%mm0, 96(%3)                     \n\t"
                "movq %%mm1, 104(%3)                    \n\t"
    
                PREV //2
                "paddw %%mm6, %%mm0                     \n\t"
                "paddw %%mm7, %%mm1                     \n\t"
                "movq %%mm0, 112(%3)                    \n\t"
                "movq %%mm1, 120(%3)                    \n\t"
    
                PREV //3
                "paddw %%mm6, %%mm0                     \n\t"
                "paddw %%mm7, %%mm1                     \n\t"
                "movq %%mm0, 128(%3)                    \n\t"
                "movq %%mm1, 136(%3)                    \n\t"
    
                PREV //4
                "paddw %%mm6, %%mm0                     \n\t"
                "paddw %%mm7, %%mm1                     \n\t"
                "movq %%mm0, 144(%3)                    \n\t"
                "movq %%mm1, 152(%3)                    \n\t"
    
                "mov %4, %0                             \n\t" //FIXME
    
                : "+&r"(src)
                : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
            );
    
            src+= step; // src points to begin of the 8x8 Block
    
            asm volatile(
                "movq %4, %%mm6                         \n\t"
                "pcmpeqb %%mm5, %%mm5                   \n\t"
                "pxor %%mm6, %%mm5                      \n\t"
                "pxor %%mm7, %%mm7                      \n\t"
    
                "1:                                     \n\t"
                "movq (%1), %%mm0                       \n\t"
                "movq 8(%1), %%mm1                      \n\t"
                "paddw 32(%1), %%mm0                    \n\t"
                "paddw 40(%1), %%mm1                    \n\t"
                "movq (%0, %3), %%mm2                   \n\t"
                "movq %%mm2, %%mm3                      \n\t"
                "movq %%mm2, %%mm4                      \n\t"
                "punpcklbw %%mm7, %%mm2                 \n\t"
                "punpckhbw %%mm7, %%mm3                 \n\t"
                "paddw %%mm2, %%mm0                     \n\t"
                "paddw %%mm3, %%mm1                     \n\t"
                "paddw %%mm2, %%mm0                     \n\t"
                "paddw %%mm3, %%mm1                     \n\t"
                "psrlw $4, %%mm0                        \n\t"
                "psrlw $4, %%mm1                        \n\t"
                "packuswb %%mm1, %%mm0                  \n\t"
                "pand %%mm6, %%mm0                      \n\t"
                "pand %%mm5, %%mm4                      \n\t"
                "por %%mm4, %%mm0                       \n\t"
                "movq %%mm0, (%0, %3)                   \n\t"
                "add $16, %1                            \n\t"
                "add %2, %0                             \n\t"
                " js 1b                                 \n\t"
    
                : "+r"(offset), "+r"(temp_sums)
                : "r" ((long)step), "r"(src - offset), "m"(both_masks)
            );
        }else
            src+= step; // src points to begin of the 8x8 Block
    
        if(eq_mask != -1LL){
            uint8_t *temp_src= src;
            asm volatile(
                "pxor %%mm7, %%mm7                      \n\t"
                "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
                "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
    
    
                "movq (%0), %%mm0                       \n\t"
                "movq %%mm0, %%mm1                      \n\t"
                "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
                "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
    
                "movq (%0, %1), %%mm2                   \n\t"
                "lea (%0, %1, 2), %%"REG_a"             \n\t"
                "movq %%mm2, %%mm3                      \n\t"
                "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
                "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
    
                "movq (%%"REG_a"), %%mm4                \n\t"
                "movq %%mm4, %%mm5                      \n\t"
                "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
                "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
    
                "paddw %%mm0, %%mm0                     \n\t" // 2L0
                "paddw %%mm1, %%mm1                     \n\t" // 2H0
                "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
                "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
    
                "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
                "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
    
                "movq (%%"REG_a", %1), %%mm2            \n\t"
                "movq %%mm2, %%mm3                      \n\t"
                "punpcklbw %%mm7, %%mm2                 \n\t" // L3
                "punpckhbw %%mm7, %%mm3                 \n\t" // H3
    
                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
                "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
                "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
                "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
                "movq %%mm0, %%mm1                      \n\t"
                "punpcklbw %%mm7, %%mm0                 \n\t" // L4
                "punpckhbw %%mm7, %%mm1                 \n\t" // H4
    
                "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
                "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
                "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
                "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
                "paddw %%mm4, %%mm4                     \n\t" // 2L2
                "paddw %%mm5, %%mm5                     \n\t" // 2H2
                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
    
                "lea (%%"REG_a", %1), %0                \n\t"
                "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
                "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
    
                "movq (%0, %1, 2), %%mm2                \n\t"
                "movq %%mm2, %%mm3                      \n\t"
                "punpcklbw %%mm7, %%mm2                 \n\t" // L5
                "punpckhbw %%mm7, %%mm3                 \n\t" // H5
                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
                "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
                "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
    
                "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
                "punpcklbw %%mm7, %%mm6                 \n\t" // L6
                "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
                "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
                "punpckhbw %%mm7, %%mm6                 \n\t" // H6
                "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
    
                "paddw %%mm0, %%mm0                     \n\t" // 2L4
                "paddw %%mm1, %%mm1                     \n\t" // 2H4
                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
    
                "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
                "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
    
                "movq (%0, %1, 4), %%mm2                \n\t"
                "movq %%mm2, %%mm3                      \n\t"
                "punpcklbw %%mm7, %%mm2                 \n\t" // L7
                "punpckhbw %%mm7, %%mm3                 \n\t" // H7
    
                "paddw %%mm2, %%mm2                     \n\t" // 2L7
                "paddw %%mm3, %%mm3                     \n\t" // 2H7
                "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
                "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
    
                "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
                "movq %%mm7, %%mm6                      \n\t" // 0
                "psubw %%mm0, %%mm6                     \n\t"
                "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                "movq %%mm7, %%mm6                      \n\t" // 0
                "psubw %%mm1, %%mm6                     \n\t"
                "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                "movq %%mm7, %%mm6                      \n\t" // 0
                "psubw %%mm2, %%mm6                     \n\t"
                "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                "movq %%mm7, %%mm6                      \n\t" // 0
                "psubw %%mm3, %%mm6                     \n\t"
                "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
    
                "movq %%mm7, %%mm6                      \n\t" // 0
                "pcmpgtw %%mm0, %%mm6                   \n\t"
                "pxor %%mm6, %%mm0                      \n\t"
                "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                "movq %%mm7, %%mm6                      \n\t" // 0
                "pcmpgtw %%mm1, %%mm6                   \n\t"
                "pxor %%mm6, %%mm1                      \n\t"
                "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                "movq %%mm7, %%mm6                      \n\t" // 0
                "pcmpgtw %%mm2, %%mm6                   \n\t"
                "pxor %%mm6, %%mm2                      \n\t"
                "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                "movq %%mm7, %%mm6                      \n\t" // 0
                "pcmpgtw %%mm3, %%mm6                   \n\t"