Skip to content
Snippets Groups Projects
postprocess_template.c 186 KiB
Newer Older
  • Learn to ignore specific revisions
  •                 "movq (%0, %1, 4), %%mm2                \n\t"
                    "movq %%mm2, %%mm3                      \n\t"
                    "punpcklbw %%mm7, %%mm2                 \n\t" // L7
                    "punpckhbw %%mm7, %%mm3                 \n\t" // H7
    
                    "paddw %%mm2, %%mm2                     \n\t" // 2L7
                    "paddw %%mm3, %%mm3                     \n\t" // 2H7
                    "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
                    "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
    
                    "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
                    "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    #ifdef HAVE_MMX2
    
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "psubw %%mm0, %%mm6                     \n\t"
                    "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "psubw %%mm1, %%mm6                     \n\t"
                    "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "psubw %%mm2, %%mm6                     \n\t"
                    "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "psubw %%mm3, %%mm6                     \n\t"
                    "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else
    
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "pcmpgtw %%mm0, %%mm6                   \n\t"
                    "pxor %%mm6, %%mm0                      \n\t"
                    "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "pcmpgtw %%mm1, %%mm6                   \n\t"
                    "pxor %%mm6, %%mm1                      \n\t"
                    "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "pcmpgtw %%mm2, %%mm6                   \n\t"
                    "pxor %%mm6, %%mm2                      \n\t"
                    "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "pcmpgtw %%mm3, %%mm6                   \n\t"
                    "pxor %%mm6, %%mm3                      \n\t"
                    "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
                    "pminsw %%mm2, %%mm0                    \n\t"
                    "pminsw %%mm3, %%mm1                    \n\t"
    
                    "movq %%mm0, %%mm6                      \n\t"
                    "psubusw %%mm2, %%mm6                   \n\t"
                    "psubw %%mm6, %%mm0                     \n\t"
                    "movq %%mm1, %%mm6                      \n\t"
                    "psubusw %%mm3, %%mm6                   \n\t"
                    "psubw %%mm6, %%mm1                     \n\t"
    
                    "movd %2, %%mm2                         \n\t" // QP
                    "punpcklbw %%mm7, %%mm2                 \n\t"
    
                    "movq %%mm7, %%mm6                      \n\t" // 0
                    "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
                    "pxor %%mm6, %%mm4                      \n\t"
                    "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
                    "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
                    "pxor %%mm7, %%mm5                      \n\t"
                    "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
    
                    "psllw $3, %%mm2                        \n\t" // 8QP
                    "movq %%mm2, %%mm3                      \n\t" // 8QP
                    "pcmpgtw %%mm4, %%mm2                   \n\t"
                    "pcmpgtw %%mm5, %%mm3                   \n\t"
                    "pand %%mm2, %%mm4                      \n\t"
                    "pand %%mm3, %%mm5                      \n\t"
    
    
                    "psubusw %%mm0, %%mm4                   \n\t" // hd
                    "psubusw %%mm1, %%mm5                   \n\t" // ld
    
    
                    "movq "MANGLE(w05)", %%mm2              \n\t" // 5
                    "pmullw %%mm2, %%mm4                    \n\t"
                    "pmullw %%mm2, %%mm5                    \n\t"
                    "movq "MANGLE(w20)", %%mm2              \n\t" // 32
                    "paddw %%mm2, %%mm4                     \n\t"
                    "paddw %%mm2, %%mm5                     \n\t"
                    "psrlw $6, %%mm4                        \n\t"
                    "psrlw $6, %%mm5                        \n\t"
    
                    "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
                    "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
    
                    "pxor %%mm2, %%mm2                      \n\t"
                    "pxor %%mm3, %%mm3                      \n\t"
    
                    "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
                    "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
                    "pxor %%mm2, %%mm0                      \n\t"
                    "pxor %%mm3, %%mm1                      \n\t"
                    "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
                    "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
                    "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
                    "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
    
                    "pxor %%mm6, %%mm2                      \n\t"
                    "pxor %%mm7, %%mm3                      \n\t"
                    "pand %%mm2, %%mm4                      \n\t"
                    "pand %%mm3, %%mm5                      \n\t"
    
                    "pminsw %%mm0, %%mm4                    \n\t"
                    "pminsw %%mm1, %%mm5                    \n\t"
    
                    "movq %%mm4, %%mm2                      \n\t"
                    "psubusw %%mm0, %%mm2                   \n\t"
                    "psubw %%mm2, %%mm4                     \n\t"
                    "movq %%mm5, %%mm2                      \n\t"
                    "psubusw %%mm1, %%mm2                   \n\t"
                    "psubw %%mm2, %%mm5                     \n\t"
    
                    "pxor %%mm6, %%mm4                      \n\t"
                    "pxor %%mm7, %%mm5                      \n\t"
                    "psubw %%mm6, %%mm4                     \n\t"
                    "psubw %%mm7, %%mm5                     \n\t"
                    "packsswb %%mm5, %%mm4                  \n\t"
                    "movq (%0), %%mm0                       \n\t"
                    "paddb   %%mm4, %%mm0                   \n\t"
                    "movq %%mm0, (%0)                       \n\t"
                    "movq (%0, %1), %%mm0                   \n\t"
                    "psubb %%mm4, %%mm0                     \n\t"
                    "movq %%mm0, (%0, %1)                   \n\t"
    
                    : "+r" (src)
                    : "r" ((long)stride), "m" (c->pQPb)
                    : "%"REG_a, "%"REG_c
            );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            const int l1= stride;
            const int l2= stride + l1;
            const int l3= stride + l2;
            const int l4= stride + l3;
            const int l5= stride + l4;
            const int l6= stride + l5;
            const int l7= stride + l6;
            const int l8= stride + l7;
    //        const int l9= stride + l8;
            int x;
            src+= stride*3;
            for(x=0; x<BLOCK_SIZE; x++)
            {
                    const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
    
                    if(FFABS(middleEnergy) < 8*c->QP)
    
                    {
                            const int q=(src[l4] - src[l5])/2;
                            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
    
    
                            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
    
    
                            if(q>0)
                            {
                                    d= d<0 ? 0 : d;
                                    d= d>q ? q : d;
                            }
                            else
                            {
                                    d= d>0 ? 0 : d;
                                    d= d<q ? q : d;
                            }
    
                            src[l4]-= d;
                            src[l5]+= d;
                    }
                    src++;
            }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    // Deringing filter for one block.
    //   src    - pointer to the block's pixels; rows are `stride` bytes apart
    //   stride - distance in bytes between two vertically adjacent pixels
    //   c      - context; this function reads c->QP / c->pQPb and (asm path) writes c->pQPb2
    // Outline (taken from the C fallback below):
    //   1. find min and max over rows 1..8, columns 1..8; do nothing if max-min < deringThreshold
    //   2. for 10 rows x 10 columns, mark pixels as above / not above avg=(min+max+1)>>1
    //   3. filter (3x3 kernel, weights 1 2 1 / 2 4 2 / 1 2 1) only pixels whose whole 3x3
    //      neighbourhood lies on the same side of avg
    //   4. limit the change of each pixel to +-(QP/2 + 1)
    // NOTE(review): in this view several preprocessor lines (macro "#define" heads, some
    // #if/#else lines) and the function's braces are not visible — confirm against the full file.
    static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            asm volatile(
                    "pxor %%mm6, %%mm6                      \n\t" // mm6 = 0 (running per-byte max)
                    "pcmpeqb %%mm7, %%mm7                   \n\t" // mm7 = all ones (running per-byte min, also -1 per word)
                    "movq %2, %%mm0                         \n\t" // mm0 = c->pQPb
                    "punpcklbw %%mm6, %%mm0                 \n\t" // widen the low 4 bytes to words
                    "psrlw $1, %%mm0                        \n\t" // /2
                    "psubw %%mm7, %%mm0                     \n\t" // - (-1), i.e. +1
                    "packuswb %%mm0, %%mm0                  \n\t" // back to bytes
                    "movq %%mm0, %3                         \n\t" // c->pQPb2 = pQPb/2 + 1; NOTE(review): %3 is declared as an input operand but is written here — confirm constraint
    
                    "lea (%0, %1), %%"REG_a"                \n\t" // REG_a = src + stride
                    "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t" // REG_d = src + 5*stride
    
    //        0        1        2        3        4        5        6        7        8        9
    //        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
    
    #undef FIND_MIN_MAX
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX2
    
                    "movq " #addr ", %%mm0                  \n\t"\
                    "pminub %%mm0, %%mm7                    \n\t"\
                    "pmaxub %%mm0, %%mm6                    \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else
    
                    "movq " #addr ", %%mm0                  \n\t"\
                    "movq %%mm7, %%mm1                      \n\t"\
                    "psubusb %%mm0, %%mm6                   \n\t"\
                    "paddb %%mm0, %%mm6                     \n\t"\
                    "psubusb %%mm0, %%mm1                   \n\t"\
                    "psubb %%mm1, %%mm7                     \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
    #define FIND_MIN_MAX(addr)  REAL_FIND_MIN_MAX(addr)
    
    // accumulate the per-column min (mm7) and max (mm6) over lines 1..8
    FIND_MIN_MAX((%%REGa))
    FIND_MIN_MAX((%%REGa, %1))
    FIND_MIN_MAX((%%REGa, %1, 2))
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    FIND_MIN_MAX((%0, %1, 4))
    
    FIND_MIN_MAX((%%REGd))
    FIND_MIN_MAX((%%REGd, %1))
    FIND_MIN_MAX((%%REGd, %1, 2))
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    FIND_MIN_MAX((%0, %1, 8))
    
                    // horizontal reduction of the 8 per-column minima into the low byte of mm7
                    "movq %%mm7, %%mm4                      \n\t"
                    "psrlq $8, %%mm7                        \n\t"
    
                    "pminub %%mm4, %%mm7                    \n\t" // min of pixels
                    "pshufw $0xF9, %%mm7, %%mm4             \n\t"
                    "pminub %%mm4, %%mm7                    \n\t" // min of pixels
                    "pshufw $0xFE, %%mm7, %%mm4             \n\t"
                    "pminub %%mm4, %%mm7                    \n\t"
    
                    // same reduction written with psubusb/psubb instead of pminub
                    "movq %%mm7, %%mm1                      \n\t"
                    "psubusb %%mm4, %%mm1                   \n\t"
                    "psubb %%mm1, %%mm7                     \n\t"
                    "movq %%mm7, %%mm4                      \n\t"
                    "psrlq $16, %%mm7                       \n\t"
                    "movq %%mm7, %%mm1                      \n\t"
                    "psubusb %%mm4, %%mm1                   \n\t"
                    "psubb %%mm1, %%mm7                     \n\t"
                    "movq %%mm7, %%mm4                      \n\t"
                    "psrlq $32, %%mm7                       \n\t"
                    "movq %%mm7, %%mm1                      \n\t"
                    "psubusb %%mm4, %%mm1                   \n\t"
                    "psubb %%mm1, %%mm7                     \n\t"
    
                    // horizontal reduction of the 8 per-column maxima into the low byte of mm6
                    "movq %%mm6, %%mm4                      \n\t"
                    "psrlq $8, %%mm6                        \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX2
    
                    "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
                    "pshufw $0xF9, %%mm6, %%mm4             \n\t"
                    "pmaxub %%mm4, %%mm6                    \n\t"
                    "pshufw $0xFE, %%mm6, %%mm4             \n\t"
                    "pmaxub %%mm4, %%mm6                    \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else
    
                    "psubusb %%mm4, %%mm6                   \n\t"
                    "paddb %%mm4, %%mm6                     \n\t"
                    "movq %%mm6, %%mm4                      \n\t"
                    "psrlq $16, %%mm6                       \n\t"
                    "psubusb %%mm4, %%mm6                   \n\t"
                    "paddb %%mm4, %%mm6                     \n\t"
                    "movq %%mm6, %%mm4                      \n\t"
                    "psrlq $32, %%mm6                       \n\t"
                    "psubusb %%mm4, %%mm6                   \n\t"
                    "paddb %%mm4, %%mm6                     \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
                    "movq %%mm6, %%mm0                      \n\t" // max
                    "psubb %%mm7, %%mm6                     \n\t" // max - min
                    "movd %%mm6, %%ecx                      \n\t"
                    "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
                    " jb 1f                                 \n\t" // range below threshold: leave the block untouched
                    // REG_c = scratch area below the stack pointer, aligned with ALIGN_MASK;
                    // (REG_c) holds the replicated average, 8(REG_c) is used by DERING_CORE.
                    // NOTE(review): this memory is below SP and is not described to the compiler — confirm it is safe on all targets
                    "lea -24(%%"REG_SP"), %%"REG_c"         \n\t"
                    "and "ALIGN_MASK", %%"REG_c"            \n\t"
                    PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
                    "punpcklbw %%mm7, %%mm7                 \n\t" // replicate the low byte ...
                    "punpcklbw %%mm7, %%mm7                 \n\t"
                    "punpcklbw %%mm7, %%mm7                 \n\t" // ... into all 8 bytes
                    "movq %%mm7, (%%"REG_c")                \n\t"
    
                    "movq (%0), %%mm0                       \n\t" // L10
                    "movq %%mm0, %%mm1                      \n\t" // L10
                    "movq %%mm0, %%mm2                      \n\t" // L10
                    "psllq $8, %%mm1                        \n\t"
                    "psrlq $8, %%mm2                        \n\t"
                    "movd -4(%0), %%mm3                     \n\t"
                    "movd 8(%0), %%mm4                      \n\t"
                    "psrlq $24, %%mm3                       \n\t"
                    "psllq $56, %%mm4                       \n\t"
                    "por %%mm3, %%mm1                       \n\t" // L00
                    "por %%mm4, %%mm2                       \n\t" // L20
                    "movq %%mm1, %%mm3                      \n\t" // L00
                    PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
                    PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
                    "psubusb %%mm7, %%mm0                   \n\t"
                    "psubusb %%mm7, %%mm2                   \n\t"
                    "psubusb %%mm7, %%mm3                   \n\t"
                    "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
                    "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
                    "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
                    "paddb %%mm2, %%mm0                     \n\t"
                    "paddb %%mm3, %%mm0                     \n\t"
    
                    "movq (%%"REG_a"), %%mm2                \n\t" // L11
                    "movq %%mm2, %%mm3                      \n\t" // L11
                    "movq %%mm2, %%mm4                      \n\t" // L11
                    "psllq $8, %%mm3                        \n\t"
                    "psrlq $8, %%mm4                        \n\t"
                    "movd -4(%%"REG_a"), %%mm5              \n\t"
                    "movd 8(%%"REG_a"), %%mm6               \n\t"
                    "psrlq $24, %%mm5                       \n\t"
                    "psllq $56, %%mm6                       \n\t"
                    "por %%mm5, %%mm3                       \n\t" // L01
                    "por %%mm6, %%mm4                       \n\t" // L21
                    "movq %%mm3, %%mm5                      \n\t" // L01
                    PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
                    PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
                    "psubusb %%mm7, %%mm2                   \n\t"
                    "psubusb %%mm7, %%mm4                   \n\t"
                    "psubusb %%mm7, %%mm5                   \n\t"
                    "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
                    "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
                    "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
                    "paddb %%mm4, %%mm2                     \n\t"
                    "paddb %%mm5, %%mm2                     \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    // 0, 2, 3, 1
    
    #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
    
                    "movq " #src ", " #sx "                 \n\t" /* src[0] */\
                    "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
                    "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
                    "psllq $8, " #lx "                      \n\t"\
                    "psrlq $8, " #t0 "                      \n\t"\
                    "movd -4" #src ", " #t1 "               \n\t"\
                    "psrlq $24, " #t1 "                     \n\t"\
                    "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
                    "movd 8" #src ", " #t1 "                \n\t"\
                    "psllq $56, " #t1 "                     \n\t"\
                    "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
                    "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
                    PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
                    PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
                    PAVGB(lx, pplx)                                     \
                    "movq " #lx ", 8(%%"REG_c")             \n\t"\
                    "movq (%%"REG_c"), " #lx "              \n\t"\
                    "psubusb " #lx ", " #t1 "               \n\t"\
                    "psubusb " #lx ", " #t0 "               \n\t"\
                    "psubusb " #lx ", " #sx "               \n\t"\
                    "movq "MANGLE(b00)", " #lx "            \n\t"\
                    "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
                    "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
                    "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
                    "paddb " #t1 ", " #t0 "                 \n\t"\
                    "paddb " #t0 ", " #sx "                 \n\t"\
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    \
    
                    PAVGB(plx, pplx)                              /* filtered */\
                    "movq " #dst ", " #t0 "                 \n\t" /* dst */\
                    "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
                    "psubusb %3, " #t0 "                    \n\t"\
                    "paddusb %3, " #t1 "                    \n\t"\
                    PMAXUB(t0, pplx)\
                    PMINUB(t1, pplx, t0)\
                    "paddb " #sx ", " #ppsx "               \n\t"\
                    "paddb " #psx ", " #ppsx "              \n\t"\
                    "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
                    "pand "MANGLE(b08)", " #ppsx "          \n\t"\
                    "pcmpeqb " #lx ", " #ppsx "             \n\t"\
                    "pand " #ppsx ", " #pplx "              \n\t"\
                    "pandn " #dst ", " #ppsx "              \n\t"\
                    "por " #pplx ", " #ppsx "               \n\t"\
                    "movq " #ppsx ", " #dst "               \n\t"\
                    "movq 8(%%"REG_c"), " #lx "             \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    
    #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
       REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /*
    0000000
    1111111
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    1111110
    1111101
    1111100
    1111011
    1111010
    1111001
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    1111000
    1110111
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    */
    
    // One DERING_CORE per output line (lines 1..8); the register roles rotate with a period
    // of three invocations so that the sums kept for the previous two lines are reused.
    //DERING_CORE(dst          ,src            ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
    DERING_CORE((%%REGa)       ,(%%REGa, %1)   ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
    DERING_CORE((%%REGa, %1)   ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
    DERING_CORE((%%REGa, %1, 2),(%0, %1, 4)    ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
    DERING_CORE((%0, %1, 4)    ,(%%REGd)       ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
    DERING_CORE((%%REGd)       ,(%%REGd, %1)   ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
    DERING_CORE((%%REGd, %1)   ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
    DERING_CORE((%%REGd, %1, 2),(%0, %1, 8)    ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
    DERING_CORE((%0, %1, 8)    ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
    
                    "1:                        \n\t" // target of the early exit taken when max-min < deringThreshold
                    : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
                    : "%"REG_a, "%"REG_d, "%"REG_c
            );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            int y;
            int min=255;
            int max=0;
            int avg;
            uint8_t *p;
            int s[10]; // one bit mask per examined row (rows 0..9)
            const int QP2= c->QP/2 + 1; // largest change applied to a single pixel
    
            // pass 1: min / max over rows 1..8, columns 1..8 (p is advanced before it is read)
            for(y=1; y<9; y++)
            {
                    int x;
                    p= src + stride*y;
                    for(x=1; x<9; x++)
                    {
                            p++;
                            if(*p > max) max= *p;
                            if(*p < min) min= *p;
                    }
            }
            avg= (min + max + 1)>>1;
    
            // flat enough: nothing to dering
            if(max - min <deringThreshold) return;
    
            // pass 2: per row, bit x of t is set when pixel x (0..9) is above avg
            for(y=0; y<10; y++)
            {
                    int t = 0;
    
                    if(src[stride*y + 0] > avg) t+= 1;
                    if(src[stride*y + 1] > avg) t+= 2;
                    if(src[stride*y + 2] > avg) t+= 4;
                    if(src[stride*y + 3] > avg) t+= 8;
                    if(src[stride*y + 4] > avg) t+= 16;
                    if(src[stride*y + 5] > avg) t+= 32;
                    if(src[stride*y + 6] > avg) t+= 64;
                    if(src[stride*y + 7] > avg) t+= 128;
                    if(src[stride*y + 8] > avg) t+= 256;
                    if(src[stride*y + 9] > avg) t+= 512;
    
                    t |= (~t)<<16;       // upper half: the complementary ("not above avg") mask
                    t &= (t<<1) & (t>>1); // keep a bit only if both horizontal neighbours agree with it
                    s[y] = t;
            }
    
            // pass 3: also require agreement with the rows above and below;
            // the result for row y is stored in s[y-1], which later iterations no longer read
            for(y=1; y<9; y++)
            {
                    int t = s[y-1] & s[y] & s[y+1];
                    t|= t>>16; // merge "all above" and "all not above" into the low half
                    s[y-1]= t;
            }
    
            // pass 4: filter the selected pixels of rows 1..8, columns 1..8
            for(y=1; y<9; y++)
            {
                    int x;
                    int t = s[y-1];
    
                    p= src + stride*y;
                    for(x=1; x<9; x++)
                    {
                            p++;
                            if(t & (1<<x))
                            {
                                    // 3x3 kernel, weights 1 2 1 / 2 4 2 / 1 2 1 (sum 16)
                                    int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                                          +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                                          +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                                    f= (f + 8)>>4; // divide by 16 with rounding
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef DEBUG_DERING_THRESHOLD
    
                                    // debug-only statistics about pixels in low-range blocks
                                    asm volatile("emms\n\t":);
                                    {
                                    static long long numPixels=0;
                                    if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
    //                                if((max-min)<20 || (max-min)*QP<200)
    //                                if((max-min)*QP < 500)
    //                                if(max-min<QP/2)
                                    if(max-min < 20)
                                    {
                                            static int numSkiped=0;
                                            static int errorSum=0;
                                            static int worstQP=0;
                                            static int worstRange=0;
                                            static int worstDiff=0;
                                            int diff= (f - *p);
    
                                            int absDiff= FFABS(diff);
    
                                            int error= diff*diff;
    
                                            if(x==1 || x==8 || y==1 || y==8) continue;
    
                                            numSkiped++;
                                            if(absDiff > worstDiff)
                                            {
                                                    worstDiff= absDiff;
                                                    // NOTE(review): no variable named QP is declared in the visible part of this function (only c->QP and QP2) — confirm this compiles when DEBUG_DERING_THRESHOLD is defined
                                                    worstQP= QP;
                                                    worstRange= max-min;
                                            }
                                            errorSum+= error;
    
                                            if(1024LL*1024LL*1024LL % numSkiped == 0)
                                            {
    
                                                    av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
    
                                                            "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
                                                            (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
                                                            worstDiff, (float)numSkiped/numPixels);
                                            }
                                    }
                                    }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
                                    // move the pixel towards the filtered value by at most QP2
                                    if     (*p + QP2 < f) *p= *p + QP2;
                                    else if(*p - QP2 > f) *p= *p - QP2;
                                    else *p=f;
                            }
                    }
            }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef DEBUG_DERING_THRESHOLD
    
            // debug-only: walks the block again; the loop body has no effect besides advancing p
            if(max-min < 20)
            {
                    for(y=1; y<9; y++)
                    {
                            int x;
                            int t = 0;
                            p= src + stride*y;
                            for(x=1; x<9; x++)
                            {
                                    p++;
    
                            }
                    }
    //                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
            }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * Deinterlaces the given block by linearly interpolating every second line.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * will be called for every 8x8 block and can read & write from line 4-15
     * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
    
    // Replaces every second line of the block by the average of the lines directly above
    // and below it. After src is advanced by 4 lines, lines 1, 3, 5 and 7 (relative to the
    // advanced src) are written; lines 0, 2, 4, 6 and 8 are only read.
    //   src    - pointer to the block's pixels
    //   stride - distance in bytes between two vertically adjacent pixels
    // NOTE(review): no #else / #endif / closing brace is visible between the asm path and the
    // C path in this view — confirm against the full file.
    static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
    
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            src+= 4*stride;
            // PAVGB is defined elsewhere; presumably it leaves the byte-wise average of both
            // operands in its second operand — confirm
            asm volatile(
                    "lea (%0, %1), %%"REG_a"                \n\t" // REG_a = line 1
                    "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t" // REG_c = line 5
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
    
                    "movq (%0), %%mm0                       \n\t" // line 0
                    "movq (%%"REG_a", %1), %%mm1            \n\t" // line 2
                    PAVGB(%%mm1, %%mm0)
                    "movq %%mm0, (%%"REG_a")                \n\t" // line 1 = avg(line 0, line 2)
                    "movq (%0, %1, 4), %%mm0                \n\t" // line 4
                    PAVGB(%%mm0, %%mm1)
                    "movq %%mm1, (%%"REG_a", %1, 2)         \n\t" // line 3 = avg(line 2, line 4)
                    "movq (%%"REG_c", %1), %%mm1            \n\t" // line 6
                    PAVGB(%%mm1, %%mm0)
                    "movq %%mm0, (%%"REG_c")                \n\t" // line 5 = avg(line 4, line 6)
                    "movq (%0, %1, 8), %%mm0                \n\t" // line 8
                    PAVGB(%%mm0, %%mm1)
                    "movq %%mm1, (%%"REG_c", %1, 2)         \n\t" // line 7 = avg(line 6, line 8)
    
                    : : "r" (src), "r" ((long)stride)
                    : "%"REG_a, "%"REG_c
            );
    
            int a, b, x;
            src+= 4*stride;
    
            // two iterations of 4 bytes each cover the 8 columns of the block;
            // (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average of a and b, rounded up,
            // computed on 4 packed bytes at once (the mask keeps the shift from crossing bytes)
            for(x=0; x<2; x++){
                    a= *(uint32_t*)&src[stride*0];
                    b= *(uint32_t*)&src[stride*2];
                    *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
                    a= *(uint32_t*)&src[stride*4];
                    *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
                    b= *(uint32_t*)&src[stride*6];
                    *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
                    a= *(uint32_t*)&src[stride*8];
                    *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
                    src += 4; // next 4 columns
            }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * Deinterlaces the given block by cubic interpolating every second line.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * will be called for every 8x8 block and can read & write from line 4-15
     * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
     * this filter will read lines 3-15 and write 7-13
    
    static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
    
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            src+= stride*3;
            asm volatile(
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
                    "lea (%%"REG_d", %1, 4), %%"REG_c"      \n\t"
                    "add %1, %%"REG_c"                      \n\t"
                    "pxor %%mm7, %%mm7                      \n\t"
    //      0       1       2       3       4       5       6       7       8       9       10
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
    
                    "movq " #a ", %%mm0                     \n\t"\
                    "movq " #b ", %%mm1                     \n\t"\
                    "movq " #d ", %%mm2                     \n\t"\
                    "movq " #e ", %%mm3                     \n\t"\
                    PAVGB(%%mm2, %%mm1)                             /* (b+d) /2 */\
                    PAVGB(%%mm3, %%mm0)                             /* a(a+e) /2 */\
                    "movq %%mm0, %%mm2                      \n\t"\
                    "punpcklbw %%mm7, %%mm0                 \n\t"\
                    "punpckhbw %%mm7, %%mm2                 \n\t"\
                    "movq %%mm1, %%mm3                      \n\t"\
                    "punpcklbw %%mm7, %%mm1                 \n\t"\
                    "punpckhbw %%mm7, %%mm3                 \n\t"\
                    "psubw %%mm1, %%mm0                     \n\t"   /* L(a+e - (b+d))/2 */\
                    "psubw %%mm3, %%mm2                     \n\t"   /* H(a+e - (b+d))/2 */\
                    "psraw $3, %%mm0                        \n\t"   /* L(a+e - (b+d))/16 */\
                    "psraw $3, %%mm2                        \n\t"   /* H(a+e - (b+d))/16 */\
                    "psubw %%mm0, %%mm1                     \n\t"   /* L(9b + 9d - a - e)/16 */\
                    "psubw %%mm2, %%mm3                     \n\t"   /* H(9b + 9d - a - e)/16 */\
                    "packuswb %%mm3, %%mm1                  \n\t"\
                    "movq %%mm1, " #c "                     \n\t"
    
    #define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e)
    
    DEINT_CUBIC((%0)        , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
    DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%0, %1, 8))
    DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
    DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc)    , (%%REGc, %1, 2))
    
                    : : "r" (src), "r" ((long)stride)
                    : "%"REG_a, "%"REG_d, "%"REG_c
            );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            int x;
            src+= stride*3;
            for(x=0; x<8; x++)
            {
                    src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
                    src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
                    src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
                    src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
                    src++;
            }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
    
     * will be called for every 8x8 block and can read & write from line 4-15
     * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
     * this filter will read lines 4-13 and write 5-11
     */
    static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            src+= stride*4;
            asm volatile(
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
                    "pxor %%mm7, %%mm7                      \n\t"
                    "movq (%2), %%mm0                       \n\t"
    //      0       1       2       3       4       5       6       7       8       9       10
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
    
                    "movq " #a ", %%mm1                     \n\t"\
                    "movq " #b ", %%mm2                     \n\t"\
                    "movq " #c ", %%mm3                     \n\t"\
                    "movq " #d ", %%mm4                     \n\t"\
                    PAVGB(%%mm3, %%mm1)                          \
                    PAVGB(%%mm4, %%mm0)                          \
                    "movq %%mm0, %%mm3                      \n\t"\
                    "punpcklbw %%mm7, %%mm0                 \n\t"\
                    "punpckhbw %%mm7, %%mm3                 \n\t"\
                    "movq %%mm1, %%mm4                      \n\t"\
                    "punpcklbw %%mm7, %%mm1                 \n\t"\
                    "punpckhbw %%mm7, %%mm4                 \n\t"\
                    "psllw $2, %%mm1                        \n\t"\
                    "psllw $2, %%mm4                        \n\t"\
                    "psubw %%mm0, %%mm1                     \n\t"\
                    "psubw %%mm3, %%mm4                     \n\t"\
                    "movq %%mm2, %%mm5                      \n\t"\
                    "movq %%mm2, %%mm0                      \n\t"\
                    "punpcklbw %%mm7, %%mm2                 \n\t"\
                    "punpckhbw %%mm7, %%mm5                 \n\t"\
                    "paddw %%mm2, %%mm1                     \n\t"\
                    "paddw %%mm5, %%mm4                     \n\t"\
                    "psraw $2, %%mm1                        \n\t"\
                    "psraw $2, %%mm4                        \n\t"\
                    "packuswb %%mm4, %%mm1                  \n\t"\
                    "movq %%mm1, " #b "                     \n\t"\
    
    #define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d)
    
    
    DEINT_FF((%0)        , (%%REGa)       , (%%REGa, %1), (%%REGa, %1, 2))
    DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd)       )
    DEINT_FF((%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%%REGd, %1, 2))
    DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
    
                    "movq %%mm0, (%2)                       \n\t"
                    : : "r" (src), "r" ((long)stride), "r"(tmp)
                    : "%"REG_a, "%"REG_d
            );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            int x;
            src+= stride*4;
            for(x=0; x<8; x++)
            {
                    int t1= tmp[x];
                    int t2= src[stride*1];
    
                    src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
                    t1= src[stride*4];
                    src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
                    t2= src[stride*6];
                    src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
                    t1= src[stride*8];
                    src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
                    tmp[x]= t1;
    
                    src++;
            }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    /**
     * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
     * Will be called for every 8x8 block and can read & write from line 4-15.
     * Lines 0-3 have been passed through the deblock / dering filters already, but can be read too.
     * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
     * This filter will read lines 4-13 and write 4-11.
     *
     * @param src    top-left pixel of the block (8 pixels wide)
     * @param stride offset in bytes from one row to the next
     * @param tmp    8 bytes carried from call to call: the unfiltered row two
     *               above the first written row; overwritten with unfiltered row 6
     * @param tmp2   8 bytes carried from call to call: the unfiltered row one
     *               above the first written row; overwritten with unfiltered row 7
     */
    static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            src+= stride*4;
            asm volatile(
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
                    "pxor %%mm7, %%mm7                      \n\t" // mm7 = 0, used to unpack bytes to words
                    "movq (%2), %%mm0                       \n\t" // mm0 = saved row from tmp
                    "movq (%3), %%mm1                       \n\t" // mm1 = saved row from tmp2
    //      0       1       2       3       4       5       6       7       8       9       10
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx

    // One output row: a = (-t1 + 2*t2 + 6a + 2b - c)/8. t1/t2 are the MMX registers
    // holding the unfiltered rows two/one above a; t1 is reloaded with the
    // unfiltered a, so swapping t1 and t2 between invocations keeps both current.
    #define REAL_DEINT_L5(t1,t2,a,b,c)\
                    "movq " #a ", %%mm2                     \n\t"\
                    "movq " #b ", %%mm3                     \n\t"\
                    "movq " #c ", %%mm4                     \n\t"\
                    PAVGB(t2, %%mm3)                                /* (b+t2)/2 */\
                    PAVGB(t1, %%mm4)                                /* (c+t1)/2 */\
                    "movq %%mm2, %%mm5                      \n\t"\
                    "movq %%mm2, " #t1 "                    \n\t"   /* keep unfiltered a for later rows */\
                    "punpcklbw %%mm7, %%mm2                 \n\t"\
                    "punpckhbw %%mm7, %%mm5                 \n\t"\
                    "movq %%mm2, %%mm6                      \n\t"\
                    "paddw %%mm2, %%mm2                     \n\t"\
                    "paddw %%mm6, %%mm2                     \n\t"   /* L 3a */\
                    "movq %%mm5, %%mm6                      \n\t"\
                    "paddw %%mm5, %%mm5                     \n\t"\
                    "paddw %%mm6, %%mm5                     \n\t"   /* H 3a */\
                    "movq %%mm3, %%mm6                      \n\t"\
                    "punpcklbw %%mm7, %%mm3                 \n\t"\
                    "punpckhbw %%mm7, %%mm6                 \n\t"\
                    "paddw %%mm3, %%mm3                     \n\t"   /* L b+t2 */\
                    "paddw %%mm6, %%mm6                     \n\t"   /* H b+t2 */\
                    "paddw %%mm3, %%mm2                     \n\t"\
                    "paddw %%mm6, %%mm5                     \n\t"\
                    "movq %%mm4, %%mm6                      \n\t"\
                    "punpcklbw %%mm7, %%mm4                 \n\t"\
                    "punpckhbw %%mm7, %%mm6                 \n\t"\
                    "psubw %%mm4, %%mm2                     \n\t"   /* L 3a + b + t2 - (c+t1)/2 */\
                    "psubw %%mm6, %%mm5                     \n\t"   /* H 3a + b + t2 - (c+t1)/2 */\
                    "psraw $2, %%mm2                        \n\t"\
                    "psraw $2, %%mm5                        \n\t"\
                    "packuswb %%mm5, %%mm2                  \n\t"\
                    "movq %%mm2, " #a "                     \n\t"

    // The extra macro level lets REGa/REGd expand before they are stringified.
    #define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c)

    DEINT_L5(%%mm0, %%mm1, (%0)           , (%%REGa)       , (%%REGa, %1)   )
    DEINT_L5(%%mm1, %%mm0, (%%REGa)       , (%%REGa, %1)   , (%%REGa, %1, 2))
    DEINT_L5(%%mm0, %%mm1, (%%REGa, %1)   , (%%REGa, %1, 2), (%0, %1, 4)   )
    DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4)    , (%%REGd)       )

    DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)    , (%%REGd)       , (%%REGd, %1)   )

    DEINT_L5(%%mm1, %%mm0, (%%REGd)       , (%%REGd, %1)   , (%%REGd, %1, 2))
    DEINT_L5(%%mm0, %%mm1, (%%REGd, %1)   , (%%REGd, %1, 2), (%0, %1, 8)   )
    DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))

                    "movq %%mm0, (%2)                       \n\t" // tmp  = unfiltered row 6
                    "movq %%mm1, (%3)                       \n\t" // tmp2 = unfiltered row 7
                    : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
                    // "memory": the asm reads and writes pixel rows, tmp and tmp2, none of which are listed as operands
                    : "%"REG_a, "%"REG_d, "memory"
            );
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            int x;
            src+= stride*4;
            for(x=0; x<8; x++)
            {
                    // t1, t2, t3 rotate through the three most recent unfiltered rows,
                    // because every row is overwritten before the rows below it are filtered.
                    int t1= tmp[x];
                    int t2= tmp2[x];
                    int t3= src[0];

                    src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
                    t1= src[stride*1];
                    src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
                    t2= src[stride*2];
                    src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
                    t3= src[stride*3];
                    src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
                    t1= src[stride*4];
                    src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
                    t2= src[stride*5];
                    src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
                    t3= src[stride*6];
                    src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
                    t1= src[stride*7];
                    src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);

                    // Hand the unfiltered rows 6 and 7 to the next call.
                    tmp[x]= t3;
                    tmp2[x]= t1;

                    src++;
            }
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * will be called for every 8x8 block and can read & write from line 4-15
     * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
     * this filter will read lines 4-13 and write 4-11
    
    static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
    
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            src+= 4*stride;
            asm volatile(
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
    
                    "movq (%2), %%mm0                       \n\t" // L0
                    "movq (%%"REG_a"), %%mm1                \n\t" // L2
                    PAVGB(%%mm1, %%mm0)                           // L0+L2
                    "movq (%0), %%mm2                       \n\t" // L1
                    PAVGB(%%mm2, %%mm0)
                    "movq %%mm0, (%0)                       \n\t"
                    "movq (%%"REG_a", %1), %%mm0            \n\t" // L3
                    PAVGB(%%mm0, %%mm2)                           // L1+L3
                    PAVGB(%%mm1, %%mm2)                           // 2L2 + L1 + L3
                    "movq %%mm2, (%%"REG_a")                \n\t"
                    "movq (%%"REG_a", %1, 2), %%mm2         \n\t" // L4
                    PAVGB(%%mm2, %%mm1)                           // L2+L4
                    PAVGB(%%mm0, %%mm1)                           // 2L3 + L2 + L4
                    "movq %%mm1, (%%"REG_a", %1)            \n\t"
                    "movq (%0, %1, 4), %%mm1                \n\t" // L5
                    PAVGB(%%mm1, %%mm0)                           // L3+L5
                    PAVGB(%%mm2, %%mm0)                           // 2L4 + L3 + L5
                    "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
                    "movq (%%"REG_d"), %%mm0                \n\t" // L6
                    PAVGB(%%mm0, %%mm2)                           // L4+L6
                    PAVGB(%%mm1, %%mm2)                           // 2L5 + L4 + L6
                    "movq %%mm2, (%0, %1, 4)                \n\t"
                    "movq (%%"REG_d", %1), %%mm2            \n\t" // L7
                    PAVGB(%%mm2, %%mm1)                           // L5+L7
                    PAVGB(%%mm0, %%mm1)                           // 2L6 + L5 + L7
                    "movq %%mm1, (%%"REG_d")                \n\t"
                    "movq (%%"REG_d", %1, 2), %%mm1         \n\t" // L8
                    PAVGB(%%mm1, %%mm0)                           // L6+L8
                    PAVGB(%%mm2, %%mm0)                           // 2L7 + L6 + L8
                    "movq %%mm0, (%%"REG_d", %1)            \n\t"
                    "movq (%0, %1, 8), %%mm0                \n\t" // L9
                    PAVGB(%%mm0, %%mm2)                           // L7+L9
                    PAVGB(%%mm1, %%mm2)                           // 2L8 + L7 + L9
                    "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
                    "movq %%mm1, (%2)                       \n\t"
    
                    : : "r" (src), "r" ((long)stride), "r" (tmp)
                    : "%"REG_a, "%"REG_d
            );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
            int a, b, c, x;
            src+= 4*stride;
    
            for(x=0; x<2; x++){
                    a= *(uint32_t*)&tmp[stride*0];
                    b= *(uint32_t*)&src[stride*0];
                    c= *(uint32_t*)&src[stride*1];
                    a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
    
                    a= *(uint32_t*)&src[stride*2];
                    b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
    
                    b= *(uint32_t*)&src[stride*3];
                    c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
    
                    c= *(uint32_t*)&src[stride*4];
                    a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
    
                    a= *(uint32_t*)&src[stride*5];
                    b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
    
                    b= *(uint32_t*)&src[stride*6];
                    c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
    
                    c= *(uint32_t*)&src[stride*7];
                    a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
    
                    a= *(uint32_t*)&src[stride*8];
                    b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
                    *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
    
                    *(uint32_t*)&tmp[stride*0]= c;
                    src += 4;
                    tmp += 4;
            }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    /**
     * Deinterlaces the given block by applying a median filter to every second line.
     * Will be called for every 8x8 block and can read & write from line 4-15.
     * Lines 0-3 have been passed through the deblock / dering filters already, but can be read too.
     * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
     */
    static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX2
    
            asm volatile(
                    "lea (%0, %1), %%"REG_a"                \n\t"
                    "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
    
                    "movq (%0), %%mm0                       \n\t" //
                    "movq (%%"REG_a", %1), %%mm2            \n\t" //
                    "movq (%%"REG_a"), %%mm1                \n\t" //
                    "movq %%mm0, %%mm3                      \n\t"
                    "pmaxub %%mm1, %%mm0                    \n\t" //
                    "pminub %%mm3, %%mm1                    \n\t" //
                    "pmaxub %%mm2, %%mm1                    \n\t" //
                    "pminub %%mm1, %%mm0                    \n\t"
                    "movq %%mm0, (%%"REG_a")                \n\t"
    
                    "movq (%0, %1, 4), %%mm0                \n\t" //
                    "movq (%%"REG_a", %1, 2), %%mm1         \n\t" //
                    "movq %%mm2, %%mm3                      \n\t"
                    "pmaxub %%mm1, %%mm2                    \n\t" //
                    "pminub %%mm3, %%mm1                    \n\t" //
                    "pmaxub %%mm0, %%mm1                    \n\t" //
                    "pminub %%mm1, %%mm2                    \n\t"
                    "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
    
                    "movq (%%"REG_d"), %%mm2                \n\t" //
                    "movq (%%"REG_d", %1), %%mm1            \n\t" //
                    "movq %%mm2, %%mm3                      \n\t"
                    "pmaxub %%mm0, %%mm2                    \n\t" //
                    "pminub %%mm3, %%mm0                    \n\t" //
                    "pmaxub %%mm1, %%mm0                    \n\t" //
                    "pminub %%mm0, %%mm2                    \n\t"
                    "movq %%mm2, (%%"REG_d")                \n\t"
    
                    "movq (%%"REG_d", %1, 2), %%mm2         \n\t" //
                    "movq (%0, %1, 8), %%mm0                \n\t" //
                    "movq %%mm2, %%mm3                      \n\t"
                    "pmaxub %%mm0, %%mm2                    \n\t" //
                    "pminub %%mm3, %%mm0                    \n\t" //
                    "pmaxub %%mm1, %%mm0                    \n\t" //
                    "pminub %%mm0, %%mm2                    \n\t"
                    "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"