Skip to content
Snippets Groups Projects
postprocess_template.c 162 KiB
Newer Older
  • Learn to ignore specific revisions
  •         "pxor %%mm6, %%mm1                      \n\t"
            "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm2, %%mm6                   \n\t"
            "pxor %%mm6, %%mm2                      \n\t"
            "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm3, %%mm6                   \n\t"
            "pxor %%mm6, %%mm3                      \n\t"
            "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
            "pminsw %%mm2, %%mm0                    \n\t"
            "pminsw %%mm3, %%mm1                    \n\t"
    
            "movq %%mm0, %%mm6                      \n\t"
            "psubusw %%mm2, %%mm6                   \n\t"
            "psubw %%mm6, %%mm0                     \n\t"
            "movq %%mm1, %%mm6                      \n\t"
            "psubusw %%mm3, %%mm6                   \n\t"
            "psubw %%mm6, %%mm1                     \n\t"
    
            "movd %2, %%mm2                         \n\t" // QP
            "punpcklbw %%mm7, %%mm2                 \n\t"
    
            "movq %%mm7, %%mm6                      \n\t" // 0
            "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
            "pxor %%mm6, %%mm4                      \n\t"
            "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
            "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
            "pxor %%mm7, %%mm5                      \n\t"
            "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
    
            "psllw $3, %%mm2                        \n\t" // 8QP
            "movq %%mm2, %%mm3                      \n\t" // 8QP
            "pcmpgtw %%mm4, %%mm2                   \n\t"
            "pcmpgtw %%mm5, %%mm3                   \n\t"
            "pand %%mm2, %%mm4                      \n\t"
            "pand %%mm3, %%mm5                      \n\t"
    
    
            "psubusw %%mm0, %%mm4                   \n\t" // hd
            "psubusw %%mm1, %%mm5                   \n\t" // ld
    
    
            "movq "MANGLE(w05)", %%mm2              \n\t" // 5
            "pmullw %%mm2, %%mm4                    \n\t"
            "pmullw %%mm2, %%mm5                    \n\t"
            "movq "MANGLE(w20)", %%mm2              \n\t" // 32
            "paddw %%mm2, %%mm4                     \n\t"
            "paddw %%mm2, %%mm5                     \n\t"
            "psrlw $6, %%mm4                        \n\t"
            "psrlw $6, %%mm5                        \n\t"
    
            "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
            "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
    
            "pxor %%mm2, %%mm2                      \n\t"
            "pxor %%mm3, %%mm3                      \n\t"
    
            "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
            "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
            "pxor %%mm2, %%mm0                      \n\t"
            "pxor %%mm3, %%mm1                      \n\t"
            "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
            "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
            "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
            "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
    
            "pxor %%mm6, %%mm2                      \n\t"
            "pxor %%mm7, %%mm3                      \n\t"
            "pand %%mm2, %%mm4                      \n\t"
            "pand %%mm3, %%mm5                      \n\t"
    
            "pminsw %%mm0, %%mm4                    \n\t"
            "pminsw %%mm1, %%mm5                    \n\t"
    
            "movq %%mm4, %%mm2                      \n\t"
            "psubusw %%mm0, %%mm2                   \n\t"
            "psubw %%mm2, %%mm4                     \n\t"
            "movq %%mm5, %%mm2                      \n\t"
            "psubusw %%mm1, %%mm2                   \n\t"
            "psubw %%mm2, %%mm5                     \n\t"
    
            "pxor %%mm6, %%mm4                      \n\t"
            "pxor %%mm7, %%mm5                      \n\t"
            "psubw %%mm6, %%mm4                     \n\t"
            "psubw %%mm7, %%mm5                     \n\t"
            "packsswb %%mm5, %%mm4                  \n\t"
            "movq (%0), %%mm0                       \n\t"
            "paddb   %%mm4, %%mm0                   \n\t"
            "movq %%mm0, (%0)                       \n\t"
            "movq (%0, %1), %%mm0                   \n\t"
            "psubb %%mm4, %%mm0                     \n\t"
            "movq %%mm0, (%0, %1)                   \n\t"
    
            : "+r" (src)
            : "r" ((long)stride), "m" (c->pQPb)
            : "%"REG_a, "%"REG_c
        );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        const int l1= stride;
        const int l2= stride + l1;
        const int l3= stride + l2;
        const int l4= stride + l3;
        const int l5= stride + l4;
        const int l6= stride + l5;
        const int l7= stride + l6;
        const int l8= stride + l7;
    //    const int l9= stride + l8;
        int x;
        src+= stride*3;
        for(x=0; x<BLOCK_SIZE; x++){
            const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
            if(FFABS(middleEnergy) < 8*c->QP){
                const int q=(src[l4] - src[l5])/2;
                const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
    
                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
                d= FFMAX(d, 0);
    
                d= (5*d + 32) >> 6;
                d*= FFSIGN(-middleEnergy);
    
                if(q>0){
                    d= d<0 ? 0 : d;
                    d= d>q ? q : d;
                }else{
                    d= d>0 ? 0 : d;
                    d= d<q ? q : d;
                }
    
                src[l4]-= d;
                src[l5]+= d;
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    /**
     * Dering filter for one block.
     *
     * What the portable C fallback below does:
     *  - finds the minimum and maximum of the pixels in rows 1..8, columns 1..8
     *    and returns without touching anything if max - min < deringThreshold
     *  - marks every pixel whose complete 3x3 neighbourhood lies on one side of
     *    avg = (min + max + 1)>>1 (all above it, or all not above it)
     *  - replaces each marked pixel by a 3x3 weighted average
     *    (1 2 1 / 2 4 2 / 1 2 1, divided by 16 with rounding), limiting the change
     *    to at most c->QP/2 + 1 in either direction
     *
     * The MMX2 / 3DNow path is hand written inline assembly operating on the same
     * block; it reads c->pQPb and writes c->pQPb2.
     *
     * NOTE(review): in this listing the opening brace after the signature, the
     * "#define REAL_FIND_MIN_MAX(addr)" header lines, a few closing braces and the
     * declaration of numSkipped are not visible, and version-control annotation
     * text is interleaved with the code - compare against a clean copy of the
     * file before building.
     *
     * @param src    block to filter; rows 0..9 and columns 0..9 are read
     * @param stride offset between two vertically adjacent rows of src
     * @param c      postprocessing context (QP, pQPb, pQPb2 are used)
     */
    static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        asm volatile(
            "pxor %%mm6, %%mm6                      \n\t"
            "pcmpeqb %%mm7, %%mm7                   \n\t"
            "movq %2, %%mm0                         \n\t"
            "punpcklbw %%mm6, %%mm0                 \n\t"
            "psrlw $1, %%mm0                        \n\t"
            "psubw %%mm7, %%mm0                     \n\t"
            "packuswb %%mm0, %%mm0                  \n\t"
            "movq %%mm0, %3                         \n\t"
    
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
    
    
    //        0        1        2        3        4        5        6        7        8        9
    //        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
    
    #undef FIND_MIN_MAX
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX2
    
            "movq " #addr ", %%mm0                  \n\t"\
            "pminub %%mm0, %%mm7                    \n\t"\
            "pmaxub %%mm0, %%mm6                    \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else
    
            "movq " #addr ", %%mm0                  \n\t"\
            "movq %%mm7, %%mm1                      \n\t"\
            "psubusb %%mm0, %%mm6                   \n\t"\
            "paddb %%mm0, %%mm6                     \n\t"\
            "psubusb %%mm0, %%mm1                   \n\t"\
            "psubb %%mm1, %%mm7                     \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
    #define FIND_MIN_MAX(addr)  REAL_FIND_MIN_MAX(addr)
    
    FIND_MIN_MAX((%%REGa))
    FIND_MIN_MAX((%%REGa, %1))
    FIND_MIN_MAX((%%REGa, %1, 2))
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    FIND_MIN_MAX((%0, %1, 4))
    
    FIND_MIN_MAX((%%REGd))
    FIND_MIN_MAX((%%REGd, %1))
    FIND_MIN_MAX((%%REGd, %1, 2))
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    FIND_MIN_MAX((%0, %1, 8))
    
            "movq %%mm7, %%mm4                      \n\t"
            "psrlq $8, %%mm7                        \n\t"
    
            "pminub %%mm4, %%mm7                    \n\t" // min of pixels
            "pshufw $0xF9, %%mm7, %%mm4             \n\t"
            "pminub %%mm4, %%mm7                    \n\t" // min of pixels
            "pshufw $0xFE, %%mm7, %%mm4             \n\t"
            "pminub %%mm4, %%mm7                    \n\t"
    
            "movq %%mm7, %%mm1                      \n\t"
            "psubusb %%mm4, %%mm1                   \n\t"
            "psubb %%mm1, %%mm7                     \n\t"
            "movq %%mm7, %%mm4                      \n\t"
            "psrlq $16, %%mm7                       \n\t"
            "movq %%mm7, %%mm1                      \n\t"
            "psubusb %%mm4, %%mm1                   \n\t"
            "psubb %%mm1, %%mm7                     \n\t"
            "movq %%mm7, %%mm4                      \n\t"
            "psrlq $32, %%mm7                       \n\t"
            "movq %%mm7, %%mm1                      \n\t"
            "psubusb %%mm4, %%mm1                   \n\t"
            "psubb %%mm1, %%mm7                     \n\t"
    
            "movq %%mm6, %%mm4                      \n\t"
            "psrlq $8, %%mm6                        \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX2
    
            "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
            "pshufw $0xF9, %%mm6, %%mm4             \n\t"
            "pmaxub %%mm4, %%mm6                    \n\t"
            "pshufw $0xFE, %%mm6, %%mm4             \n\t"
            "pmaxub %%mm4, %%mm6                    \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #else
    
            "psubusb %%mm4, %%mm6                   \n\t"
            "paddb %%mm4, %%mm6                     \n\t"
            "movq %%mm6, %%mm4                      \n\t"
            "psrlq $16, %%mm6                       \n\t"
            "psubusb %%mm4, %%mm6                   \n\t"
            "paddb %%mm4, %%mm6                     \n\t"
            "movq %%mm6, %%mm4                      \n\t"
            "psrlq $32, %%mm6                       \n\t"
            "psubusb %%mm4, %%mm6                   \n\t"
            "paddb %%mm4, %%mm6                     \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
            "movq %%mm6, %%mm0                      \n\t" // max
            "psubb %%mm7, %%mm6                     \n\t" // max - min
            "movd %%mm6, %%ecx                      \n\t"
            "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
            " jb 1f                                 \n\t"
            "lea -24(%%"REG_SP"), %%"REG_c"         \n\t"
            "and "ALIGN_MASK", %%"REG_c"            \n\t"
            PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
            "punpcklbw %%mm7, %%mm7                 \n\t"
            "punpcklbw %%mm7, %%mm7                 \n\t"
            "punpcklbw %%mm7, %%mm7                 \n\t"
            "movq %%mm7, (%%"REG_c")                \n\t"
    
            "movq (%0), %%mm0                       \n\t" // L10
            "movq %%mm0, %%mm1                      \n\t" // L10
            "movq %%mm0, %%mm2                      \n\t" // L10
            "psllq $8, %%mm1                        \n\t"
            "psrlq $8, %%mm2                        \n\t"
            "movd -4(%0), %%mm3                     \n\t"
            "movd 8(%0), %%mm4                      \n\t"
            "psrlq $24, %%mm3                       \n\t"
            "psllq $56, %%mm4                       \n\t"
            "por %%mm3, %%mm1                       \n\t" // L00
            "por %%mm4, %%mm2                       \n\t" // L20
            "movq %%mm1, %%mm3                      \n\t" // L00
            PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
            PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
            "psubusb %%mm7, %%mm0                   \n\t"
            "psubusb %%mm7, %%mm2                   \n\t"
            "psubusb %%mm7, %%mm3                   \n\t"
            "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
            "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
            "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
            "paddb %%mm2, %%mm0                     \n\t"
            "paddb %%mm3, %%mm0                     \n\t"
    
            "movq (%%"REG_a"), %%mm2                \n\t" // L11
            "movq %%mm2, %%mm3                      \n\t" // L11
            "movq %%mm2, %%mm4                      \n\t" // L11
            "psllq $8, %%mm3                        \n\t"
            "psrlq $8, %%mm4                        \n\t"
            "movd -4(%%"REG_a"), %%mm5              \n\t"
            "movd 8(%%"REG_a"), %%mm6               \n\t"
            "psrlq $24, %%mm5                       \n\t"
            "psllq $56, %%mm6                       \n\t"
            "por %%mm5, %%mm3                       \n\t" // L01
            "por %%mm6, %%mm4                       \n\t" // L21
            "movq %%mm3, %%mm5                      \n\t" // L01
            PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
            PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
            "psubusb %%mm7, %%mm2                   \n\t"
            "psubusb %%mm7, %%mm4                   \n\t"
            "psubusb %%mm7, %%mm5                   \n\t"
            "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
            "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
            "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
            "paddb %%mm4, %%mm2                     \n\t"
            "paddb %%mm5, %%mm2                     \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    // 0, 2, 3, 1
    
    #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
    
            "movq " #src ", " #sx "                 \n\t" /* src[0] */\
            "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
            "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
            "psllq $8, " #lx "                      \n\t"\
            "psrlq $8, " #t0 "                      \n\t"\
            "movd -4" #src ", " #t1 "               \n\t"\
            "psrlq $24, " #t1 "                     \n\t"\
            "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
            "movd 8" #src ", " #t1 "                \n\t"\
            "psllq $56, " #t1 "                     \n\t"\
            "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
            "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
            PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
            PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
            PAVGB(lx, pplx)                                     \
            "movq " #lx ", 8(%%"REG_c")             \n\t"\
            "movq (%%"REG_c"), " #lx "              \n\t"\
            "psubusb " #lx ", " #t1 "               \n\t"\
            "psubusb " #lx ", " #t0 "               \n\t"\
            "psubusb " #lx ", " #sx "               \n\t"\
            "movq "MANGLE(b00)", " #lx "            \n\t"\
            "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
            "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
            "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
            "paddb " #t1 ", " #t0 "                 \n\t"\
            "paddb " #t0 ", " #sx "                 \n\t"\
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    \
    
            PAVGB(plx, pplx)                              /* filtered */\
            "movq " #dst ", " #t0 "                 \n\t" /* dst */\
            "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
            "psubusb %3, " #t0 "                    \n\t"\
            "paddusb %3, " #t1 "                    \n\t"\
            PMAXUB(t0, pplx)\
            PMINUB(t1, pplx, t0)\
            "paddb " #sx ", " #ppsx "               \n\t"\
            "paddb " #psx ", " #ppsx "              \n\t"\
            "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
            "pand "MANGLE(b08)", " #ppsx "          \n\t"\
            "pcmpeqb " #lx ", " #ppsx "             \n\t"\
            "pand " #ppsx ", " #pplx "              \n\t"\
            "pandn " #dst ", " #ppsx "              \n\t"\
            "por " #pplx ", " #ppsx "               \n\t"\
            "movq " #ppsx ", " #dst "               \n\t"\
            "movq 8(%%"REG_c"), " #lx "             \n\t"
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    
    #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
       REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /*
    0000000
    1111111
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    1111110
    1111101
    1111100
    1111011
    1111010
    1111001
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    1111000
    1110111
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    */
    
    //DERING_CORE(dst          ,src            ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
    DERING_CORE((%%REGa)       ,(%%REGa, %1)   ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
    DERING_CORE((%%REGa, %1)   ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
    DERING_CORE((%%REGa, %1, 2),(%0, %1, 4)    ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
    DERING_CORE((%0, %1, 4)    ,(%%REGd)       ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
    DERING_CORE((%%REGd)       ,(%%REGd, %1)   ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
    DERING_CORE((%%REGd, %1)   ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
    DERING_CORE((%%REGd, %1, 2),(%0, %1, 8)    ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
    DERING_CORE((%0, %1, 8)    ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
    
    
            "1:                        \n\t"
            : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
            : "%"REG_a, "%"REG_d, "%"REG_c
        );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        int y;
        int min=255;
        int max=0;
        int avg;
        uint8_t *p;
        // one bitmask per row, rows 0..9
        int s[10];
        // largest change the filter is allowed to make to a pixel
        const int QP2= c->QP/2 + 1;
    
        // find the smallest and largest pixel in rows 1..8, columns 1..8
        for(y=1; y<9; y++){
            int x;
            p= src + stride*y;
            for(x=1; x<9; x++){
                p++;
                if(*p > max) max= *p;
                if(*p < min) min= *p;
    
        }
        // threshold halfway between min and max, rounded up
        avg= (min + max + 1)>>1;
    
        // not enough contrast in this block: leave it untouched
        if(max - min <deringThreshold) return;
    
        for(y=0; y<10; y++){
            int t = 0;
    
            // bit x of t is set when the pixel in column x of row y is above avg
            if(src[stride*y + 0] > avg) t+= 1;
            if(src[stride*y + 1] > avg) t+= 2;
            if(src[stride*y + 2] > avg) t+= 4;
            if(src[stride*y + 3] > avg) t+= 8;
            if(src[stride*y + 4] > avg) t+= 16;
            if(src[stride*y + 5] > avg) t+= 32;
            if(src[stride*y + 6] > avg) t+= 64;
            if(src[stride*y + 7] > avg) t+= 128;
            if(src[stride*y + 8] > avg) t+= 256;
            if(src[stride*y + 9] > avg) t+= 512;
    
            // keep the inverted flags ("not above avg") in the bits from 16 upwards
            t |= (~t)<<16;
            // a bit survives only if its left and right neighbour bits are set too
            t &= (t<<1) & (t>>1);
            s[y] = t;
        }
    
        // combine each row with the rows above and below it; the result for row y
        // is stored in s[y-1], so s[y] and s[y+1] are still unmodified when the
        // next iteration reads them
        for(y=1; y<9; y++){
            int t = s[y-1] & s[y] & s[y+1];
            // merge the "all above avg" and "all not above avg" cases
            t|= t>>16;
            s[y-1]= t;
        }
    
        for(y=1; y<9; y++){
            int x;
            int t = s[y-1];
    
            p= src + stride*y;
            for(x=1; x<9; x++){
                p++;
                // only filter pixels whose 3x3 neighbourhood was marked above
                if(t & (1<<x)){
                    // 3x3 weighted sum, weights 1 2 1 / 2 4 2 / 1 2 1 (sum 16)
                    int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                          +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                          +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                    // divide by 16 with rounding
                    f= (f + 8)>>4;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef DEBUG_DERING_THRESHOLD
    
                        asm volatile("emms\n\t":);
                        {
                        static long long numPixels=0;
                        if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
    //                    if((max-min)<20 || (max-min)*QP<200)
    //                    if((max-min)*QP < 500)
    //                    if(max-min<QP/2)
                        if(max-min < 20){
    
                            static int errorSum=0;
                            static int worstQP=0;
                            static int worstRange=0;
                            static int worstDiff=0;
                            int diff= (f - *p);
                            int absDiff= FFABS(diff);
                            int error= diff*diff;
    
                            if(x==1 || x==8 || y==1 || y==8) continue;
    
    
                            if(absDiff > worstDiff){
                                worstDiff= absDiff;
                                worstQP= QP;
                                worstRange= max-min;
    
                            if(1024LL*1024LL*1024LL % numSkipped == 0){
    
                                av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
                                       "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
    
                                       (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
                                       worstDiff, (float)numSkipped/numPixels);
    
                            }
                        }
                        }
    #endif
                        // move the pixel to the filtered value f, but never by more than QP2
                        if     (*p + QP2 < f) *p= *p + QP2;
                        else if(*p - QP2 > f) *p= *p - QP2;
                        else *p=f;
                }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef DEBUG_DERING_THRESHOLD
    
        if(max-min < 20){
            for(y=1; y<9; y++){
                int x;
                int t = 0;
                p= src + stride*y;
                for(x=1; x<9; x++){
                    p++;
                    *p = FFMIN(*p + 20, 255);
                }
    
    //        src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
        }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #endif
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * Deinterlaces the given block by linearly interpolating every second line.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * will be called for every 8x8 block and can read & write from line 4-15
    
    Diego Biurrun's avatar
    Diego Biurrun committed
     * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
    
    static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
    
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        src+= 4*stride;
        asm volatile(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
    
    
            "movq (%0), %%mm0                       \n\t"
            "movq (%%"REG_a", %1), %%mm1            \n\t"
            PAVGB(%%mm1, %%mm0)
            "movq %%mm0, (%%"REG_a")                \n\t"
            "movq (%0, %1, 4), %%mm0                \n\t"
            PAVGB(%%mm0, %%mm1)
            "movq %%mm1, (%%"REG_a", %1, 2)         \n\t"
            "movq (%%"REG_c", %1), %%mm1            \n\t"
            PAVGB(%%mm1, %%mm0)
            "movq %%mm0, (%%"REG_c")                \n\t"
            "movq (%0, %1, 8), %%mm0                \n\t"
            PAVGB(%%mm0, %%mm1)
            "movq %%mm1, (%%"REG_c", %1, 2)         \n\t"
    
            : : "r" (src), "r" ((long)stride)
            : "%"REG_a, "%"REG_c
        );
    
        int a, b, x;
        src+= 4*stride;
    
        for(x=0; x<2; x++){
            a= *(uint32_t*)&src[stride*0];
            b= *(uint32_t*)&src[stride*2];
            *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
            a= *(uint32_t*)&src[stride*4];
            *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
            b= *(uint32_t*)&src[stride*6];
            *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
            a= *(uint32_t*)&src[stride*8];
            *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
            src += 4;
        }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * Deinterlaces the given block by cubic interpolating every second line.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * will be called for every 8x8 block and can read & write from line 4-15
    
    Diego Biurrun's avatar
    Diego Biurrun committed
     * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
     * this filter will read lines 3-15 and write 7-13
    
    static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
    
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        src+= stride*3;
        asm volatile(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
            "lea (%%"REG_d", %1, 4), %%"REG_c"      \n\t"
            "add %1, %%"REG_c"                      \n\t"
            "pxor %%mm7, %%mm7                      \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9       10
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
    
            "movq " #a ", %%mm0                     \n\t"\
            "movq " #b ", %%mm1                     \n\t"\
            "movq " #d ", %%mm2                     \n\t"\
            "movq " #e ", %%mm3                     \n\t"\
            PAVGB(%%mm2, %%mm1)                             /* (b+d) /2 */\
            PAVGB(%%mm3, %%mm0)                             /* a(a+e) /2 */\
            "movq %%mm0, %%mm2                      \n\t"\
            "punpcklbw %%mm7, %%mm0                 \n\t"\
            "punpckhbw %%mm7, %%mm2                 \n\t"\
            "movq %%mm1, %%mm3                      \n\t"\
            "punpcklbw %%mm7, %%mm1                 \n\t"\
            "punpckhbw %%mm7, %%mm3                 \n\t"\
            "psubw %%mm1, %%mm0                     \n\t"   /* L(a+e - (b+d))/2 */\
            "psubw %%mm3, %%mm2                     \n\t"   /* H(a+e - (b+d))/2 */\
            "psraw $3, %%mm0                        \n\t"   /* L(a+e - (b+d))/16 */\
            "psraw $3, %%mm2                        \n\t"   /* H(a+e - (b+d))/16 */\
            "psubw %%mm0, %%mm1                     \n\t"   /* L(9b + 9d - a - e)/16 */\
            "psubw %%mm2, %%mm3                     \n\t"   /* H(9b + 9d - a - e)/16 */\
            "packuswb %%mm3, %%mm1                  \n\t"\
            "movq %%mm1, " #c "                     \n\t"
    
    #define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e)
    
    DEINT_CUBIC((%0)        , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
    DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%0, %1, 8))
    DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
    DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc)    , (%%REGc, %1, 2))
    
            : : "r" (src), "r" ((long)stride)
            : "%"REG_a, "%"REG_d, "%"REG_c
        );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        int x;
        src+= stride*3;
        for(x=0; x<8; x++){
            src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
            src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
            src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
            src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
            src++;
        }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    /**
     * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
     * Will be called for every 8x8 block and can read & write from line 4-15.
     * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
     * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
     * This filter will read lines 4-13 and write 5-11.
     */
    static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        src+= stride*4;
        asm volatile(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
            "pxor %%mm7, %%mm7                      \n\t"
            "movq (%2), %%mm0                       \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9       10
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
    
            "movq " #a ", %%mm1                     \n\t"\
            "movq " #b ", %%mm2                     \n\t"\
            "movq " #c ", %%mm3                     \n\t"\
            "movq " #d ", %%mm4                     \n\t"\
            PAVGB(%%mm3, %%mm1)                          \
            PAVGB(%%mm4, %%mm0)                          \
            "movq %%mm0, %%mm3                      \n\t"\
            "punpcklbw %%mm7, %%mm0                 \n\t"\
            "punpckhbw %%mm7, %%mm3                 \n\t"\
            "movq %%mm1, %%mm4                      \n\t"\
            "punpcklbw %%mm7, %%mm1                 \n\t"\
            "punpckhbw %%mm7, %%mm4                 \n\t"\
            "psllw $2, %%mm1                        \n\t"\
            "psllw $2, %%mm4                        \n\t"\
            "psubw %%mm0, %%mm1                     \n\t"\
            "psubw %%mm3, %%mm4                     \n\t"\
            "movq %%mm2, %%mm5                      \n\t"\
            "movq %%mm2, %%mm0                      \n\t"\
            "punpcklbw %%mm7, %%mm2                 \n\t"\
            "punpckhbw %%mm7, %%mm5                 \n\t"\
            "paddw %%mm2, %%mm1                     \n\t"\
            "paddw %%mm5, %%mm4                     \n\t"\
            "psraw $2, %%mm1                        \n\t"\
            "psraw $2, %%mm4                        \n\t"\
            "packuswb %%mm4, %%mm1                  \n\t"\
            "movq %%mm1, " #b "                     \n\t"\
    
    #define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d)
    
    
    DEINT_FF((%0)        , (%%REGa)       , (%%REGa, %1), (%%REGa, %1, 2))
    DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd)       )
    DEINT_FF((%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%%REGd, %1, 2))
    DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
    
            "movq %%mm0, (%2)                       \n\t"
            : : "r" (src), "r" ((long)stride), "r"(tmp)
            : "%"REG_a, "%"REG_d
        );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        int x;
        src+= stride*4;
        for(x=0; x<8; x++){
            int t1= tmp[x];
            int t2= src[stride*1];
    
            src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
            t1= src[stride*4];
            src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
            t2= src[stride*6];
            src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
            t1= src[stride*8];
            src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
            tmp[x]= t1;
    
            src++;
        }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    /**
     * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
     * will be called for every 8x8 block and can read & write from line 4-15
     * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
     * this filter will read lines 4-13 and write 4-11
     *
     * @param src    pointer to line 0 of the block; the function itself skips ahead by 4 lines
     * @param stride offset between two consecutive lines of src
     * @param tmp    8-byte scratch line: on entry the unfiltered line two above line 4,
     *               on exit the unfiltered line 10
     * @param tmp2   8-byte scratch line: on entry the unfiltered line directly above line 4,
     *               on exit the unfiltered line 11
     */
    static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)

        src+= stride*4;
        asm volatile(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
            "pxor %%mm7, %%mm7                      \n\t"
            "movq (%2), %%mm0                       \n\t"
            "movq (%3), %%mm1                       \n\t"

    //      0       1       2       3       4       5       6       7       8       9       10
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx

    /*
     * DEINT_L5(t1,t2,a,b,c) filters line a in place. t1 and t2 are registers holding the
     * unfiltered lines two and one above a, b and c are the lines one and two below:
     *     a = (3*a + 2*avg(b,t2) - avg(c,t1)) >> 2, packed back with unsigned saturation.
     * The unfiltered a is left in t1, which is why successive invocations swap mm0/mm1.
     * (PAVGB is defined elsewhere; presumably a per-byte average -- confirm.)
     */
    #define REAL_DEINT_L5(t1,t2,a,b,c)\
            "movq " #a ", %%mm2                     \n\t"\
            "movq " #b ", %%mm3                     \n\t"\
            "movq " #c ", %%mm4                     \n\t"\
            PAVGB(t2, %%mm3)                             \
            PAVGB(t1, %%mm4)                             \
            "movq %%mm2, %%mm5                      \n\t"\
            "movq %%mm2, " #t1 "                    \n\t"\
            "punpcklbw %%mm7, %%mm2                 \n\t"\
            "punpckhbw %%mm7, %%mm5                 \n\t"\
            "movq %%mm2, %%mm6                      \n\t"\
            "paddw %%mm2, %%mm2                     \n\t"\
            "paddw %%mm6, %%mm2                     \n\t"\
            "movq %%mm5, %%mm6                      \n\t"\
            "paddw %%mm5, %%mm5                     \n\t"\
            "paddw %%mm6, %%mm5                     \n\t"\
            "movq %%mm3, %%mm6                      \n\t"\
            "punpcklbw %%mm7, %%mm3                 \n\t"\
            "punpckhbw %%mm7, %%mm6                 \n\t"\
            "paddw %%mm3, %%mm3                     \n\t"\
            "paddw %%mm6, %%mm6                     \n\t"\
            "paddw %%mm3, %%mm2                     \n\t"\
            "paddw %%mm6, %%mm5                     \n\t"\
            "movq %%mm4, %%mm6                      \n\t"\
            "punpcklbw %%mm7, %%mm4                 \n\t"\
            "punpckhbw %%mm7, %%mm6                 \n\t"\
            "psubw %%mm4, %%mm2                     \n\t"\
            "psubw %%mm6, %%mm5                     \n\t"\
            "psraw $2, %%mm2                        \n\t"\
            "psraw $2, %%mm5                        \n\t"\
            "packuswb %%mm5, %%mm2                  \n\t"\
            "movq %%mm2, " #a "                     \n\t"\

    #define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c)

    DEINT_L5(%%mm0, %%mm1, (%0)           , (%%REGa)       , (%%REGa, %1)   )
    DEINT_L5(%%mm1, %%mm0, (%%REGa)       , (%%REGa, %1)   , (%%REGa, %1, 2))
    DEINT_L5(%%mm0, %%mm1, (%%REGa, %1)   , (%%REGa, %1, 2), (%0, %1, 4)   )
    DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4)    , (%%REGd)       )

    DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)    , (%%REGd)       , (%%REGd, %1)   )

    DEINT_L5(%%mm1, %%mm0, (%%REGd)       , (%%REGd, %1)   , (%%REGd, %1, 2))
    DEINT_L5(%%mm0, %%mm1, (%%REGd, %1)   , (%%REGd, %1, 2), (%0, %1, 8)   )
    DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))

            "movq %%mm0, (%2)                       \n\t"
            "movq %%mm1, (%3)                       \n\t"
            : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
            // "memory": the asm stores to src, tmp and tmp2, which are not listed as outputs
            : "%"REG_a, "%"REG_d, "memory"
        );

    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)

        int x;
        src+= stride*4;
        for(x=0; x<8; x++){
            // t1, t2, t3 rotate through the unfiltered values of the three most recent
            // lines, because src is overwritten in place while later lines still need them.
            int t1= tmp[x];
            int t2= tmp2[x];
            int t3= src[0];

            src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
            t1= src[stride*1];
            src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
            t2= src[stride*2];
            src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
            t3= src[stride*3];
            src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
            t1= src[stride*4];
            src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
            t2= src[stride*5];
            src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
            t3= src[stride*6];
            src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
            t1= src[stride*7];
            src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);

            // hand the two last unfiltered lines over through the scratch buffers
            tmp[x]= t3;
            tmp2[x]= t1;

            src++;
        }

    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * will be called for every 8x8 block and can read & write from line 4-15
    
    Diego Biurrun's avatar
    Diego Biurrun committed
     * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
     * this filter will read lines 4-13 and write 4-11
    
    static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
    
    {
    #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        src+= 4*stride;
        asm volatile(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
    
    
            "movq (%2), %%mm0                       \n\t" // L0
            "movq (%%"REG_a"), %%mm1                \n\t" // L2
            PAVGB(%%mm1, %%mm0)                           // L0+L2
            "movq (%0), %%mm2                       \n\t" // L1
            PAVGB(%%mm2, %%mm0)
            "movq %%mm0, (%0)                       \n\t"
            "movq (%%"REG_a", %1), %%mm0            \n\t" // L3
            PAVGB(%%mm0, %%mm2)                           // L1+L3
            PAVGB(%%mm1, %%mm2)                           // 2L2 + L1 + L3
            "movq %%mm2, (%%"REG_a")                \n\t"
            "movq (%%"REG_a", %1, 2), %%mm2         \n\t" // L4
            PAVGB(%%mm2, %%mm1)                           // L2+L4
            PAVGB(%%mm0, %%mm1)                           // 2L3 + L2 + L4
            "movq %%mm1, (%%"REG_a", %1)            \n\t"
            "movq (%0, %1, 4), %%mm1                \n\t" // L5
            PAVGB(%%mm1, %%mm0)                           // L3+L5
            PAVGB(%%mm2, %%mm0)                           // 2L4 + L3 + L5
            "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
            "movq (%%"REG_d"), %%mm0                \n\t" // L6
            PAVGB(%%mm0, %%mm2)                           // L4+L6
            PAVGB(%%mm1, %%mm2)                           // 2L5 + L4 + L6
            "movq %%mm2, (%0, %1, 4)                \n\t"
            "movq (%%"REG_d", %1), %%mm2            \n\t" // L7
            PAVGB(%%mm2, %%mm1)                           // L5+L7
            PAVGB(%%mm0, %%mm1)                           // 2L6 + L5 + L7
            "movq %%mm1, (%%"REG_d")                \n\t"
            "movq (%%"REG_d", %1, 2), %%mm1         \n\t" // L8
            PAVGB(%%mm1, %%mm0)                           // L6+L8
            PAVGB(%%mm2, %%mm0)                           // 2L7 + L6 + L8
            "movq %%mm0, (%%"REG_d", %1)            \n\t"
            "movq (%0, %1, 8), %%mm0                \n\t" // L9
            PAVGB(%%mm0, %%mm2)                           // L7+L9
            PAVGB(%%mm1, %%mm2)                           // 2L8 + L7 + L9
            "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
            "movq %%mm1, (%2)                       \n\t"
    
            : : "r" (src), "r" ((long)stride), "r" (tmp)
            : "%"REG_a, "%"REG_d
        );
    
    #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
        int a, b, c, x;
        src+= 4*stride;
    
        for(x=0; x<2; x++){
            a= *(uint32_t*)&tmp[stride*0];
            b= *(uint32_t*)&src[stride*0];
            c= *(uint32_t*)&src[stride*1];
            a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
    
            a= *(uint32_t*)&src[stride*2];
            b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
    
            b= *(uint32_t*)&src[stride*3];
            c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
    
            c= *(uint32_t*)&src[stride*4];
            a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
    
            a= *(uint32_t*)&src[stride*5];
            b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
    
            b= *(uint32_t*)&src[stride*6];
            c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
    
            c= *(uint32_t*)&src[stride*7];
            a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
    
            a= *(uint32_t*)&src[stride*8];
            b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
            *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
    
            *(uint32_t*)&tmp[stride*0]= c;
            src += 4;
            tmp += 4;
        }
    
    #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    
    /**
     * Deinterlaces the given block by applying a median filter to every second line.
     * will be called for every 8x8 block and can read & write from line 4-15,
     * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
     * lines 4-12 will be read into the deblocking filter and should be deinterlaced
     */
    static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    #ifdef HAVE_MMX2
    
        asm volatile(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
    
    
            "movq (%0), %%mm0                       \n\t" //
            "movq (%%"REG_a", %1), %%mm2            \n\t" //
            "movq (%%"REG_a"), %%mm1                \n\t" //
            "movq %%mm0, %%mm3                      \n\t"
            "pmaxub %%mm1, %%mm0                    \n\t" //
            "pminub %%mm3, %%mm1                    \n\t" //
            "pmaxub %%mm2, %%mm1                    \n\t" //
            "pminub %%mm1, %%mm0                    \n\t"
            "movq %%mm0, (%%"REG_a")                \n\t"
    
            "movq (%0, %1, 4), %%mm0                \n\t" //
            "movq (%%"REG_a", %1, 2), %%mm1         \n\t" //
            "movq %%mm2, %%mm3                      \n\t"
            "pmaxub %%mm1, %%mm2                    \n\t" //
            "pminub %%mm3, %%mm1                    \n\t" //
            "pmaxub %%mm0, %%mm1                    \n\t" //
            "pminub %%mm1, %%mm2                    \n\t"
            "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
    
            "movq (%%"REG_d"), %%mm2                \n\t" //
            "movq (%%"REG_d", %1), %%mm1            \n\t" //
            "movq %%mm2, %%mm3                      \n\t"
            "pmaxub %%mm0, %%mm2                    \n\t" //
            "pminub %%mm3, %%mm0                    \n\t" //
            "pmaxub %%mm1, %%mm0                    \n\t" //
            "pminub %%mm0, %%mm2                    \n\t"
            "movq %%mm2, (%%"REG_d")                \n\t"
    
            "movq (%%"REG_d", %1, 2), %%mm2         \n\t" //
            "movq (%0, %1, 8), %%mm0                \n\t" //
            "movq %%mm2, %%mm3                      \n\t"
            "pmaxub %%mm0, %%mm2                    \n\t" //
            "pminub %%mm3, %%mm0                    \n\t" //
            "pmaxub %%mm1, %%mm0                    \n\t" //
            "pminub %%mm0, %%mm2                    \n\t"
            "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
    
    
            : : "r" (src), "r" ((long)stride)
            : "%"REG_a, "%"REG_d
        );
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    #else // MMX without MMX2
    
        asm volatile(
            "lea (%0, %1), %%"REG_a"                \n\t"
            "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
    
    //      0       1       2       3       4       5       6       7       8       9
    //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
    
            "movq " #a ", %%mm0                     \n\t"\
            "movq " #b ", %%mm2                     \n\t"\
            "movq " #c ", %%mm1                     \n\t"\
            "movq %%mm0, %%mm3                      \n\t"\
            "movq %%mm1, %%mm4                      \n\t"\
            "movq %%mm2, %%mm5                      \n\t"\
            "psubusb %%mm1, %%mm3                   \n\t"\
            "psubusb %%mm2, %%mm4                   \n\t"\
            "psubusb %%mm0, %%mm5                   \n\t"\
            "pcmpeqb %%mm7, %%mm3                   \n\t"\
            "pcmpeqb %%mm7, %%mm4                   \n\t"\
            "pcmpeqb %%mm7, %%mm5                   \n\t"\
            "movq %%mm3, %%mm6                      \n\t"\
            "pxor %%mm4, %%mm3                      \n\t"\
            "pxor %%mm5, %%mm4                      \n\t"\
            "pxor %%mm6, %%mm5                      \n\t"\
            "por %%mm3, %%mm1                       \n\t"\
            "por %%mm4, %%mm2                       \n\t"\
            "por %%mm5, %%mm0                       \n\t"\
            "pand %%mm2, %%mm0                      \n\t"\
            "pand %%mm1, %%mm0                      \n\t"\
            "movq %%mm0, " #b "                     \n\t"
    
    #define MEDIAN(a,b,c)  REAL_MEDIAN(a,b,c)
    
    MEDIAN((%0)        , (%%REGa)       , (%%REGa, %1))
    
    MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
    
    MEDIAN((%0, %1, 4) , (%%REGd)       , (%%REGd, %1))
    
    MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
    
            : : "r" (src), "r" ((long)stride)
            : "%"REG_a, "%"REG_d
        );
    
        int x, y;
        src+= 4*stride;
        // FIXME - there should be a way to do a few columns in parallel like w/mmx
        for(x=0; x<8; x++){
            uint8_t *colsrc = src;
            for (y=0; y<4; y++){
                int a, b, c, d, e, f;