Skip to content
Snippets Groups Projects
dsputil.c 167 KiB
Newer Older
  • Learn to ignore specific revisions
  •         const int p0 = pix[-1*xstride];
    
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];
    
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {
    
                if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                    if( FFABS( p2 - p0 ) < beta)
                    {
                        const int p3 = pix[-4*xstride];
                        /* p0', p1', p2' */
                        pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                        pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                        pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                    } else {
                        /* p0' */
                        pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                    }
                    if( FFABS( q2 - q0 ) < beta)
                    {
                        const int q3 = pix[3*xstride];
                        /* q0', q1', q2' */
                        pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                        pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                        pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                    } else {
                        /* q0' */
                        pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                    }
                }else{
                    /* p0', q0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                    pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }
            pix += ystride;
        }
    }
    /**
     * Intra luma loop filter with filter taps spaced @p stride bytes apart.
     *
     * Forwards to h264_loop_filter_luma_intra_c() using @p stride as the tap
     * spacing (xstride) and 1 as the step between successive filtered positions
     * (ystride).
     */
    static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
    {
        const int tap_step  = stride;
        const int line_step = 1;

        h264_loop_filter_luma_intra_c(pix, tap_step, line_step, alpha, beta);
    }
    /**
     * Intra luma loop filter with filter taps on adjacent bytes.
     *
     * Forwards to h264_loop_filter_luma_intra_c() using 1 as the tap spacing
     * (xstride) and @p stride as the step between successive filtered positions
     * (ystride).
     */
    static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
    {
        const int tap_step  = 1;
        const int line_step = stride;

        h264_loop_filter_luma_intra_c(pix, tap_step, line_step, alpha, beta);
    }
    
    
    /**
     * Filter a chroma edge of 8 positions using per-pair clipping strengths.
     *
     * @param pix     first sample on the q side of the edge; p samples are read
     *                at negative multiples of @p xstride
     * @param xstride distance between neighbouring filter taps
     * @param ystride distance between successive filtered positions
     * @param alpha   threshold on |p0 - q0|
     * @param beta    threshold on |p1 - p0| and |q1 - q0|
     * @param tc0     four clipping limits, each applied to two consecutive
     *                positions; a value <= 0 leaves that pair unfiltered
     */
    static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
    {
        int i, d;
        for( i = 0; i < 4; i++ ) {
            const int tc = tc0[i];
            if( tc <= 0 ) {
                /* nothing to do for this pair: step over both positions */
                pix += 2*ystride;
                continue;
            }
            for( d = 0; d < 2; d++ ) {
                const int p0 = pix[-1*xstride];
                const int p1 = pix[-2*xstride];
                const int q0 = pix[0];
                const int q1 = pix[1*xstride];

                if( FFABS( p0 - q0 ) < alpha &&
                    FFABS( p1 - p0 ) < beta &&
                    FFABS( q1 - q0 ) < beta ) {

                    /* multiply instead of shifting left: q0 - p0 may be negative */
                    int delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                    pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                    pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
                }
                pix += ystride;
            }
        }
    }
    /**
     * Chroma loop filter with filter taps spaced @p stride bytes apart.
     *
     * Forwards to h264_loop_filter_chroma_c() with xstride = @p stride and
     * ystride = 1.
     */
    static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
    {
        const int tap_step  = stride;
        const int line_step = 1;

        h264_loop_filter_chroma_c(pix, tap_step, line_step, alpha, beta, tc0);
    }
    
    /**
     * Chroma loop filter with filter taps on adjacent bytes.
     *
     * Forwards to h264_loop_filter_chroma_c() with xstride = 1 and
     * ystride = @p stride.
     */
    static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
    {
        const int tap_step  = 1;
        const int line_step = stride;

        h264_loop_filter_chroma_c(pix, tap_step, line_step, alpha, beta, tc0);
    }
    
    
    /**
     * Intra chroma edge filter over 8 positions.
     *
     * For each position, p0 and q0 are replaced by a rounded 4-weight average
     * when |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta; otherwise
     * the samples are left unchanged.
     *
     * @param pix     first sample on the q side; p samples are read at negative
     *                multiples of @p xstride
     * @param xstride distance between neighbouring filter taps
     * @param ystride distance between successive filtered positions
     */
    static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
    {
        int pos;
        for( pos = 0; pos < 8; pos++, pix += ystride ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            /* skip this position as soon as one gradient reaches its threshold */
            if( !( FFABS( p0 - q0 ) < alpha ) )
                continue;
            if( !( FFABS( p1 - p0 ) < beta ) )
                continue;
            if( !( FFABS( q1 - q0 ) < beta ) )
                continue;

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
    }
    /**
     * Intra chroma loop filter with filter taps spaced @p stride bytes apart.
     *
     * Forwards to h264_loop_filter_chroma_intra_c() with xstride = @p stride
     * and ystride = 1.
     */
    static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
    {
        const int tap_step  = stride;
        const int line_step = 1;

        h264_loop_filter_chroma_intra_c(pix, tap_step, line_step, alpha, beta);
    }
    /**
     * Intra chroma loop filter with filter taps on adjacent bytes.
     *
     * Forwards to h264_loop_filter_chroma_intra_c() with xstride = 1 and
     * ystride = @p stride.
     */
    static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
    {
        const int tap_step  = 1;
        const int line_step = stride;

        h264_loop_filter_chroma_intra_c(pix, tap_step, line_step, alpha, beta);
    }
    
    
    /**
     * Sum of absolute differences between two blocks 16 samples wide.
     *
     * @param v         unused
     * @param pix1      first block
     * @param pix2      second block
     * @param line_size distance in bytes between consecutive rows of both blocks
     * @param h         number of rows to compare
     * @return sum of |pix1 - pix2| over 16 x h samples (0 when h <= 0)
     */
    static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 16; x++)
                s += abs(pix1[x] - pix2[x]);
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }
    
    
    /**
     * Sum of absolute differences between a 16-wide block and a reference that
     * is interpolated between each sample and its right-hand neighbour.
     *
     * Each reference value is avg2(pix2[x], pix2[x+1]), so 17 samples are read
     * from every row of @p pix2.
     *
     * @param v         unused
     * @param pix1      block to compare
     * @param pix2      reference block
     * @param line_size distance in bytes between consecutive rows
     * @param h         number of rows to compare
     * @return accumulated absolute difference
     */
    static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 16; x++)
                s += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }
    
    
    /**
     * Sum of absolute differences between a 16-wide block and a reference that
     * is interpolated between each sample and the one a row below it.
     *
     * Each reference value is avg2(pix2[x], pix2[x + line_size]), so h + 1 rows
     * of @p pix2 are read.
     *
     * @param v         unused
     * @param pix1      block to compare
     * @param pix2      reference block
     * @param line_size distance in bytes between consecutive rows
     * @param h         number of rows to compare
     * @return accumulated absolute difference
     */
    static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;
        uint8_t *pix3 = pix2 + line_size;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 16; x++)
                s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
            pix1 += line_size;
            pix2 += line_size;
            pix3 += line_size;
        }
        return s;
    }
    
    
    /**
     * Sum of absolute differences between a 16-wide block and a reference that
     * is interpolated from each 2x2 neighbourhood.
     *
     * Each reference value is avg4() of the sample, its right neighbour and the
     * two samples below them, so 17 samples from h + 1 rows of @p pix2 are read.
     *
     * @param v         unused
     * @param pix1      block to compare
     * @param pix2      reference block
     * @param line_size distance in bytes between consecutive rows
     * @param h         number of rows to compare
     * @return accumulated absolute difference
     */
    static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;
        uint8_t *pix3 = pix2 + line_size;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 16; x++)
                s += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
            pix1 += line_size;
            pix2 += line_size;
            pix3 += line_size;
        }
        return s;
    }
    
    
    /**
     * Sum of absolute differences between two blocks 8 samples wide.
     *
     * @param v         unused
     * @param pix1      first block
     * @param pix2      second block
     * @param line_size distance in bytes between consecutive rows of both blocks
     * @param h         number of rows to compare
     * @return sum of |pix1 - pix2| over 8 x h samples (0 when h <= 0)
     */
    static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 8; x++)
                s += abs(pix1[x] - pix2[x]);
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }
    
    
    /**
     * Sum of absolute differences between an 8-wide block and a reference that
     * is interpolated between each sample and its right-hand neighbour.
     *
     * Each reference value is avg2(pix2[x], pix2[x+1]), so 9 samples are read
     * from every row of @p pix2.
     *
     * @param v         unused
     * @param pix1      block to compare
     * @param pix2      reference block
     * @param line_size distance in bytes between consecutive rows
     * @param h         number of rows to compare
     * @return accumulated absolute difference
     */
    static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 8; x++)
                s += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }
    
    
    /**
     * Sum of absolute differences between an 8-wide block and a reference that
     * is interpolated between each sample and the one a row below it.
     *
     * Each reference value is avg2(pix2[x], pix2[x + line_size]), so h + 1 rows
     * of @p pix2 are read.
     *
     * @param v         unused
     * @param pix1      block to compare
     * @param pix2      reference block
     * @param line_size distance in bytes between consecutive rows
     * @param h         number of rows to compare
     * @return accumulated absolute difference
     */
    static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;
        uint8_t *pix3 = pix2 + line_size;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 8; x++)
                s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
            pix1 += line_size;
            pix2 += line_size;
            pix3 += line_size;
        }
        return s;
    }
    
    
    /**
     * Sum of absolute differences between an 8-wide block and a reference that
     * is interpolated from each 2x2 neighbourhood.
     *
     * Each reference value is avg4() of the sample, its right neighbour and the
     * two samples below them, so 9 samples from h + 1 rows of @p pix2 are read.
     *
     * @param v         unused
     * @param pix1      block to compare
     * @param pix2      reference block
     * @param line_size distance in bytes between consecutive rows
     * @param h         number of rows to compare
     * @return accumulated absolute difference
     */
    static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    {
        int s = 0;
        int i, x;
        uint8_t *pix3 = pix2 + line_size;

        for (i = 0; i < h; i++) {
            for (x = 0; x < 8; x++)
                s += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
            pix1 += line_size;
            pix2 += line_size;
            pix3 += line_size;
        }
        return s;
    }
    
    
    static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
        MpegEncContext *c = v;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    
        for(y=0; y<h; y++){
            for(x=0; x<16; x++){
                score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
            }
            if(y+1<h){
                for(x=0; x<15; x++){
    
                    score2+= FFABS(  s1[x  ] - s1[x  +stride]
    
                                 - s1[x+1] + s1[x+1+stride])
    
                            -FFABS(  s2[x  ] - s2[x  +stride]
    
                                 - s2[x+1] + s2[x+1+stride]);
                }
            }
            s1+= stride;
            s2+= stride;
        }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
    
        if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
        else  return score1 + FFABS(score2)*8;
    
    /**
     * Squared error plus a weighted penalty on the difference in local 2x2
     * gradient activity between two 8-wide blocks.
     *
     * score1 accumulates the squared sample differences. score2 accumulates,
     * for every 2x2 neighbourhood, |gradient of s1| - |gradient of s2|; its
     * absolute value is scaled by the weight and added to score1.
     *
     * @param v      MpegEncContext pointer or NULL; when non-NULL the weight is
     *               c->avctx->nsse_weight, otherwise 8
     * @param s1     first block
     * @param s2     second block
     * @param stride distance in bytes between consecutive rows
     * @param h      number of rows
     * @return combined score
     */
    static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
        MpegEncContext *c = v;
        int score1=0;
        int score2=0;
        int x,y;

        for(y=0; y<h; y++){
            for(x=0; x<8; x++){
                score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
            }
            /* the gradient term needs the next row, so it skips the last one */
            if(y+1<h){
                for(x=0; x<7; x++){
                    score2+= FFABS(  s1[x  ] - s1[x  +stride]
                                 - s1[x+1] + s1[x+1+stride])
                            -FFABS(  s2[x  ] - s2[x  +stride]
                                 - s2[x+1] + s2[x+1+stride]);
                }
            }
            s1+= stride;
            s2+= stride;
        }

        if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
        else  return score1 + FFABS(score2)*8;
    }
    
    /**
     * Score the effect of adding a scaled basis function to a remainder block
     * without modifying the block.
     *
     * For each of the 64 coefficients the rounded, scaled basis value is added
     * to the remainder, the result is shifted down by RECON_SHIFT, multiplied by
     * its weight, squared and accumulated (each term >> 4, the total >> 2).
     *
     * @param rem    remainder coefficients (read only here)
     * @param weight per-coefficient weights
     * @param basis  basis function coefficients
     * @param scale  multiplier applied to the basis function
     * @return weighted squared-error score
     */
    static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
        const int shift    = BASIS_SHIFT - RECON_SHIFT;
        const int rounding = 1 << (shift - 1);
        unsigned int total = 0;
        int k;

        for (k = 0; k < 8*8; k++) {
            const int w = weight[k];
            int b = rem[k] + ((basis[k]*scale + rounding) >> shift);
            int weighted;

            b >>= RECON_SHIFT;
            assert(-512<b && b<512);

            weighted = w*b;
            total += weighted*weighted >> 4;
        }
        return total >> 2;
    }
    
    /**
     * Add a scaled basis function to a remainder block in place.
     *
     * Each of the 64 basis coefficients is multiplied by @p scale, rounded and
     * shifted right by (BASIS_SHIFT - RECON_SHIFT) before being added to the
     * corresponding entry of @p rem.
     *
     * @param rem   remainder coefficients, updated in place
     * @param basis basis function coefficients
     * @param scale multiplier applied to the basis function
     */
    static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
        int i;

        for(i=0; i<8*8; i++){
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
    
    /**
     * permutes an 8x8 block.
    
     * @param block the block which will be permuted according to the given permutation vector
    
     * @param permutation the permutation vector
     * @param last the last non zero coefficient in scantable order, used to speed the permutation up
    
     * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
    
     *                  (inverse) permutated to scantable order!
    
    void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
    
        DCTELEM temp[64];
    
        if(last<=0) return;
    
    Diego Biurrun's avatar
    Diego Biurrun committed
        //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
    
        for(i=0; i<=last; i++){
            const int j= scantable[i];
            temp[j]= block[j];
            block[j]=0;
        }
    
        for(i=0; i<=last; i++){
            const int j= scantable[i];
            const int perm_j= permutation[j];
            block[perm_j]= temp[j];
        }
    
    /**
     * Comparison function that ignores all of its arguments and reports a
     * score of 0.
     */
    static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
        (void)s;
        (void)a;
        (void)b;
        (void)stride;
        (void)h;

        return 0;
    }
    
    /**
     * Fill a table of six comparison functions according to a comparison type.
     *
     * The table is zeroed first; every entry i is then set to the matching
     * function from @p c, selected by the low 8 bits of @p type. For an
     * unrecognised type an error is logged and the entries stay zeroed.
     *
     * @param c    DSP context providing the candidate functions
     * @param cmp  destination table with room for 6 entries
     * @param type FF_CMP_* selector; only the low 8 bits are examined
     */
    void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
        int i;

        /* size taken from the table element, which is a function pointer */
        memset(cmp, 0, sizeof(*cmp)*6);

        for(i=0; i<6; i++){
            switch(type&0xFF){
            case FF_CMP_SAD:
                cmp[i]= c->sad[i];
                break;
            case FF_CMP_SATD:
                cmp[i]= c->hadamard8_diff[i];
                break;
            case FF_CMP_SSE:
                cmp[i]= c->sse[i];
                break;
            case FF_CMP_DCT:
                cmp[i]= c->dct_sad[i];
                break;
            case FF_CMP_DCT264:
                cmp[i]= c->dct264_sad[i];
                break;
            case FF_CMP_DCTMAX:
                cmp[i]= c->dct_max[i];
                break;
            case FF_CMP_PSNR:
                cmp[i]= c->quant_psnr[i];
                break;
            case FF_CMP_BIT:
                cmp[i]= c->bit[i];
                break;
            case FF_CMP_RD:
                cmp[i]= c->rd[i];
                break;
            case FF_CMP_VSAD:
                cmp[i]= c->vsad[i];
                break;
            case FF_CMP_VSSE:
                cmp[i]= c->vsse[i];
                break;
            case FF_CMP_ZERO:
                cmp[i]= zero_cmp;
                break;
            case FF_CMP_NSSE:
                cmp[i]= c->nsse[i];
                break;
    #if CONFIG_SNOW_ENCODER
            case FF_CMP_W53:
                cmp[i]= c->w53[i];
                break;
            case FF_CMP_W97:
                cmp[i]= c->w97[i];
                break;
    #endif
            default:
                av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
            }
        }
    }
    
    
    Loren Merritt's avatar
    Loren Merritt committed
    /**
     * Zero one block of 64 DCT coefficients.
     */
    static void clear_block_c(DCTELEM *block)
    {
        const size_t block_bytes = 64 * sizeof(DCTELEM);

        memset(block, 0, block_bytes);
    }
    
    
    /**
     * Zero six consecutive blocks of 64 DCT coefficients each, i.e.
     * memset(blocks, 0, sizeof(DCTELEM)*6*64).
     */
    static void clear_blocks_c(DCTELEM *blocks)
    {
        const size_t all_bytes = 6 * 64 * sizeof(DCTELEM);

        memset(blocks, 0, all_bytes);
    }
    
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /**
     * Add @p w bytes of @p src to @p dst byte-wise, with each byte wrapping
     * modulo 256.
     *
     * Whole long-sized words are processed first: the low 7 bits of every byte
     * are added with the top bit masked off so that no carry crosses a byte
     * boundary, then the top bits are combined with XOR. Remaining bytes are
     * added one at a time. (Presumably pb_7f / pb_80 are the 0x7f / 0x80 byte
     * patterns replicated across a long; they are defined outside this block.)
     *
     * @param dst destination, updated in place
     * @param src bytes to add
     * @param w   number of bytes
     */
    static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
        long i;
        /* the cast keeps the bound signed: w - sizeof(long) would otherwise wrap
           to a huge unsigned value for w < sizeof(long) and run past the buffers */
        for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
            long a = *(long*)(src+i);
            long b = *(long*)(dst+i);
            *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
        }
        for(; i<w; i++)
            dst[i+0] += src[i+0];
    }
    
    
    /**
     * Store the byte-wise sum (modulo 256) of @p src1 and @p src2 in @p dst.
     *
     * Whole long-sized words are processed first using masked additions that
     * keep carries inside each byte; remaining bytes are handled one at a time.
     *
     * @param dst  destination for @p w bytes
     * @param src1 first operand
     * @param src2 second operand
     * @param w    number of bytes
     */
    static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
        long i;
        /* the cast keeps the bound signed: w - sizeof(long) would otherwise wrap
           to a huge unsigned value for w < sizeof(long) and run past the buffers */
        for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
            long a = *(long*)(src1+i);
            long b = *(long*)(src2+i);
            *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
        }
        for(; i<w; i++)
            dst[i] = src1[i]+src2[i];
    }
    
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /**
     * Store the byte-wise difference (modulo 256) src1 - src2 in @p dst.
     *
     * When fast unaligned access is unavailable and @p src2 is not long-aligned,
     * groups of 8 bytes are subtracted individually. Otherwise whole long-sized
     * words are processed with a masked subtraction that keeps borrows inside
     * each byte. Remaining bytes are handled one at a time in both cases.
     *
     * @param dst  destination for @p w bytes
     * @param src1 minuend
     * @param src2 subtrahend
     * @param w    number of bytes
     */
    static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
        long i;
    #if !HAVE_FAST_UNALIGNED
        if((long)src2 & (sizeof(long)-1)){
            for(i=0; i+7<w; i+=8){
                dst[i+0] = src1[i+0]-src2[i+0];
                dst[i+1] = src1[i+1]-src2[i+1];
                dst[i+2] = src1[i+2]-src2[i+2];
                dst[i+3] = src1[i+3]-src2[i+3];
                dst[i+4] = src1[i+4]-src2[i+4];
                dst[i+5] = src1[i+5]-src2[i+5];
                dst[i+6] = src1[i+6]-src2[i+6];
                dst[i+7] = src1[i+7]-src2[i+7];
            }
        }else
    #endif
        /* the cast keeps the bound signed: w - sizeof(long) would otherwise wrap
           to a huge unsigned value for w < sizeof(long) and run past the buffers */
        for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
            long a = *(long*)(src1+i);
            long b = *(long*)(src2+i);
            *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
        }
        for(; i<w; i++)
            dst[i+0] = src1[i+0]-src2[i+0];
    }
    
    
    /**
     * Reconstruct a row from residuals using median prediction.
     *
     * For every position the predictor is mid_pred() of the previous output
     * byte, the byte from @p src1 at the same position, and
     * (previous + src1 - previous src1) masked to 8 bits. The residual from
     * @p diff is added and the result, truncated to a byte, is stored.
     *
     * @param dst      output row of @p w bytes
     * @param src1     reference row of @p w bytes
     * @param diff     residuals
     * @param w        number of bytes
     * @param left     in: starting "previous output" value; out: last output byte
     * @param left_top in: starting "previous src1" value; out: last src1 byte
     */
    static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
        uint8_t prev     = *left;
        uint8_t prev_top = *left_top;
        int x;

        for (x = 0; x < w; x++) {
            const int top = src1[x];

            prev     = mid_pred(prev, top, (prev + top - prev_top)&0xFF) + diff[x];
            prev_top = top;
            dst[x]   = prev;
        }

        *left     = prev;
        *left_top = prev_top;
    }
    
    
    /**
     * Compute residuals of a row against its median prediction.
     *
     * For every position the predictor is mid_pred() of the previous byte of
     * @p src2, the byte from @p src1 at the same position, and
     * (previous + src1 - previous src1) masked to 8 bits. The stored residual is
     * src2 minus that predictor, truncated to a byte.
     *
     * @param dst      residual output of @p w bytes
     * @param src1     reference row of @p w bytes
     * @param src2     row being predicted
     * @param w        number of bytes
     * @param left     in: starting "previous src2" value; out: last src2 byte
     * @param left_top in: starting "previous src1" value; out: last src1 byte
     */
    static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
        int i;
        uint8_t l, lt;

        l= *left;
        lt= *left_top;

        for(i=0; i<w; i++){
            const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
            lt= src1[i];
            l= src2[i];
            dst[i]= l - pred;
        }

        /* hand the running state back so the caller can continue on the next call */
        *left= l;
        *left_top= lt;
    }
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    /**
     * o1 = i1 + i2, o2 = i1 - i2.
     * The inputs are evaluated twice; pass side-effect-free expressions.
     * Expands to a single statement; terminate the invocation with ';'.
     */
    #define BUTTERFLY2(o1,o2,i1,i2) \
    do {\
        (o1)= (i1)+(i2);\
        (o2)= (i1)-(i2);\
    } while (0)

    /**
     * In-place butterfly: x = x + y, y = x - y (both using the old values).
     * Expands to a single statement; terminate the invocation with ';'.
     */
    #define BUTTERFLY1(x,y) \
    do {\
        const int bfly_x_= (x);\
        const int bfly_y_= (y);\
        (x)= bfly_x_+bfly_y_;\
        (y)= bfly_x_-bfly_y_;\
    } while (0)

    /** |x + y| + |x - y|, i.e. a final butterfly stage folded into an absolute sum. */
    #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
    
    /**
     * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
     * the difference src - dst.
     *
     * The first loop transforms each row of the difference with three butterfly
     * stages; the second loop does the same down each column, with the last
     * column stage folded into the absolute-value accumulation via BUTTERFLYA.
     *
     * @param s      unused
     * @param dst    block subtracted from @p src
     * @param src    block to compare
     * @param stride distance in bytes between consecutive rows of both blocks
     * @param h      must be 8 (asserted)
     * @return accumulated absolute transform coefficients
     */
    static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
        int i;
        int temp[64];
        int sum=0;

        assert(h==8);

        for(i=0; i<8; i++){
            //FIXME try pointer walks
            BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
            BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
            BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
            BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

            BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
            BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
            BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
            BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

            BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
            BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
            BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
            BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
        }

        for(i=0; i<8; i++){
            BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
            BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
            BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
            BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

            BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
            BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
            BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
            BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

            /* last column stage and absolute-value reduction in one step */
            sum +=
                 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
                +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
                +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
                +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
        }
        return sum;
    }
    
    
    /**
     * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
     * a block, with the mean (first) term removed.
     *
     * Rows are transformed first, then columns, with the last column stage
     * folded into the absolute-value accumulation via BUTTERFLYA. Finally
     * |temp[0] + temp[32]| is subtracted again so that the block mean does not
     * contribute to the score.
     *
     * @param s      unused
     * @param src    block to analyse
     * @param dummy  unused
     * @param stride distance in bytes between consecutive rows
     * @param h      must be 8 (asserted)
     * @return accumulated absolute transform coefficients without the mean term
     */
    static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
        int i;
        int temp[64];
        int sum=0;

        assert(h==8);

        for(i=0; i<8; i++){
            //FIXME try pointer walks
            BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
            BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
            BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
            BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

            BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
            BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
            BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
            BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

            BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
            BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
            BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
            BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
        }

        for(i=0; i<8; i++){
            BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
            BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
            BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
            BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

            BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
            BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
            BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
            BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

            /* last column stage and absolute-value reduction in one step */
            sum +=
                 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
                +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
                +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
                +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
        }

        sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

        return sum;
    }
    
    
    static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        MpegEncContext * const s= (MpegEncContext *)c;
    
        DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        DCTELEM * const temp= (DCTELEM*)aligned_temp;
    
        assert(h==8);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
        s->dsp.diff_pixels(temp, src1, src2, stride);
    
        return s->dsp.sum_abs_dctelem(temp);
    
    /**
     * One 8-point, one-dimensional transform pass built from additions,
     * subtractions and right shifts only.
     *
     * The macro is parameterised through two helper macros that must be defined
     * at the point of expansion:
     *   SRC(n)    - expression yielding input sample n (0..7)
     *   DST(n, v) - statement consuming output coefficient n with value v
     * It expands to a braced block, so it can be used directly as a loop body.
     *
     * s.. / d.. are the sums / differences of mirrored input pairs; a0..a3
     * combine the sums for the even outputs and a4..a7 combine the differences
     * for the odd outputs.
     * NOTE(review): presumably the H.264-style 8x8 integer transform, judging by
     * the dct264_sad8x8_c user - confirm.
     */
    #define DCT8_1D {\
        const int s07 = SRC(0) + SRC(7);\
        const int s16 = SRC(1) + SRC(6);\
        const int s25 = SRC(2) + SRC(5);\
        const int s34 = SRC(3) + SRC(4);\
        const int a0 = s07 + s34;\
        const int a1 = s16 + s25;\
        const int a2 = s07 - s34;\
        const int a3 = s16 - s25;\
        const int d07 = SRC(0) - SRC(7);\
        const int d16 = SRC(1) - SRC(6);\
        const int d25 = SRC(2) - SRC(5);\
        const int d34 = SRC(3) - SRC(4);\
        const int a4 = d16 + d25 + (d07 + (d07>>1));\
        const int a5 = d07 - d34 - (d25 + (d25>>1));\
        const int a6 = d07 + d34 - (d16 + (d16>>1));\
        const int a7 = d16 - d25 + (d34 + (d34>>1));\
        DST(0,  a0 + a1     ) ;\
        DST(1,  a4 + (a7>>2)) ;\
        DST(2,  a2 + (a3>>1)) ;\
        DST(3,  a5 + (a6>>2)) ;\
        DST(4,  a0 - a1     ) ;\
        DST(5,  a6 - (a5>>2)) ;\
        DST(6, (a2>>1) - a3 ) ;\
        DST(7, (a4>>2) - a7 ) ;\
    }
    
    static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
        MpegEncContext * const s= (MpegEncContext *)c;
    
    Måns Rullgård's avatar
    Måns Rullgård committed
        DCTELEM dct[8][8];
    
    Måns Rullgård's avatar
    Måns Rullgård committed
        s->dsp.diff_pixels(dct[0], src1, src2, stride);
    
    
    #define SRC(x) dct[i][x]
    #define DST(x,v) dct[i][x]= v
        for( i = 0; i < 8; i++ )
            DCT8_1D
    #undef SRC
    #undef DST
    
    #define SRC(x) dct[x][i]
    
    #define DST(x,v) sum += FFABS(v)
    
        for( i = 0; i < 8; i++ )
            DCT8_1D
    #undef SRC
    #undef DST
        return sum;
    }
    #endif
    
    
    /**
     * Largest absolute DCT coefficient of the difference between two 8x8 blocks.
     *
     * The pixel difference is computed with the context's diff_pixels() and
     * transformed with its fdct(); the maximum of |coefficient| over all 64
     * entries is returned.
     *
     * @param c      MpegEncContext providing the dsp function table
     * @param src1   first block
     * @param src2   second block
     * @param stride distance in bytes between consecutive rows
     * @param h      must be 8 (asserted)
     * @return maximum absolute coefficient (0 if all coefficients are zero)
     */
    static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
        MpegEncContext * const s= (MpegEncContext *)c;
        DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
        DCTELEM * const temp= (DCTELEM*)aligned_temp;
        int sum=0, i;

        assert(h==8);

        s->dsp.diff_pixels(temp, src1, src2, stride);
        s->dsp.fdct(temp);

        for(i=0; i<64; i++)
            sum= FFMAX(sum, FFABS(temp[i]));

        return sum;
    }
    
    static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        MpegEncContext * const s= (MpegEncContext *)c;
    
        DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        DCTELEM * const temp= (DCTELEM*)aligned_temp;
        DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        int sum=0, i;
    
    
        assert(h==8);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        s->mb_intra=0;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        s->dsp.diff_pixels(temp, src1, src2, stride);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        memcpy(bak, temp, 64*sizeof(DCTELEM));
    
        s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    
        s->dct_unquantize_inter(s, temp, 0, s->qscale);
    
        ff_simple_idct(temp); //FIXME
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        for(i=0; i<64; i++)
            sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        return sum;
    }
    
    
    static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    
        MpegEncContext * const s= (MpegEncContext *)c;
    
        const uint8_t *scantable= s->intra_scantable.permutated;
    
        DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
        DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        DCTELEM * const temp= (DCTELEM*)aligned_temp;
        uint8_t * const bak= (uint8_t*)aligned_bak;
    
        int i, last, run, bits, level, distortion, start_i;
    
        const int esc_length= s->ac_esc_length;
        uint8_t * length;
        uint8_t * last_length;
    
        for(i=0; i<8; i++){
            ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
            ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
        }
    
        s->dsp.diff_pixels(temp, src1, src2, stride);
    
        s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    
        bits=0;
    
        if (s->mb_intra) {
    
            length     = s->intra_ac_vlc_length;
            last_length= s->intra_ac_vlc_last_length;
    
            bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    
        } else {
            start_i = 0;
            length     = s->inter_ac_vlc_length;
            last_length= s->inter_ac_vlc_last_length;
        }
    
        if(last>=start_i){
    
            run=0;
            for(i=start_i; i<last; i++){
                int j= scantable[i];
                level= temp[j];
    
                if(level){
                    level+=64;
                    if((level&(~127)) == 0){
                        bits+= length[UNI_AC_ENC_INDEX(run, level)];
                    }else
                        bits+= esc_length;
                    run=0;
                }else
                    run++;
            }
            i= scantable[last];
    
            level= temp[i] + 64;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
    
            assert(level - 64);
    
            if((level&(~127)) == 0){
                bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
            }else
                bits+= esc_length;
    
            if(s->mb_intra)
                s->dct_unquantize_intra(s, temp, 0, s->qscale);
            else
                s->dct_unquantize_inter(s, temp, 0, s->qscale);
    
        s->dsp.idct_add(bak, stride, temp);
    
        distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
    
        return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
    
    static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    
        MpegEncContext * const s= (MpegEncContext *)c;
    
        const uint8_t *scantable= s->intra_scantable.permutated;
    
        DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        DCTELEM * const temp= (DCTELEM*)aligned_temp;
    
        int i, last, run, bits, level, start_i;
        const int esc_length= s->ac_esc_length;
        uint8_t * length;
        uint8_t * last_length;
    
        s->dsp.diff_pixels(temp, src1, src2, stride);
    
        s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    
        bits=0;
    
        if (s->mb_intra) {
    
            length     = s->intra_ac_vlc_length;
            last_length= s->intra_ac_vlc_last_length;
    
            bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    
        } else {
            start_i = 0;
            length     = s->inter_ac_vlc_length;
            last_length= s->inter_ac_vlc_last_length;
        }
    
        if(last>=start_i){
    
            run=0;
            for(i=start_i; i<last; i++){
                int j= scantable[i];
                level= temp[j];
    
                if(level){
                    level+=64;
                    if((level&(~127)) == 0){
                        bits+= length[UNI_AC_ENC_INDEX(run, level)];
                    }else
                        bits+= esc_length;
                    run=0;
                }else
                    run++;
            }
            i= scantable[last];
    
            level= temp[i] + 64;
    
            assert(level - 64);
    
            if((level&(~127)) == 0){
                bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
            }else
                bits+= esc_length;
        }
    
        return bits;
    }
    
    
    /*
     * VSAD_INTRA(size) expands to vsad_intra<size>_c(), which adds up the
     * absolute difference between each pixel and the pixel one stride below
     * it, over a region `size` pixels wide and h rows tall (h-1 row pairs).
     * The c and dummy arguments are not used.
     */
    #define VSAD_INTRA(size) \
    static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
        int total = 0;                                                                                          \
        int row, col;                                                                                           \
                                                                                                                \
        for (row = 1; row < h; row++, s += stride) {                                                            \
            for (col = 0; col < size; col++)                                                                    \
                total += FFABS(s[col] - s[col + stride]);                                                       \
        }                                                                                                       \
                                                                                                                \
        return total;                                                                                           \
    }
    VSAD_INTRA(8)
    VSAD_INTRA(16)
    
    
    /**
     * Sum, over a 16-pixel-wide region of h rows, of the absolute change in
     * the difference (s1 - s2) between each row and the row one stride below.
     * The c argument is not used.
     */
    static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
        int total = 0;
        int row, col;

        for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
            for (col = 0; col < 16; col++) {
                /* residual on this row and on the row below */
                const int upper = s1[col]          - s2[col];
                const int lower = s1[col + stride] - s2[col + stride];

                total += FFABS(upper - lower);
            }
        }

        return total;
    }
    
    /* Square of an expression; the argument is evaluated twice. */
    #define SQ(a) ((a)*(a))

    /*
     * VSSE_INTRA(size) expands to vsse_intra<size>_c(), which adds up the
     * squared difference between each pixel and the pixel one stride below
     * it, over a region `size` pixels wide and h rows tall (h-1 row pairs).
     * The c and dummy arguments are not used.
     */
    #define VSSE_INTRA(size) \
    static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
        int total = 0;                                                                                          \
        int row, col;                                                                                           \
                                                                                                                \
        for (row = 1; row < h; row++, s += stride) {                                                            \
            for (col = 0; col < size; col++) {                                                                  \
                const int delta = s[col] - s[col + stride];                                                     \
                total += SQ(delta);                                                                             \
            }                                                                                                   \
        }                                                                                                       \
                                                                                                                \
        return total;                                                                                           \
    }
    VSSE_INTRA(8)
    VSSE_INTRA(16)
    
    
    static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
        int score=0;
        int x,y;
    
        for(y=1; y<h; y++){
            for(x=0; x<16; x++){
                score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
            }
            s1+= stride;