Skip to content
Snippets Groups Projects
h264.c 113 KiB
Newer Older
  • Learn to ignore specific revisions
  • /*
     * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
     * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
     *
    
     * This file is part of FFmpeg.
     *
     * FFmpeg is free software; you can redistribute it and/or
    
     * modify it under the terms of the GNU Lesser General Public
     * License as published by the Free Software Foundation; either
    
     * version 2.1 of the License, or (at your option) any later version.
    
     * FFmpeg is distributed in the hope that it will be useful,
    
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     * Lesser General Public License for more details.
     *
     * You should have received a copy of the GNU Lesser General Public
    
     * License along with FFmpeg; if not, write to the Free Software
    
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/h264.c
 * H.264 / AVC / MPEG4 part10 codec.
 * @author Michael Niedermayer <michaelni@gmx.at>
 */
    
    
    #include "dsputil.h"
    #include "avcodec.h"
    #include "mpegvideo.h"
    
    #include "h264.h"
    
    #include "h264data.h"
    
    #include "h264_mvpred.h"
    
    #include "h264_parser.h"
    
    #include "golomb.h"
    
    #include "rectangle.h"
    
    #include "x86/h264_i386.h"
    
    #include <assert.h>
    
    
    static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
    static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
    
    
/* qp % 6 for every legal H.264 QP (0..51); table lookup avoids a runtime modulo. */
static const uint8_t rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};

/* qp / 6 for every legal H.264 QP (0..51); used as the dequant shift amount. */
static const uint8_t div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
    
    
    void ff_h264_write_back_intra_pred_mode(H264Context *h){
    
    
        h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
        h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
        h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
        h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
        h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
        h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
        h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
    }
    
    
    /**
     * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
     */
    int ff_h264_check_intra4x4_pred_mode(H264Context *h){
        MpegEncContext * const s = &h->s;
        static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
        static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
        int i;
    
        if(!(h->top_samples_available&0x8000)){
            for(i=0; i<4; i++){
                int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
                if(status<0){
                    av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
                    return -1;
                } else if(status){
                    h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
                }
            }
        }
    
        if((h->left_samples_available&0x8888)!=0x8888){
            static const int mask[4]={0x8000,0x2000,0x80,0x20};
            for(i=0; i<4; i++){
                if(!(h->left_samples_available&mask[i])){
                    int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
                    if(status<0){
                        av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
                        return -1;
                    } else if(status){
                        h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
                    }
                }
            }
        }
    
        return 0;
    } //FIXME cleanup like ff_h264_check_intra_pred_mode
    
    
    /**
     * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
     */
    
    int ff_h264_check_intra_pred_mode(H264Context *h, int mode){
    
        MpegEncContext * const s = &h->s;
        static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
        static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        if(mode > 6U) {
    
    Loic Le Loarer's avatar
    Loic Le Loarer committed
            av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
            return -1;
    
        if(!(h->top_samples_available&0x8000)){
            mode= top[ mode ];
            if(mode<0){
    
                av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
    
                return -1;
            }
        }
    
        if((h->left_samples_available&0x8080) != 0x8080){
    
            mode= left[ mode ];
    
            if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
                mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
            }
    
            if(mode<0){
    
                av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
    
                return -1;
    
    const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
    
        int i, si, di;
        uint8_t *dst;
    
    //    src[0]&0x80;                //forbidden bit
    
        h->nal_ref_idc= src[0]>>5;
        h->nal_unit_type= src[0]&0x1F;
    
        src++; length--;
    
        for(i=0; i<length; i++)
            printf("%2X ", src[i]);
    #endif
    
    #if HAVE_FAST_UNALIGNED
    # if HAVE_FAST_64BIT
    
    #   define RS 7
        for(i=0; i+1<length; i+=9){
    
    Ivan Schreter's avatar
    Ivan Schreter committed
            if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
    
    # else
    #   define RS 3
        for(i=0; i+1<length; i+=5){
    
    Ivan Schreter's avatar
    Ivan Schreter committed
            if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
    
    # endif
                continue;
            if(i>0 && !src[i]) i--;
            while(src[i]) i++;
    #else
    #   define RS 0
    
        for(i=0; i+1<length; i+=2){
            if(src[i]) continue;
            if(i>0 && src[i-1]==0) i--;
    
            if(i+2<length && src[i+1]==0 && src[i+2]<=3){
                if(src[i+2]!=3){
                    /* startcode, so we must be past the end */
                    length=i;
                }
                break;
            }
    
        }
    
        if(i>=length-1){ //no escaped 0
            *dst_length= length;
            *consumed= length+1; //+1 for the header
    
        bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
    
        av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
    
        dst= h->rbsp_buffer[bufidx];
    
        memcpy(dst, src, i);
        si=di=i;
        while(si+2<length){
    
            //remove escapes (very rare 1:2^22)
    
            if(src[si+2]>3){
                dst[di++]= src[si++];
                dst[di++]= src[si++];
            }else if(src[si]==0 && src[si+1]==0){
    
                if(src[si+2]==3){ //escape
                    dst[di++]= 0;
                    dst[di++]= 0;
                    si+=3;
    
                }else //next start code
    
                    goto nsc;
    
            }
    
            dst[di++]= src[si++];
        }
    
        while(si<length)
            dst[di++]= src[si++];
    nsc:
    
        memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
    
    
        *dst_length= di;
        *consumed= si + 1;//+1 for the header
    
//FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
    
    int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
    
        int v= *src;
        int r;
    
    
        tprintf(h->s.avctx, "rbsp trailing %X\n", v);
    
    
        for(r=1; r<9; r++){
            if(v&1) return r;
            v>>=1;
        }
        return 0;
    }
    
/**
 * IDCT transforms the 16 dc values and dequantizes them.
 * @param qp quantization parameter
 */
    
    static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
    
    #define stride 16
        int i;
        int temp[16]; //FIXME check if this is a good idea
        static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
        static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    
    //memset(block, 64, 2*256);
    //return;
        for(i=0; i<4; i++){
            const int offset= y_offset[i];
            const int z0= block[offset+stride*0] + block[offset+stride*4];
            const int z1= block[offset+stride*0] - block[offset+stride*4];
            const int z2= block[offset+stride*1] - block[offset+stride*5];
            const int z3= block[offset+stride*1] + block[offset+stride*5];
    
            temp[4*i+0]= z0+z3;
            temp[4*i+1]= z1+z2;
            temp[4*i+2]= z1-z2;
            temp[4*i+3]= z0-z3;
        }
    
        for(i=0; i<4; i++){
            const int offset= x_offset[i];
            const int z0= temp[4*0+i] + temp[4*2+i];
            const int z1= temp[4*0+i] - temp[4*2+i];
            const int z2= temp[4*1+i] - temp[4*3+i];
            const int z3= temp[4*1+i] + temp[4*3+i];
    
    
            block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
    
            block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
            block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
            block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
    
/**
 * DCT transforms the 16 dc values.
 * @param qp quantization parameter ??? FIXME
 */
    static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
    //    const int qmul= dequant_coeff[qp][0];
        int i;
        int temp[16]; //FIXME check if this is a good idea
        static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
        static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    
        for(i=0; i<4; i++){
            const int offset= y_offset[i];
            const int z0= block[offset+stride*0] + block[offset+stride*4];
            const int z1= block[offset+stride*0] - block[offset+stride*4];
            const int z2= block[offset+stride*1] - block[offset+stride*5];
            const int z3= block[offset+stride*1] + block[offset+stride*5];
    
            temp[4*i+0]= z0+z3;
            temp[4*i+1]= z1+z2;
            temp[4*i+2]= z1-z2;
            temp[4*i+3]= z0-z3;
        }
    
        for(i=0; i<4; i++){
            const int offset= x_offset[i];
            const int z0= temp[4*0+i] + temp[4*2+i];
            const int z1= temp[4*0+i] - temp[4*2+i];
            const int z2= temp[4*1+i] - temp[4*3+i];
            const int z3= temp[4*1+i] + temp[4*3+i];
    
            block[stride*0 +offset]= (z0 + z3)>>1;
            block[stride*2 +offset]= (z1 + z2)>>1;
            block[stride*8 +offset]= (z1 - z2)>>1;
            block[stride*10+offset]= (z0 - z3)>>1;
        }
    }
    
    #undef xStride
    #undef stride
    
    
    static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
    
        const int stride= 16*2;
        const int xStride= 16;
        int a,b,c,d,e;
    
        a= block[stride*0 + xStride*0];
        b= block[stride*0 + xStride*1];
        c= block[stride*1 + xStride*0];
        d= block[stride*1 + xStride*1];
    
        e= a-b;
        a= a+b;
        b= c-d;
        c= c+d;
    
    
        block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
        block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
        block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
        block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
    
    static void chroma_dc_dct_c(DCTELEM *block){
        const int stride= 16*2;
        const int xStride= 16;
        int a,b,c,d,e;
    
        a= block[stride*0 + xStride*0];
        b= block[stride*0 + xStride*1];
        c= block[stride*1 + xStride*0];
        d= block[stride*1 + xStride*1];
    
        e= a-b;
        a= a+b;
        b= c-d;
        c= c+d;
    
        block[stride*0 + xStride*0]= (a+c);
        block[stride*0 + xStride*1]= (e+b);
        block[stride*1 + xStride*0]= (a-c);
        block[stride*1 + xStride*1]= (e-b);
    }
    
    
    static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                               uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                               int src_x_offset, int src_y_offset,
                               qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
        MpegEncContext * const s = &h->s;
        const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    
        int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    
        const int luma_xy= (mx&3) + ((my&3)<<2);
    
        uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
        uint8_t * src_cb, * src_cr;
        int extra_width= h->emu_edge_width;
        int extra_height= h->emu_edge_height;
    
        int emu=0;
        const int full_mx= mx>>2;
        const int full_my= my>>2;
    
        const int pic_width  = 16*s->mb_width;
    
        const int pic_height = 16*s->mb_height >> MB_FIELD;
    
        if(mx&7) extra_width -= 3;
        if(my&7) extra_height -= 3;
    
    
        if(   full_mx < 0-extra_width
           || full_my < 0-extra_height
           || full_mx + 16/*FIXME*/ > pic_width + extra_width
    
           || full_my + 16/*FIXME*/ > pic_height + extra_height){
    
            ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
                src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
    
        qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    
        if(!square){
    
            qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    
        if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
    
        if(MB_FIELD){
    
            // chroma offset when predicting from a field of opposite parity
    
            my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
    
            emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
        }
        src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
        src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    
    
        if(emu){
    
            ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
    
                src_cb= s->edge_emu_buffer;
        }
    
        chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
    
            ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
    
                src_cr= s->edge_emu_buffer;
        }
    
        chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
    
    static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
    
                               uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                               int x_offset, int y_offset,
                               qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                               qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
                               int list0, int list1){
        MpegEncContext * const s = &h->s;
        qpel_mc_func *qpix_op=  qpix_put;
        h264_chroma_mc_func chroma_op= chroma_put;
    
        dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
        dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
        dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    
        x_offset += 8*s->mb_x;
    
        y_offset += 8*(s->mb_y >> MB_FIELD);
    
        if(list0){
    
            Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
    
            mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                               dest_y, dest_cb, dest_cr, x_offset, y_offset,
                               qpix_op, chroma_op);
    
            qpix_op=  qpix_avg;
            chroma_op= chroma_avg;
        }
    
        if(list1){
    
            Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
    
            mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                               dest_y, dest_cb, dest_cr, x_offset, y_offset,
                               qpix_op, chroma_op);
        }
    }
    
    
    static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                               uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                               int x_offset, int y_offset,
                               qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                               h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
                               h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                               int list0, int list1){
        MpegEncContext * const s = &h->s;
    
    
        dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
        dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
        dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
    
        x_offset += 8*s->mb_x;
    
        y_offset += 8*(s->mb_y >> MB_FIELD);
    
        if(list0 && list1){
            /* don't optimize for luma-only case, since B-frames usually
             * use implicit weights => chroma too. */
            uint8_t *tmp_cb = s->obmc_scratchpad;
    
            uint8_t *tmp_cr = s->obmc_scratchpad + 8;
            uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
    
            int refn0 = h->ref_cache[0][ scan8[n] ];
            int refn1 = h->ref_cache[1][ scan8[n] ];
    
            mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                        dest_y, dest_cb, dest_cr,
                        x_offset, y_offset, qpix_put, chroma_put);
            mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                        tmp_y, tmp_cb, tmp_cr,
                        x_offset, y_offset, qpix_put, chroma_put);
    
            if(h->use_weight == 2){
                int weight0 = h->implicit_weight[refn0][refn1];
                int weight1 = 64 - weight0;
    
                luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
                chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
                chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
    
            }else{
    
                luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
    
                                h->luma_weight[0][refn0], h->luma_weight[1][refn1],
    
    Loren Merritt's avatar
    Loren Merritt committed
                                h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
    
                chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
    
                                h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
    
    Loren Merritt's avatar
    Loren Merritt committed
                                h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
    
                chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
    
                                h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
    
    Loren Merritt's avatar
    Loren Merritt committed
                                h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
    
            }
        }else{
            int list = list1 ? 1 : 0;
            int refn = h->ref_cache[list][ scan8[n] ];
            Picture *ref= &h->ref_list[list][refn];
            mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                        dest_y, dest_cb, dest_cr, x_offset, y_offset,
                        qpix_put, chroma_put);
    
    
            luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
    
                           h->luma_weight[list][refn], h->luma_offset[list][refn]);
            if(h->use_weight_chroma){
    
                chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
    
                                 h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
    
                chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
    
                                 h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
            }
        }
    }
    
    static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
                               uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                               int x_offset, int y_offset,
                               qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
                               qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
    
                               h264_weight_func *weight_op, h264_biweight_func *weight_avg,
    
                               int list0, int list1){
        if((h->use_weight==2 && list0 && list1
            && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
           || h->use_weight==1)
            mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                             x_offset, y_offset, qpix_put, chroma_put,
                             weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
        else
            mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                        x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
    }
    
    
    static inline void prefetch_motion(H264Context *h, int list){
        /* fetch pixels for estimated mv 4 macroblocks ahead
         * optimized for 64byte cache lines */
        MpegEncContext * const s = &h->s;
        const int refn = h->ref_cache[list][scan8[0]];
        if(refn >= 0){
            const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
            const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
            uint8_t **src= h->ref_list[list][refn].data;
    
            int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
    
            s->dsp.prefetch(src[0]+off, s->linesize, 4);
            off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
            s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
        }
    }
    
    
    static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                          qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
    
                          qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                          h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    
        MpegEncContext * const s = &h->s;
    
        const int mb_type= s->current_picture.mb_type[mb_xy];
    
        assert(IS_INTER(mb_type));
    
        if(IS_16X16(mb_type)){
            mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                    qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
    
                    &weight_op[0], &weight_avg[0],
    
                    IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        }else if(IS_16X8(mb_type)){
            mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                    qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
    
                    &weight_op[1], &weight_avg[1],
    
                    IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
            mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                    qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
    
                    &weight_op[1], &weight_avg[1],
    
                    IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
        }else if(IS_8X16(mb_type)){
    
            mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
    
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
    
                    &weight_op[2], &weight_avg[2],
    
                    IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    
            mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
    
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
    
                    &weight_op[2], &weight_avg[2],
    
                    IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
        }else{
            int i;
    
            assert(IS_8X8(mb_type));
    
            for(i=0; i<4; i++){
                const int sub_mb_type= h->sub_mb_type[i];
                const int n= 4*i;
                int x_offset= (i&1)<<2;
                int y_offset= (i&2)<<1;
    
                if(IS_SUB_8X8(sub_mb_type)){
                    mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                        qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
    
                        &weight_op[3], &weight_avg[3],
    
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }else if(IS_SUB_8X4(sub_mb_type)){
                    mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
    
                        &weight_op[4], &weight_avg[4],
    
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                    mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
    
                        &weight_op[4], &weight_avg[4],
    
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }else if(IS_SUB_4X8(sub_mb_type)){
    
                    mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
    
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
    
                        &weight_op[5], &weight_avg[5],
    
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
    
                    mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
    
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
    
                        &weight_op[5], &weight_avg[5],
    
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }else{
                    int j;
                    assert(IS_SUB_4X4(sub_mb_type));
                    for(j=0; j<4; j++){
                        int sub_x_offset= x_offset + 2*(j&1);
                        int sub_y_offset= y_offset +   (j&2);
                        mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                            qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
    
                            &weight_op[6], &weight_avg[6],
    
                            IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                    }
                }
            }
        }
    
    }
    
    
    static void free_tables(H264Context *h){
    
        H264Context *hx;
    
        av_freep(&h->intra4x4_pred_mode);
    
        av_freep(&h->chroma_pred_mode_table);
        av_freep(&h->cbp_table);
    
        av_freep(&h->mvd_table[0]);
        av_freep(&h->mvd_table[1]);
    
        av_freep(&h->direct_table);
    
        av_freep(&h->non_zero_count);
        av_freep(&h->slice_table_base);
        h->slice_table= NULL;
    
        av_freep(&h->mb2b_xy);
        av_freep(&h->mb2b8_xy);
    
            hx = h->thread_context[i];
            if(!hx) continue;
            av_freep(&hx->top_borders[1]);
            av_freep(&hx->top_borders[0]);
            av_freep(&hx->s.obmc_scratchpad);
    
            av_freep(&hx->rbsp_buffer[1]);
            av_freep(&hx->rbsp_buffer[0]);
    
            hx->rbsp_buffer_size[0] = 0;
            hx->rbsp_buffer_size[1] = 0;
    
            if (i) av_freep(&h->thread_context[i]);
    
    static void init_dequant8_coeff_table(H264Context *h){
        int i,q,x;
    
    Loren Merritt's avatar
    Loren Merritt committed
        const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
    
        h->dequant8_coeff[0] = h->dequant8_buffer[0];
        h->dequant8_coeff[1] = h->dequant8_buffer[1];
    
        for(i=0; i<2; i++ ){
            if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
                h->dequant8_coeff[1] = h->dequant8_buffer[0];
                break;
            }
    
            for(q=0; q<52; q++){
    
                int shift = div6[q];
                int idx = rem6[q];
    
                for(x=0; x<64; x++)
    
    Loren Merritt's avatar
    Loren Merritt committed
                    h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
                        ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
                        h->pps.scaling_matrix8[i][x]) << shift;
    
            }
        }
    }
    
    static void init_dequant4_coeff_table(H264Context *h){
        int i,j,q,x;
    
        const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
    
        for(i=0; i<6; i++ ){
            h->dequant4_coeff[i] = h->dequant4_buffer[i];
            for(j=0; j<i; j++){
                if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
                    h->dequant4_coeff[i] = h->dequant4_buffer[j];
                    break;
                }
            }
            if(j<i)
                continue;
    
            for(q=0; q<52; q++){
    
                int shift = div6[q] + 2;
                int idx = rem6[q];
    
                for(x=0; x<16; x++)
    
                    h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
                        ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
    
                        h->pps.scaling_matrix4[i][x]) << shift;
            }
        }
    }
    
    static void init_dequant_tables(H264Context *h){
        int i,x;
        init_dequant4_coeff_table(h);
        if(h->pps.transform_8x8_mode)
            init_dequant8_coeff_table(h);
        if(h->sps.transform_bypass){
            for(i=0; i<6; i++)
                for(x=0; x<16; x++)
                    h->dequant4_coeff[i][0][x] = 1<<6;
            if(h->pps.transform_8x8_mode)
                for(i=0; i<2; i++)
                    for(x=0; x<64; x++)
                        h->dequant8_coeff[i][0][x] = 1<<6;
        }
    }
    
    
    
    int ff_h264_alloc_tables(H264Context *h){
    
        MpegEncContext * const s = &h->s;
    
        const int big_mb_num= s->mb_stride * (s->mb_height+1);
    
        int x,y;
    
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t), fail)
    
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t), fail)
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
    
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
    
        memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
    
        h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
    
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b_xy  , big_mb_num * sizeof(uint32_t), fail);
        FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mb2b8_xy , big_mb_num * sizeof(uint32_t), fail);
    
        for(y=0; y<s->mb_height; y++){
            for(x=0; x<s->mb_width; x++){
    
                const int b_xy = 4*x + 4*y*h->b_stride;
                const int b8_xy= 2*x + 2*y*h->b8_stride;
    
                h->mb2b_xy [mb_xy]= b_xy;
                h->mb2b8_xy[mb_xy]= b8_xy;
            }
        }
    
        s->obmc_scratchpad = NULL;
    
    
        if(!h->dequant4_coeff[0])
            init_dequant_tables(h);
    
    
        return 0;
    fail:
        free_tables(h);
        return -1;
    }
    
    
    /**
     * Mimic alloc_tables(), but for every context thread.
     */
    static void clone_tables(H264Context *dst, H264Context *src){
        dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
        dst->non_zero_count           = src->non_zero_count;
        dst->slice_table              = src->slice_table;
        dst->cbp_table                = src->cbp_table;
        dst->mb2b_xy                  = src->mb2b_xy;
        dst->mb2b8_xy                 = src->mb2b8_xy;
        dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
        dst->mvd_table[0]             = src->mvd_table[0];
        dst->mvd_table[1]             = src->mvd_table[1];
        dst->direct_table             = src->direct_table;
    
        dst->s.obmc_scratchpad = NULL;
        ff_h264_pred_init(&dst->hpc, src->s.codec_id);
    }
    
/**
 * Init context
 * Allocate buffers which are not shared amongst multiple threads.
 * @return 0 on success, -1 on allocation failure.
 */
static int context_init(H264Context *h){

    // Two top-border rows per thread: 16 luma + 8+8 chroma bytes per
    // macroblock column (two rows presumably for MBAFF pairs — see
    // backup_mb_border(), which writes both top_borders[0] and [1]).
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t), fail)


    return 0;
fail:
    return -1; // free_tables will clean up for us
}
    
    
    static av_cold void common_init(H264Context *h){
    
        MpegEncContext * const s = &h->s;
    
        s->width = s->avctx->width;
        s->height = s->avctx->height;
        s->codec_id= s->avctx->codec->id;
    
        ff_h264_pred_init(&h->hpc, s->codec_id);
    
        h->dequant_coeff_pps= -1;
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        s->unrestricted_mv=1;
    
        s->decode=1; //FIXME
    
        dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
    
    
        memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
        memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
    
    av_cold int ff_h264_decode_init(AVCodecContext *avctx){
    
        H264Context *h= avctx->priv_data;
        MpegEncContext * const s = &h->s;
    
    
    Michael Niedermayer's avatar
    Michael Niedermayer committed
        MPV_decode_defaults(s);
    
        s->avctx = avctx;
        common_init(h);
    
        s->out_format = FMT_H264;
        s->workaround_bugs= avctx->workaround_bugs;
    
        // set defaults
    //    s->decode_mb= ff_h263_decode_mb;
    
        s->low_delay= 1;
    
        avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
    
        ff_h264_decode_init_vlc();
    
        if(avctx->extradata_size > 0 && avctx->extradata &&
           *(char *)avctx->extradata == 1){
    
            h->is_avc = 1;
            h->got_avcC = 0;
    
        h->thread_context[0] = h;
    
        h->outputed_poc = INT_MIN;
    
        ff_h264_reset_sei(h);
    
        if(avctx->codec_id == CODEC_ID_H264){
            if(avctx->ticks_per_frame == 1){
                s->avctx->time_base.den *=2;
            }
    
            avctx->ticks_per_frame = 2;
    
    int ff_h264_frame_start(H264Context *h){
    
        MpegEncContext * const s = &h->s;
        int i;
    
    
        if(MPV_frame_start(s, s->avctx) < 0)
            return -1;
    
        ff_er_frame_start(s);
    
        /*
         * MPV_frame_start uses pict_type to derive key_frame.
         * This is incorrect for H.264; IDR markings must be used.
    
         * Zero here; IDR markings per slice in frame or fields are ORed in later.
    
         * See decode_nal_units().
         */
        s->current_picture_ptr->key_frame= 0;
    
        s->current_picture_ptr->mmco_reset= 0;
    
    
        assert(s->linesize && s->uvlinesize);
    
        for(i=0; i<16; i++){
            h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
    
            h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
    
        }
        for(i=0; i<4; i++){
            h->block_offset[16+i]=
            h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
    
            h->block_offset[24+16+i]=
            h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
    
        /* can't be in alloc_tables because linesize isn't known there.
         * FIXME: redo bipred weight to not require extra buffer? */
    
        for(i = 0; i < s->avctx->thread_count; i++)
            if(!h->thread_context[i]->s.obmc_scratchpad)
                h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
    
    
        /* some macroblocks will be accessed before they're available */
    
        if(FRAME_MBAFF || s->avctx->thread_count > 1)
    
            memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
    
    //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
    
        // We mark the current picture as non-reference after allocating it, so
    
        // that if we break out due to an error it can be released automatically
        // in the next MPV_frame_start().
        // SVQ3 as well as most other codecs have only last/next/current and thus
        // get released even with set reference, besides SVQ3 and others do not
        // mark frames as reference later "naturally".
        if(s->codec_id != CODEC_ID_SVQ3)
            s->current_picture_ptr->reference= 0;
    
    
        s->current_picture_ptr->field_poc[0]=
        s->current_picture_ptr->field_poc[1]= INT_MAX;
    
        assert(s->current_picture_ptr->long_ref==0);
    
    static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
    
        int step    = 1;
        int offset  = 1;
        int uvoffset= 1;
        int top_idx = 1;
        int skiplast= 0;
    
        if(!simple && FRAME_MBAFF){
            if(s->mb_y&1){
                offset  = MB_MBAFF ? 1 : 17;
                uvoffset= MB_MBAFF ? 1 : 9;
                if(!MB_MBAFF){
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
    
                    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
    
                        *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
                        *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
                    }
                }
            }else{
                if(!MB_MBAFF){
                    h->left_border[0]= h->top_borders[0][s->mb_x][15];
    
                    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
    
                        h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
                        h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
                    }
                    skiplast= 1;
                }
                offset  =
                uvoffset=
                top_idx = MB_MBAFF ? 0 : 1;
            }
            step= MB_MBAFF ? 2 : 1;
        }
    
    
        // There are two lines saved, the line above the the top macroblock of a pair,
    
        // and the line above the bottom macroblock
    
        h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
        for(i=1; i<17 - skiplast; i++){
            h->left_border[offset+i*step]= src_y[15+i*  linesize];
    
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
    
        if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
    
            h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
            h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
            for(i=1; i<9 - skiplast; i++){