vc1.c

/*
 * VC-1 and WMV3 decoder
 * Copyright (c) 2006 Konstantin Shishkov
 * Partly based on vc9.c (c) 2005 Anonymous, Alex Beregszaszi, Michael Niedermayer
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/**
 * @file vc1.c
 * VC-1 and WMV3 decoder
 *
 */
#include "common.h"
#include "dsputil.h"
#include "avcodec.h"
#include "mpegvideo.h"
#include "vc1data.h"
#include "vc1acdata.h"

#undef NDEBUG
#include <assert.h>

extern const uint32_t ff_table0_dc_lum[120][2], ff_table1_dc_lum[120][2];
extern const uint32_t ff_table0_dc_chroma[120][2], ff_table1_dc_chroma[120][2];
extern VLC ff_msmp4_dc_luma_vlc[2], ff_msmp4_dc_chroma_vlc[2];
#define MB_INTRA_VLC_BITS 9
extern VLC ff_msmp4_mb_i_vlc;
extern const uint16_t ff_msmp4_mb_i_table[64][2];
#define DC_VLC_BITS 9
#define AC_VLC_BITS 9
static const uint16_t table_mb_intra[64][2];


/** Available Profiles */
//@{
enum Profile {
    PROFILE_SIMPLE,
    PROFILE_MAIN,
    PROFILE_COMPLEX, ///< TODO: WMV9 specific
    PROFILE_ADVANCED
};
//@}

/** Sequence quantizer mode */
//@{
enum QuantMode {
    QUANT_FRAME_IMPLICIT,    ///< Implicitly specified at frame level
    QUANT_FRAME_EXPLICIT,    ///< Explicitly specified at frame level
    QUANT_NON_UNIFORM,       ///< Non-uniform quant used for all frames
    QUANT_UNIFORM            ///< Uniform quant used for all frames
};
//@}

/** Where quant can be changed */
//@{
enum DQProfile {
    DQPROFILE_FOUR_EDGES,
    DQPROFILE_DOUBLE_EDGES,
    DQPROFILE_SINGLE_EDGE,
    DQPROFILE_ALL_MBS
};
//@}

/** @name Where quant can be changed
 */
//@{
enum DQSingleEdge {
    DQSINGLE_BEDGE_LEFT,
    DQSINGLE_BEDGE_TOP,
    DQSINGLE_BEDGE_RIGHT,
    DQSINGLE_BEDGE_BOTTOM
};
//@}

/** Which pair of edges is quantized with ALTPQUANT */
//@{
enum DQDoubleEdge {
    DQDOUBLE_BEDGE_TOPLEFT,
    DQDOUBLE_BEDGE_TOPRIGHT,
    DQDOUBLE_BEDGE_BOTTOMRIGHT,
    DQDOUBLE_BEDGE_BOTTOMLEFT
};
//@}

/** MV modes for P frames */
//@{
enum MVModes {
    MV_PMODE_1MV_HPEL_BILIN,
    MV_PMODE_1MV,
    MV_PMODE_1MV_HPEL,
    MV_PMODE_MIXED_MV,
    MV_PMODE_INTENSITY_COMP
};
//@}

/** @name MV types for B frames */
//@{
enum BMVTypes {
    BMV_TYPE_BACKWARD,
    BMV_TYPE_FORWARD,
    BMV_TYPE_INTERPOLATED = 3 //XXX: ??
};
//@}

/** @name Block types for P/B frames */
//@{
enum TransformTypes {
    TT_8X8,
    TT_8X4_BOTTOM,
    TT_8X4_TOP,
    TT_8X4, //Both halves
    TT_4X8_RIGHT,
    TT_4X8_LEFT,
    TT_4X8, //Both halves
    TT_4X4
};
//@}

/** Table for conversion between TTBLK and TTMB */
static const int ttblk_to_tt[3][8] = {
  { TT_8X4, TT_4X8, TT_8X8, TT_4X4, TT_8X4_TOP, TT_8X4_BOTTOM, TT_4X8_RIGHT, TT_4X8_LEFT },
  { TT_8X8, TT_4X8_RIGHT, TT_4X8_LEFT, TT_4X4, TT_8X4, TT_4X8, TT_8X4_BOTTOM, TT_8X4_TOP },
  { TT_8X8, TT_4X8, TT_4X4, TT_8X4_BOTTOM, TT_4X8_RIGHT, TT_4X8_LEFT, TT_8X4, TT_8X4_TOP }
};

/** MV P mode - the 5th element is only used for mode 1 */
static const uint8_t mv_pmode_table[2][5] = {
  { MV_PMODE_1MV_HPEL_BILIN, MV_PMODE_1MV, MV_PMODE_1MV_HPEL, MV_PMODE_INTENSITY_COMP, MV_PMODE_MIXED_MV },
  { MV_PMODE_1MV, MV_PMODE_MIXED_MV, MV_PMODE_1MV_HPEL, MV_PMODE_INTENSITY_COMP, MV_PMODE_1MV_HPEL_BILIN }
};

/** One more frame type */
#define BI_TYPE 7

static const int fps_nr[5] = { 24, 25, 30, 50, 60 },
  fps_dr[2] = { 1000, 1001 };
static const uint8_t pquant_table[3][32] = {
  {  /* Implicit quantizer */
     0,  1,  2,  3,  4,  5,  6,  7,  8,  6,  7,  8,  9, 10, 11, 12,
    13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 31
  },
  {  /* Explicit quantizer, pquantizer uniform */
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  },
  {  /* Explicit quantizer, pquantizer non-uniform */
     0,  1,  1,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
    14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 31
  }
};

/** @name VC-1 VLC tables and defines
 *  @todo TODO move this into the context
 */
//@{
#define VC1_BFRACTION_VLC_BITS 7
static VLC vc1_bfraction_vlc;
#define VC1_IMODE_VLC_BITS 4
static VLC vc1_imode_vlc;
#define VC1_NORM2_VLC_BITS 3
static VLC vc1_norm2_vlc;
#define VC1_NORM6_VLC_BITS 9
static VLC vc1_norm6_vlc;
/* Could be optimized, one table only needs 8 bits */
#define VC1_TTMB_VLC_BITS 9 //12
static VLC vc1_ttmb_vlc[3];
#define VC1_MV_DIFF_VLC_BITS 9 //15
static VLC vc1_mv_diff_vlc[4];
#define VC1_CBPCY_P_VLC_BITS 9 //14
static VLC vc1_cbpcy_p_vlc[4];
#define VC1_4MV_BLOCK_PATTERN_VLC_BITS 6
static VLC vc1_4mv_block_pattern_vlc[4];
#define VC1_TTBLK_VLC_BITS 5
static VLC vc1_ttblk_vlc[3];
#define VC1_SUBBLKPAT_VLC_BITS 6
static VLC vc1_subblkpat_vlc[3];

static VLC vc1_ac_coeff_table[8];
//@}

enum CodingSet {
    CS_HIGH_MOT_INTRA = 0,
    CS_HIGH_MOT_INTER,
    CS_LOW_MOT_INTRA,
    CS_LOW_MOT_INTER,
    CS_MID_RATE_INTRA,
    CS_MID_RATE_INTER,
    CS_HIGH_RATE_INTRA,
    CS_HIGH_RATE_INTER
};

/** Bitplane struct
 * We mainly need data and is_raw, so this struct could be avoided
 * to save a level of indirection; feel free to modify
 * @fixme For now, stride=width
 * @warning Data are bits, either 1 or 0
 */
typedef struct BitPlane {
    uint8_t *data;      ///< Data buffer
    int width;          ///< Width of the buffer
    int stride;         ///< Stride of the buffer
    int height;         ///< Plane height
    uint8_t is_raw;     ///< Bit values must be read at MB level
} BitPlane;


/** Block data for DC/AC prediction
*/
typedef struct Block {
    uint16_t dc;
    int16_t hor_ac[7];
    int16_t vert_ac[7];
    int16_t dcstep, step;
} Block;

/** The VC1 Context
 * @fixme Change size wherever another size is more efficient
 * Many members are only used for Advanced Profile
 */
typedef struct VC1Context{
    MpegEncContext s;

    int bits;

    /** Simple/Main Profile sequence header */
    //@{
    int res_sm;           ///< reserved, 2b
    int res_x8;           ///< reserved
    int multires;         ///< frame-level RESPIC syntax element present
    int res_fasttx;       ///< reserved, always 1
    int res_transtab;     ///< reserved, always 0
    int rangered;         ///< RANGEREDFRM (range reduction) syntax element present
                          ///< at frame level
    int res_rtm_flag;     ///< reserved, set to 1
    int reserved;         ///< reserved
    //@}

    /** Advanced Profile */
    //@{
    int level;            ///< 3bits, for Advanced/Simple Profile, provided by TS layer
    int chromaformat;     ///< 2bits, 2=4:2:0, only defined
    int postprocflag;     ///< Per-frame processing suggestion flag present
    int broadcast;        ///< TFF/RFF present
    int interlace;        ///< Progressive/interlaced (RPTFTM syntax element)
    int tfcntrflag;       ///< TFCNTR present
    int panscanflag;      ///< NUMPANSCANWIN, TOPLEFT{X,Y}, BOTRIGHT{X,Y} present
    int extended_dmv;     ///< Additional extended dmv range at P/B frame-level
    int color_prim;       ///< 8bits, chroma coordinates of the color primaries
    int transfer_char;    ///< 8bits, Opto-electronic transfer characteristics
    int matrix_coef;      ///< 8bits, Color primaries->YCbCr transform matrix
    int hrd_param_flag;   ///< Presence of Hypothetical Reference
                          ///< Decoder parameters
    //@}

    /** Sequence header data for all Profiles
     * TODO: choose between ints, uint8_ts and monobit flags
     */
    //@{
    int profile;          ///< 2bits, Profile
    int frmrtq_postproc;  ///< 3bits,
    int bitrtq_postproc;  ///< 5bits, quantized framerate-based postprocessing strength
    int fastuvmc;         ///< Rounding of qpel vector to hpel ? (not in Simple)
    int extended_mv;      ///< Ext MV in P/B (not in Simple)
    int dquant;           ///< How qscale varies with MBs, 2bits (not in Simple)
    int vstransform;      ///< variable-size [48]x[48] transform type + info
    int overlap;          ///< overlapped transforms in use
    int quantizer_mode;   ///< 2bits, quantizer mode used for sequence, see QUANT_*
    int finterpflag;      ///< INTERPFRM present
    //@}

    /** Frame decoding info for all profiles */
    //@{
    uint8_t mv_mode;      ///< MV coding monde
    uint8_t mv_mode2;     ///< Secondary MV coding mode (B frames)
    int k_x;              ///< Number of bits for MVs (depends on MV range)
    int k_y;              ///< Number of bits for MVs (depends on MV range)
    int range_x, range_y; ///< MV range
    uint8_t pq, altpq;    ///< Current/alternate frame quantizer scale
    /** pquant parameters */
    //@{
    uint8_t dquantfrm;
    uint8_t dqprofile;
    uint8_t dqsbedge;
    uint8_t dqbilevel;
    //@}
    /** AC coding set indexes
     * @see 8.1.1.10, p(1)10
     */
    //@{
    int c_ac_table_index; ///< Chroma index from ACFRM element
    int y_ac_table_index; ///< Luma index from AC2FRM element
    //@}
    int ttfrm;            ///< Transform type info present at frame level
    uint8_t ttmbf;        ///< Transform type flag
    int ttmb;             ///< Transform type
    uint8_t ttblk4x4;     ///< Value of ttblk which indicates a 4x4 transform
    int codingset;        ///< index of current table set from 11.8 to use for luma block decoding
    int codingset2;       ///< index of current table set from 11.8 to use for chroma block decoding
    int pqindex;          ///< raw pqindex used in coding set selection
    int a_avail, c_avail;


    /** Luma compensation parameters */
    //@{
    uint8_t lumscale;
    uint8_t lumshift;
    //@}
    int16_t bfraction;    ///< Relative position % anchors=> how to scale MVs
    uint8_t halfpq;       ///< Uniform quant over image and qp+.5
    uint8_t respic;       ///< Frame-level flag for resized images
    int buffer_fullness;  ///< HRD info
    /** Ranges:
     * -# 0 -> [-64n 63.f] x [-32, 31.f]
     * -# 1 -> [-128, 127.f] x [-64, 63.f]
     * -# 2 -> [-512, 511.f] x [-128, 127.f]
     * -# 3 -> [-1024, 1023.f] x [-256, 255.f]
     */
    uint8_t mvrange;
    uint8_t pquantizer;           ///< Uniform (over sequence) quantizer in use
    uint8_t *previous_line_cbpcy; ///< To use for predicted CBPCY
    VLC *cbpcy_vlc;               ///< CBPCY VLC table
    int tt_index;                 ///< Index for Transform Type tables
    BitPlane mv_type_mb_plane;    ///< bitplane for mv_type == (4MV)
    BitPlane skip_mb_plane;       ///< bitplane for skipped MBs
    BitPlane direct_mb_plane;     ///< bitplane for "direct" MBs

    /** Frame decoding info for S/M profiles only */
    //@{
    uint8_t rangeredfrm; ///< out_sample = CLIP((in_sample-128)*2+128)
    uint8_t interpfrm;
    //@}

    /** Frame decoding info for Advanced profile */
    //@{
    uint8_t fcm; ///< 0->Progressive, 2->Frame-Interlace, 3->Field-Interlace
    uint8_t numpanscanwin;
    uint8_t tfcntr;
    uint8_t rptfrm, tff, rff;
    uint16_t topleftx;
    uint16_t toplefty;
    uint16_t bottomrightx;
    uint16_t bottomrighty;
    uint8_t uvsamp;
    uint8_t postproc;
    int hrd_num_leaky_buckets;
    uint8_t bit_rate_exponent;
    uint8_t buffer_size_exponent;
    BitPlane ac_pred_plane;       ///< AC prediction flags bitplane
    BitPlane over_flags_plane;    ///< Overflags bitplane
    uint8_t condover;
    uint16_t *hrd_rate, *hrd_buffer;
    uint8_t *hrd_fullness;
    uint8_t range_mapy_flag;
    uint8_t range_mapuv_flag;
    uint8_t range_mapy;
    uint8_t range_mapuv;
    //@}
} VC1Context;

/**
 * Get unary code of limited length
 * @fixme FIXME Slow and ugly
 * @param gb GetBitContext
 * @param[in] stop The bitstop value (unary code of 1's or 0's)
 * @param[in] len Maximum length
 * @return Unary length/index
 */
static int get_prefix(GetBitContext *gb, int stop, int len)
{
#if 1
    int i;

    for(i = 0; i < len && get_bits1(gb) != stop; i++);
    return i;
/*  int i = 0, tmp = !stop;

  while (i != len && tmp != stop)
  {
    tmp = get_bits(gb, 1);
    i++;
  }
  if (i == len && tmp != stop) return len+1;
  return i;*/
#else
  unsigned int buf;
  int log;

  OPEN_READER(re, gb);
  UPDATE_CACHE(re, gb);
  buf=GET_CACHE(re, gb); //Still not sure
  if (stop) buf = ~buf;

  log= av_log2(-buf); //FIXME: -?
  if (log < limit){
    LAST_SKIP_BITS(re, gb, log+1);
    CLOSE_READER(re, gb);
    return log;
  }

  LAST_SKIP_BITS(re, gb, limit);
  CLOSE_READER(re, gb);
  return limit;
#endif
}

static inline int decode210(GetBitContext *gb){
    int n;
    n = get_bits1(gb);
    if (n == 1)
        return 0;
    else
        return 2 - get_bits1(gb);
}

/**
 * Init VC-1 specific tables and VC1Context members
 * @param v The VC1Context to initialize
 * @return Status
 */
static int vc1_init_common(VC1Context *v)
{
    static int done = 0;
    int i = 0;

    /* Set the bit planes */
    v->mv_type_mb_plane = (struct BitPlane) { NULL, 0, 0, 0 };
    v->direct_mb_plane = (struct BitPlane) { NULL, 0, 0, 0 };
    v->skip_mb_plane = (struct BitPlane) { NULL, 0, 0, 0 };
    v->ac_pred_plane = v->over_flags_plane = (struct BitPlane) { NULL, 0, 0, 0 };
    v->hrd_rate = v->hrd_buffer = NULL;

    /* VLC tables */
    if(!done)
    {
        done = 1;
        init_vlc(&vc1_bfraction_vlc, VC1_BFRACTION_VLC_BITS, 23,
                 vc1_bfraction_bits, 1, 1,
                 vc1_bfraction_codes, 1, 1, 1);
        init_vlc(&vc1_norm2_vlc, VC1_NORM2_VLC_BITS, 4,
                 vc1_norm2_bits, 1, 1,
                 vc1_norm2_codes, 1, 1, 1);
        init_vlc(&vc1_norm6_vlc, VC1_NORM6_VLC_BITS, 64,
                 vc1_norm6_bits, 1, 1,
                 vc1_norm6_codes, 2, 2, 1);
        init_vlc(&vc1_imode_vlc, VC1_IMODE_VLC_BITS, 7,
                 vc1_imode_bits, 1, 1,
                 vc1_imode_codes, 1, 1, 1);
        for (i=0; i<3; i++)
        {
            init_vlc(&vc1_ttmb_vlc[i], VC1_TTMB_VLC_BITS, 16,
                     vc1_ttmb_bits[i], 1, 1,
                     vc1_ttmb_codes[i], 2, 2, 1);
            init_vlc(&vc1_ttblk_vlc[i], VC1_TTBLK_VLC_BITS, 8,
                     vc1_ttblk_bits[i], 1, 1,
                     vc1_ttblk_codes[i], 1, 1, 1);
            init_vlc(&vc1_subblkpat_vlc[i], VC1_SUBBLKPAT_VLC_BITS, 15,
                     vc1_subblkpat_bits[i], 1, 1,
                     vc1_subblkpat_codes[i], 1, 1, 1);
        }
        for(i=0; i<4; i++)
        {
            init_vlc(&vc1_4mv_block_pattern_vlc[i], VC1_4MV_BLOCK_PATTERN_VLC_BITS, 16,
                     vc1_4mv_block_pattern_bits[i], 1, 1,
                     vc1_4mv_block_pattern_codes[i], 1, 1, 1);
            init_vlc(&vc1_cbpcy_p_vlc[i], VC1_CBPCY_P_VLC_BITS, 64,
                     vc1_cbpcy_p_bits[i], 1, 1,
                     vc1_cbpcy_p_codes[i], 2, 2, 1);
            init_vlc(&vc1_mv_diff_vlc[i], VC1_MV_DIFF_VLC_BITS, 73,
                     vc1_mv_diff_bits[i], 1, 1,
                     vc1_mv_diff_codes[i], 2, 2, 1);
        }
        for(i=0; i<8; i++)
            init_vlc(&vc1_ac_coeff_table[i], AC_VLC_BITS, vc1_ac_sizes[i],
                     &vc1_ac_tables[i][0][1], 8, 4,
                     &vc1_ac_tables[i][0][0], 8, 4, 1);
        init_vlc(&ff_msmp4_mb_i_vlc, MB_INTRA_VLC_BITS, 64,
                 &ff_msmp4_mb_i_table[0][1], 4, 2,
                 &ff_msmp4_mb_i_table[0][0], 4, 2, 1);
    }

    /* Other defaults */
    v->pq = -1;
    v->mvrange = 0; /* 7.1.1.18, p80 */

    return 0;
}

/***********************************************************************/
/**
 * @defgroup bitplane VC9 Bitplane decoding
 * @see 8.7, p56
 * @{
 */

/** @addtogroup bitplane
 * Imode types
 * @{
 */
enum Imode {
    IMODE_RAW,
    IMODE_NORM2,
    IMODE_DIFF2,
    IMODE_NORM6,
    IMODE_DIFF6,
    IMODE_ROWSKIP,
    IMODE_COLSKIP
};
/** @} */ //imode defines

/** Allocate the buffer from a bitplane, given its dimensions
 * @param bp Bitplane which buffer is to allocate
 * @param[in] width Width of the buffer
 * @param[in] height Height of the buffer
 * @return Status
 * @todo TODO: Take into account stride
 * @todo TODO: Allow use of external buffers ?
 */
static int alloc_bitplane(BitPlane *bp, int width, int height)
{
    if (!bp || bp->width<0 || bp->height<0) return -1;
    bp->data = (uint8_t*)av_malloc(width*height);
    if (!bp->data) return -1;
    bp->width = bp->stride = width;
    bp->height = height;
    return 0;
}

/** Free the bitplane's buffer
 * @param bp Bitplane which buffer is to free
 */
static void free_bitplane(BitPlane *bp)
{
    bp->width = bp->stride = bp->height = 0;
    if (bp->data) av_freep(&bp->data);
}

/** Decode rows by checking if they are skipped
 * @param plane Buffer to store decoded bits
 * @param[in] width Width of this buffer
 * @param[in] height Height of this buffer
 * @param[in] stride of this buffer
 */
static void decode_rowskip(uint8_t* plane, int width, int height, int stride, GetBitContext *gb){
    int x, y;

    for (y=0; y<height; y++){
        if (!get_bits(gb, 1)) //rowskip
            memset(plane, 0, width);
        else
            for (x=0; x<width; x++)
                plane[x] = get_bits(gb, 1);
        plane += stride;
    }
}

/** Decode columns by checking if they are skipped
 * @param plane Buffer to store decoded bits
 * @param[in] width Width of this buffer
 * @param[in] height Height of this buffer
 * @param[in] stride of this buffer
 * @fixme FIXME: Optimize
 */
static void decode_colskip(uint8_t* plane, int width, int height, int stride, GetBitContext *gb){
    int x, y;

    for (x=0; x<width; x++){
        if (!get_bits(gb, 1)) //colskip
            for (y=0; y<height; y++)
                plane[y*stride] = 0;
        else
            for (y=0; y<height; y++)
                plane[y*stride] = get_bits(gb, 1);
        plane ++;
    }
}

/** Decode a bitplane's bits
 * @param bp Bitplane where to store the decode bits
 * @param v VC-1 context for bit reading and logging
 * @return Status
 * @fixme FIXME: Optimize
 * @todo TODO: Decide if a struct is needed
 */
static int bitplane_decoding(BitPlane *bp, VC1Context *v)
{
    GetBitContext *gb = &v->s.gb;

    int imode, x, y, code, offset;
    uint8_t invert, *planep = bp->data;

    invert = get_bits(gb, 1);
    imode = get_vlc2(gb, vc1_imode_vlc.table, VC1_IMODE_VLC_BITS, 1);

    bp->is_raw = 0;
    switch (imode)
    {
    case IMODE_RAW:
        //Data is actually read in the MB layer (same for all tests == "raw")
        bp->is_raw = 1; //invert ignored
        return invert;
    case IMODE_DIFF2:
    case IMODE_NORM2:
        if ((bp->height * bp->width) & 1)
        {
            *planep++ = get_bits(gb, 1);
            offset = 1;
        }
        else offset = 0;
        // decode bitplane as one long line
        for (y = offset; y < bp->height * bp->width; y += 2) {
            code = get_vlc2(gb, vc1_norm2_vlc.table, VC1_NORM2_VLC_BITS, 1);
            *planep++ = code & 1;
            offset++;
            if(offset == bp->width) {
                offset = 0;
                planep += bp->stride - bp->width;
            }
            *planep++ = code >> 1;
            offset++;
            if(offset == bp->width) {
                offset = 0;
                planep += bp->stride - bp->width;
            }
        }
        break;
    case IMODE_DIFF6:
    case IMODE_NORM6:
        if(!(bp->height % 3) && (bp->width % 3)) { // use 2x3 decoding
            for(y = 0; y < bp->height; y+= 3) {
                for(x = bp->width & 1; x < bp->width; x += 2) {
                    code = get_vlc2(gb, vc1_norm6_vlc.table, VC1_NORM6_VLC_BITS, 2);
                    if(code < 0){
                        av_log(v->s.avctx, AV_LOG_DEBUG, "invalid NORM-6 VLC\n");
                        return -1;
                    }
                    planep[x + 0] = (code >> 0) & 1;
                    planep[x + 1] = (code >> 1) & 1;
                    planep[x + 0 + bp->stride] = (code >> 2) & 1;
                    planep[x + 1 + bp->stride] = (code >> 3) & 1;
                    planep[x + 0 + bp->stride * 2] = (code >> 4) & 1;
                    planep[x + 1 + bp->stride * 2] = (code >> 5) & 1;
                }
                planep += bp->stride * 3;
            }
            if(bp->width & 1) decode_colskip(bp->data, 1, bp->height, bp->stride, &v->s.gb);
        } else { // 3x2
            for(y = bp->height & 1; y < bp->height; y += 2) {
                for(x = bp->width % 3; x < bp->width; x += 3) {
                    code = get_vlc2(gb, vc1_norm6_vlc.table, VC1_NORM6_VLC_BITS, 2);
                    if(code < 0){
                        av_log(v->s.avctx, AV_LOG_DEBUG, "invalid NORM-6 VLC\n");
                        return -1;
                    }
                    planep[x + 0] = (code >> 0) & 1;
                    planep[x + 1] = (code >> 1) & 1;
                    planep[x + 2] = (code >> 2) & 1;
                    planep[x + 0 + bp->stride] = (code >> 3) & 1;
                    planep[x + 1 + bp->stride] = (code >> 4) & 1;
                    planep[x + 2 + bp->stride] = (code >> 5) & 1;
                }
                planep += bp->stride * 2;
            }
            x = bp->width % 3;
            if(x) decode_colskip(bp->data  ,             x, bp->height    , bp->stride, &v->s.gb);
            if(bp->height & 1) decode_rowskip(bp->data+x, bp->width - x, bp->height & 1, bp->stride, &v->s.gb);
        }
        break;
    case IMODE_ROWSKIP:
        decode_rowskip(bp->data, bp->width, bp->height, bp->stride, &v->s.gb);
        break;
    case IMODE_COLSKIP:
        decode_colskip(bp->data, bp->width, bp->height, bp->stride, &v->s.gb);
        break;
    default: break;
    }

    /* Applying diff operator */
    if (imode == IMODE_DIFF2 || imode == IMODE_DIFF6)
    {
        planep = bp->data;
        planep[0] ^= invert;
        for (x=1; x<bp->width; x++)
            planep[x] ^= planep[x-1];
        for (y=1; y<bp->height; y++)
        {
            planep += bp->stride;
            planep[0] ^= planep[-bp->stride];
            for (x=1; x<bp->width; x++)
            {
                if (planep[x-1] != planep[x-bp->stride]) planep[x] ^= invert;
                else                                     planep[x] ^= planep[x-1];
            }
        }
    }
    else if (invert)
    {
        planep = bp->data;
        for (x=0; x<bp->width*bp->height; x++) planep[x] = !planep[x]; //FIXME stride
    }
    return (imode<<1) + invert;
}
/** @} */ //Bitplane group

/***********************************************************************/
/** VOP Dquant decoding
 * @param v VC-1 Context
 */
static int vop_dquant_decoding(VC1Context *v)
{
    GetBitContext *gb = &v->s.gb;
    int pqdiff;

    //variable size
    if (v->dquant == 2)
    {
        pqdiff = get_bits(gb, 3);
        if (pqdiff == 7) v->altpq = get_bits(gb, 5);
        else v->altpq = v->pq + pqdiff + 1;
    }
    else
    {
        v->dquantfrm = get_bits(gb, 1);
        if ( v->dquantfrm )
        {
            v->dqprofile = get_bits(gb, 2);
            switch (v->dqprofile)
            {
            case DQPROFILE_SINGLE_EDGE:
            case DQPROFILE_DOUBLE_EDGES:
                v->dqsbedge = get_bits(gb, 2);
                break;
            case DQPROFILE_ALL_MBS:
                v->dqbilevel = get_bits(gb, 1);
            default: break; //Forbidden ?
            }
            if (!v->dqbilevel || v->dqprofile != DQPROFILE_ALL_MBS)
            {
                pqdiff = get_bits(gb, 3);
                if (pqdiff == 7) v->altpq = get_bits(gb, 5);
                else v->altpq = v->pq + pqdiff + 1;
            }
        }
    }
    return 0;
}


/** Do inverse transform
 */
static void vc1_inv_trans(DCTELEM block[64], int M, int N)
{
    int i;
    register int t1,t2,t3,t4,t5,t6,t7,t8;
    DCTELEM *src, *dst;

    src = block;
    dst = block;
    if(M==4){
        for(i = 0; i < N; i++){
            t1 = 17 * (src[0] + src[2]);
            t2 = 17 * (src[0] - src[2]);
            t3 = 22 * src[1];
            t4 = 22 * src[3];
            t5 = 10 * src[1];
            t6 = 10 * src[3];

            dst[0] = (t1 + t3 + t6 + 4) >> 3;
            dst[1] = (t2 - t4 + t5 + 4) >> 3;
            dst[2] = (t2 + t4 - t5 + 4) >> 3;
            dst[3] = (t1 - t3 - t6 + 4) >> 3;

            src += 8;
            dst += 8;
        }
    }else{
        for(i = 0; i < N; i++){
            t1 = 12 * (src[0] + src[4]);
            t2 = 12 * (src[0] - src[4]);
            t3 = 16 * src[2] +  6 * src[6];
            t4 =  6 * src[2] - 16 * src[6];

            t5 = t1 + t3;
            t6 = t2 + t4;
            t7 = t2 - t4;
            t8 = t1 - t3;

            t1 = 16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7];
            t2 = 15 * src[1] -  4 * src[3] - 16 * src[5] -  9 * src[7];
            t3 =  9 * src[1] - 16 * src[3] +  4 * src[5] + 15 * src[7];
            t4 =  4 * src[1] -  9 * src[3] + 15 * src[5] - 16 * src[7];

            dst[0] = (t5 + t1 + 4) >> 3;
            dst[1] = (t6 + t2 + 4) >> 3;
            dst[2] = (t7 + t3 + 4) >> 3;
            dst[3] = (t8 + t4 + 4) >> 3;
            dst[4] = (t8 - t4 + 4) >> 3;
            dst[5] = (t7 - t3 + 4) >> 3;
            dst[6] = (t6 - t2 + 4) >> 3;
            dst[7] = (t5 - t1 + 4) >> 3;

            src += 8;
            dst += 8;
        }
    }

    src = block;
    dst = block;
    if(N==4){
        for(i = 0; i < M; i++){
            t1 = 17 * (src[ 0] + src[16]);
            t2 = 17 * (src[ 0] - src[16]);
            t3 = 22 * src[ 8];
            t4 = 22 * src[24];
            t5 = 10 * src[ 8];
            t6 = 10 * src[24];

            dst[ 0] = (t1 + t3 + t6 + 64) >> 7;
            dst[ 8] = (t2 - t4 + t5 + 64) >> 7;
            dst[16] = (t2 + t4 - t5 + 64) >> 7;
            dst[24] = (t1 - t3 - t6 + 64) >> 7;

            src ++;
            dst ++;
        }
    }else{
        for(i = 0; i < M; i++){
            t1 = 12 * (src[ 0] + src[32]);
            t2 = 12 * (src[ 0] - src[32]);
            t3 = 16 * src[16] +  6 * src[48];
            t4 =  6 * src[16] - 16 * src[48];

            t5 = t1 + t3;
            t6 = t2 + t4;
            t7 = t2 - t4;
            t8 = t1 - t3;

            t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
            t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
            t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
            t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];

            dst[ 0] = (t5 + t1 + 64) >> 7;
            dst[ 8] = (t6 + t2 + 64) >> 7;
            dst[16] = (t7 + t3 + 64) >> 7;
            dst[24] = (t8 + t4 + 64) >> 7;
            dst[32] = (t8 - t4 + 64 + 1) >> 7;
            dst[40] = (t7 - t3 + 64 + 1) >> 7;
            dst[48] = (t6 - t2 + 64 + 1) >> 7;
            dst[56] = (t5 - t1 + 64 + 1) >> 7;

            src++;
            dst++;
        }
    }
}

/** Apply overlap transform
 * @todo optimize
 * @todo move to DSPContext
 */
static void vc1_overlap_block(MpegEncContext *s, DCTELEM block[64], int n, int do_hor, int do_vert)
{
    int i;

    if(do_hor) { //TODO
    }
    if(do_vert) { //TODO
    }

    for(i = 0; i < 64; i++)
        block[i] += 128;
}


/** Put block onto picture
 * @todo move to DSPContext
 */
static void vc1_put_block(VC1Context *v, DCTELEM block[6][64])
{
    uint8_t *Y;
    int ys, us, vs;
    DSPContext *dsp = &v->s.dsp;

    ys = v->s.current_picture.linesize[0];
    us = v->s.current_picture.linesize[1];
    vs = v->s.current_picture.linesize[2];
    Y = v->s.dest[0];

    dsp->put_pixels_clamped(block[0], Y, ys);
    dsp->put_pixels_clamped(block[1], Y + 8, ys);
    Y += ys * 8;
    dsp->put_pixels_clamped(block[2], Y, ys);
    dsp->put_pixels_clamped(block[3], Y + 8, ys);

    dsp->put_pixels_clamped(block[4], v->s.dest[1], us);
    dsp->put_pixels_clamped(block[5], v->s.dest[2], vs);
}

/** Do motion compensation over 1 macroblock
 * Mostly adapted hpel_motion and qpel_motion from mpegvideo.c
 */
static void vc1_mc_1mv(VC1Context *v)
{
    MpegEncContext *s = &v->s;
    DSPContext *dsp = &v->s.dsp;
    uint8_t *srcY, *srcU, *srcV;
    int dxy, mx, my, src_x, src_y;
    int width = s->mb_width * 16, height = s->mb_height * 16;

    if(!v->s.last_picture.data[0])return;

    mx = s->mv[0][0][0] >> s->mspel;
    my = s->mv[0][0][1] >> s->mspel;
    srcY = s->last_picture.data[0];
    srcU = s->last_picture.data[1];
    srcV = s->last_picture.data[2];

    if(s->mspel) { // hpel mc
        dxy = ((my & 1) << 1) | (mx & 1);
        src_x = s->mb_x * 16 + (mx >> 1);
        src_y = s->mb_y * 16 + (my >> 1);
/*        src_x = clip(src_x, -16, width); //FIXME unneeded for emu?
        if (src_x == width)
            dxy &= ~1;
        src_y = clip(src_y, -16, height);
        if (src_y == height)
            dxy &= ~2;*/
        srcY += src_y * s->linesize + src_x;
        srcU += (src_y >> 1) * s->uvlinesize + (src_x >> 1);
        srcV += (src_y >> 1) * s->uvlinesize + (src_x >> 1);

        if((unsigned)src_x > s->h_edge_pos - (mx&1) - 16
           || (unsigned)src_y > s->v_edge_pos - (my&1) - 16){
            uint8_t *uvbuf= s->edge_emu_buffer + 18 * s->linesize;

            ff_emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, 16+1, 16+1,
                             src_x, src_y, s->h_edge_pos, s->v_edge_pos);
            srcY = s->edge_emu_buffer;
            ff_emulated_edge_mc(uvbuf, srcU, s->uvlinesize, 8+1, 8+1,
                             src_x >> 1, src_y >> 1, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
            ff_emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 8+1, 8+1,
                             src_x >> 1, src_y >> 1, s->h_edge_pos >> 1, s->v_edge_pos >> 1);
            srcU = uvbuf;
            srcV = uvbuf + 16;
        }
        dsp->put_no_rnd_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize, 16);
        dsp->put_no_rnd_pixels_tab[1][0](s->dest[1], srcU, s->uvlinesize, 8);
        dsp->put_no_rnd_pixels_tab[1][0](s->dest[2], srcV, s->uvlinesize, 8);
    } else {
        int motion_x = mx, motion_y = my, uvdxy, uvsrc_x, uvsrc_y;
        dxy = ((motion_y & 3) << 2) | (motion_x & 3);
        src_x = s->mb_x * 16 + (mx >> 2);
        src_y = s->mb_y * 16 + (my >> 2);

        mx= motion_x/2;
        my= motion_y/2;

        mx= (mx>>1)|(mx&1);
        my= (my>>1)|(my&1);

        uvdxy= (mx&1) | ((my&1)<<1);
        mx>>=1;
        my>>=1;

        uvsrc_x = s->mb_x * 8 + mx;
        uvsrc_y = s->mb_y * 8 + my;

        srcY = s->last_picture.data[0] +   src_y *   s->linesize +   src_x;
        srcU = s->last_picture.data[1] + uvsrc_y * s->uvlinesize + uvsrc_x;
        srcV = s->last_picture.data[2] + uvsrc_y * s->uvlinesize + uvsrc_x;

        if(   (unsigned)src_x > s->h_edge_pos - (motion_x&3) - 16
              || (unsigned)src_y > s->v_edge_pos - (motion_y&3) - 16  ){
            uint8_t *uvbuf= s->edge_emu_buffer + 18*s->linesize;
            ff_emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, 17, 17,
                                src_x, src_y, s->h_edge_pos, s->v_edge_pos);
            srcY = s->edge_emu_buffer;
            ff_emulated_edge_mc(uvbuf, srcU, s->uvlinesize, 9, 9,
                                    uvsrc_x, uvsrc_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
            ff_emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 9, 9,
                                    uvsrc_x, uvsrc_y, s->h_edge_pos>>1, s->v_edge_pos>>1);
            srcU = uvbuf;
            srcV = uvbuf + 16;
        }

        dsp->put_no_rnd_qpel_pixels_tab[0][dxy](s->dest[0], srcY, s->linesize);