diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c index bb9004842e2e4101c183728238d79cb633620f14..1b95ebd7551ed629641ea2e4e79a9e9ca259773b 100644 --- a/libavcodec/aacenc.c +++ b/libavcodec/aacenc.c @@ -489,7 +489,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, float **samples = s->planar_samples, *samples2, *la, *overlap; ChannelElement *cpe; SingleChannelElement *sce; - int i, ch, w, chans, tag, start_ch, ret; + int i, its, ch, w, chans, tag, start_ch, ret, frame_bits; int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0; int chan_el_counter[4]; FFPsyWindowInfo windows[AAC_MAX_CHANNELS]; @@ -581,14 +581,16 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, } if ((ret = ff_alloc_packet2(avctx, avpkt, 8192 * s->channels, 0)) < 0) return ret; + frame_bits = its = 0; do { - int frame_bits; + int target_bits, too_many_bits, too_few_bits; init_put_bits(&s->pb, avpkt->data, avpkt->size); if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) put_bitstream_info(s, LIBAVCODEC_IDENT); start_ch = 0; + target_bits = 0; memset(chan_el_counter, 0, sizeof(chan_el_counter)); for (i = 0; i < s->chan_map[0]; i++) { FFPsyWindowInfo* wi = windows + start_ch; @@ -611,7 +613,15 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, if (sce->band_type[w] > RESERVED_BT) sce->band_type[w] = 0; } + s->psy.bitres.alloc = -1; + s->psy.bitres.bits = avctx->frame_bits / s->channels; s->psy.model->analyze(&s->psy, start_ch, coeffs, wi); + if (s->psy.bitres.alloc > 0) { + /* Lambda unused here on purpose, we need to take psy's unscaled allocation */ + target_bits += s->psy.bitres.alloc; + s->psy.bitres.alloc /= chans; + } + s->cur_type = tag; for (ch = 0; ch < chans; ch++) { s->cur_channel = start_ch + ch; s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda); @@ -692,36 +702,69 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, start_ch += chans; } - frame_bits = put_bits_count(&s->pb); - if (frame_bits <= 6144 * s->channels - 3) { - s->psy.bitres.bits = frame_bits / s->channels; + if (avctx->flags & CODEC_FLAG_QSCALE) { + /* When using a constant Q-scale, don't mess with lambda */ break; } - if (is_mode || ms_mode || tns_mode || pred_mode) { - for (i = 0; i < s->chan_map[0]; i++) { - // Must restore coeffs - chans = tag == TYPE_CPE ? 2 : 1; - cpe = &s->cpe[i]; - for (ch = 0; ch < chans; ch++) - memcpy(cpe->ch[ch].coeffs, cpe->ch[ch].pcoeffs, sizeof(cpe->ch[ch].coeffs)); - } - } - s->lambda *= avctx->bit_rate * 1024.0f / avctx->sample_rate / frame_bits; + /* rate control stuff + * target either the nominal bitrate, or what psy's bit reservoir says to target + * whichever is greatest + */ + + frame_bits = put_bits_count(&s->pb); + target_bits = FFMAX(target_bits, avctx->bit_rate * 1024 / avctx->sample_rate); + target_bits = FFMIN(target_bits, 6144 * s->channels - 3); + + /* When using ABR, be strict (but only for increasing) */ + too_many_bits = target_bits + target_bits/2; + too_few_bits = target_bits - target_bits/8; + + if ( its == 0 /* for steady-state Q-scale tracking */ + || (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits)) + || frame_bits >= 6144 * s->channels - 3 ) + { + float ratio = ((float)target_bits) / frame_bits; + + if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) { + /* + * This path is for steady-state Q-scale tracking + * When frame bits fall within the stable range, we still need to adjust + * lambda to maintain it like so in a stable fashion (large jumps in lambda + * create artifacts and should be avoided), but slowly + */ + ratio = sqrtf(sqrtf(ratio)); + ratio = av_clipf(ratio, 0.9f, 1.1f); + } else { + /* Not so fast though */ + ratio = sqrtf(ratio); + } + s->lambda = FFMIN(s->lambda * ratio, 65536.f); + /* Keep iterating if we must reduce and lambda is in the sky */ + if (s->lambda < 300.f || ratio > 0.9f) { + break; + } else { + if (is_mode || ms_mode || tns_mode || pred_mode) { + for (i = 0; i < s->chan_map[0]; i++) { + // Must restore coeffs + chans = tag == TYPE_CPE ? 2 : 1; + cpe = &s->cpe[i]; + for (ch = 0; ch < chans; ch++) + memcpy(cpe->ch[ch].coeffs, cpe->ch[ch].pcoeffs, sizeof(cpe->ch[ch].coeffs)); + } + } + its++; + } + } else { + break; + } } while (1); put_bits(&s->pb, 3, TYPE_END); flush_put_bits(&s->pb); avctx->frame_bits = put_bits_count(&s->pb); - // rate control stuff - if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) { - float ratio = avctx->bit_rate * 1024.0f / avctx->sample_rate / avctx->frame_bits; - s->lambda *= ratio; - s->lambda = FFMIN(s->lambda, 65536.f); - } - if (!frame) s->last_frame++; diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h index 54951f9f70a8527351d1a629236e17ae11bd8464..7e7609b1a8398804dd8437418bb675c432285567 100644 --- a/libavcodec/aacenc.h +++ b/libavcodec/aacenc.h @@ -96,10 +96,12 @@ typedef struct AACEncContext { FFPsyContext psy; struct FFPsyPreprocessContext* psypp; AACCoefficientsEncoder *coder; - int cur_channel; + int cur_channel; ///< current channel for coder context int last_frame; int random_state; float lambda; + enum RawDataBlockType cur_type; ///< channel group type cur_channel belongs to + AudioFrameQueue afq; DECLARE_ALIGNED(16, int, qcoefs)[96]; ///< quantized coefficients DECLARE_ALIGNED(32, float, scoefs)[1024]; ///< scaled coefficients diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c index 82b670d49d8c41d160ae639b63e1933a3f9aa38d..af235c758c8bd30417c8b7943ea9c1ae91e501db 100644 --- a/libavcodec/aacpsy.c +++ b/libavcodec/aacpsy.c @@ -87,6 +87,7 @@ enum { }; #define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f) +#define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f) /* LAME psy model constants */ #define PSY_LAME_FIR_LEN 21 ///< LAME psy model FIR order @@ -687,6 +688,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel, desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits), 0.85f, 1.15f); pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits); + ctx->bitres.alloc = desired_bits; if (desired_pe < pe) { /* 5.6.1.3.4 "First Estimation of the reduction value" */ @@ -788,6 +790,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel, psy_band->threshold = band->thr; psy_band->energy = band->energy; psy_band->spread = band->active_lines * 2.0f / band_sizes[g]; + psy_band->bits = PSY_3GPP_PE_TO_BITS(band->pe); } } diff --git a/libavcodec/psymodel.h b/libavcodec/psymodel.h index e9be1f6fa5c8898916da21a83e3ce4e749fee762..a04cc4d2260c39d6d09c9741cf48b4ddc0ded18a 100644 --- a/libavcodec/psymodel.h +++ b/libavcodec/psymodel.h @@ -88,6 +88,7 @@ typedef struct FFPsyContext { struct { int size; ///< size of the bitresevoir in bits int bits; ///< number of bits used in the bitresevoir + int alloc; ///< number of bits allocated by the psy, or -1 if no allocation was done } bitres; void* model_priv_data; ///< psychoacoustic model implementation private data diff --git a/tests/fate/aac.mak b/tests/fate/aac.mak index f30d4dbbc5ba0f1bdfda971905e4c4fbe202842c..8e9c91507f91ee11d0b7be1e1b97104872beb5bf 100644 --- a/tests/fate/aac.mak +++ b/tests/fate/aac.mak @@ -146,7 +146,7 @@ fate-aac-aref-encode: CMD = enc_dec_pcm adts wav s16le $(REF) -strict -2 -c:a aa fate-aac-aref-encode: CMP = stddev fate-aac-aref-encode: REF = ./tests/data/asynth-44100-2.wav fate-aac-aref-encode: CMP_SHIFT = -4096 -fate-aac-aref-encode: CMP_TARGET = 594 +fate-aac-aref-encode: CMP_TARGET = 584 fate-aac-aref-encode: SIZE_TOLERANCE = 2464 fate-aac-aref-encode: FUZZ = 6 @@ -172,7 +172,7 @@ fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-re fate-aac-pns-encode: CMP = stddev fate-aac-pns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav fate-aac-pns-encode: CMP_SHIFT = -4096 -fate-aac-pns-encode: CMP_TARGET = 633.77 +fate-aac-pns-encode: CMP_TARGET = 623.77 fate-aac-pns-encode: SIZE_TOLERANCE = 3560 fate-aac-pns-encode: FUZZ = 1 @@ -181,7 +181,7 @@ fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-re fate-aac-tns-encode: CMP = stddev fate-aac-tns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav fate-aac-tns-encode: CMP_SHIFT = -4096 -fate-aac-tns-encode: CMP_TARGET = 650.37 +fate-aac-tns-encode: CMP_TARGET = 644.50 fate-aac-tns-encode: FUZZ = 2.8 fate-aac-tns-encode: SIZE_TOLERANCE = 3560 @@ -190,7 +190,7 @@ fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-ref fate-aac-is-encode: CMP = stddev fate-aac-is-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav fate-aac-is-encode: CMP_SHIFT = -4096 -fate-aac-is-encode: CMP_TARGET = 616.75 +fate-aac-is-encode: CMP_TARGET = 614.04 fate-aac-is-encode: SIZE_TOLERANCE = 3560 fate-aac-is-encode: FUZZ = 1 @@ -199,7 +199,7 @@ fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-r fate-aac-pred-encode: CMP = stddev fate-aac-pred-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav fate-aac-pred-encode: CMP_SHIFT = -4096 -fate-aac-pred-encode: CMP_TARGET = 652.60 +fate-aac-pred-encode: CMP_TARGET = 657 fate-aac-pred-encode: FUZZ = 5 fate-aac-pred-encode: SIZE_TOLERANCE = 3560