diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index 6ab262097ca93c1812234e364c338012c227fbf4..bcf7e5be0e2266bdf7832b9d0e1bf5525091be8a 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -86,8 +86,8 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->add_pixels_clamped = ff_add_pixels_clamped_sse2; } - if (ARCH_X86_64 && - avctx->bits_per_raw_sample == 10 && avctx->lowres == 0 && + if (ARCH_X86_64 && avctx->lowres == 0) { + if (avctx->bits_per_raw_sample == 10 && (avctx->idct_algo == FF_IDCT_AUTO || avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLE)) { @@ -104,5 +104,23 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->idct = ff_simple_idct10_avx; c->perm_type = FF_IDCT_PERM_TRANSPOSE; } + } + + if (avctx->bits_per_raw_sample == 12 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + if (EXTERNAL_SSE2(cpu_flags)) { + c->idct_put = ff_simple_idct12_put_sse2; + c->idct_add = NULL; + c->idct = ff_simple_idct12_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + if (EXTERNAL_AVX(cpu_flags)) { + c->idct_put = ff_simple_idct12_put_avx; + c->idct_add = NULL; + c->idct = ff_simple_idct12_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } } } diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h index e8f59c1865bde63f37fb03c6a00146662c72ec7a..8eeb31e29985932198bb478fc77baaabd2c6c828 100644 --- a/libavcodec/x86/simple_idct.h +++ b/libavcodec/x86/simple_idct.h @@ -31,4 +31,10 @@ void ff_simple_idct10_avx(int16_t *block); void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block); void ff_simple_idct10_put_avx(uint8_t *dest, int line_size, int16_t *block); +void ff_simple_idct12_sse2(int16_t *block); +void ff_simple_idct12_avx(int16_t *block); + +void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block); +void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block); + #endif /* AVCODEC_X86_SIMPLE_IDCT_H */ diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index 3af2042f08c5fa196e489c35801bdac88b58fea1..ec388f99ed08d654a5631456c85516ebc8ba1a40 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -29,9 +29,12 @@ SECTION_RODATA +cextern pw_2 cextern pw_16 cextern pw_1023 +cextern pw_4095 pd_round_12: times 4 dd 1<<(12-1) +pd_round_15: times 4 dd 1<<(15-1) pd_round_19: times 4 dd 1<<(19-1) %include "libavcodec/x86/simple_idct10_template.asm" @@ -46,6 +49,19 @@ cglobal simple_idct10, 1, 1, 16 cglobal simple_idct10_put, 3, 3, 16 IDCT_FN "", 12, "", 19, 0, pw_1023 RET + +cglobal simple_idct12, 1, 1, 16 + ; coeffs are already 15bits, adding the offset would cause + ; overflow in the input + IDCT_FN "", 15, pw_2, 16 + RET + +cglobal simple_idct12_put, 3, 3, 16 + ; range isn't known, so the C simple_idct range is used + ; Also, using a bias on input overflows, so use the bias + ; on output of the first butterfly instead + IDCT_FN "", 15, pw_2, 16, 0, pw_4095 + RET %endmacro INIT_XMM sse2