diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index 66bbbf4e44f5b0946d188042c8503fba3c08d17a..a7327d39cee22d593cf8374c6f12d772c55fc74e 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -166,6 +166,112 @@ align 16
     jl .loop
     REP_RET
 
+;**********************************************************
+;void ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
+;                                   float L[2][38][64],
+;                                   int i, int len)
+;**********************************************************
+INIT_XMM sse                            ; For every column c in i..63 and row j in 0..31: out[c][j][0] = L[0][j][c], out[c][j][1] = L[1][j][c]. L is bound to the register name "in" below.
+cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp ; x86inc convention: 3 args loaded, 7 GPRs, 5 XMM regs; the len argument is never read, its register is reused as a constant
+    movsxdifnidn    iq, id
+    mov             lend, 32 << 3       ; lend = 256 bytes: serves as the L row stride (64 floats), the size of one out[c] block (32*2 floats) and the inner-loop byte count. NOTE(review): the row count is fixed at 32 regardless of the caller's len - confirm callers always pass 32
+    lea             inq, [inq+iq*4]     ; in  = &L[0][0][i]
+    mov             tmpd, id
+    shl             tmpd, 8
+    add             outq, tmpq          ; out = &out[i][0][0]  (i * 256 bytes)
+    mov             tmpd, 64
+    sub             tmpd, id
+    mov             id, tmpd            ; id = 64 - i = number of columns left. NOTE(review): a count of 0 still falls into .loop16 and runs it once - presumably callers guarantee i < 64
+
+    test            id, 1
+    jne .loop4                          ; odd count: handle a single column first
+    test            id, 2
+    jne .loop8                          ; count % 4 == 2: handle a pair of columns first
+
+align 16
+.loop16:                                ; four columns per pass
+    mov             in0q, inq           ; in0 walks plane L[0]
+    mov             in1q, 38*64*4       ; byte size of one L plane (38 rows * 64 floats)
+    add             in1q, in0q          ; in1 walks plane L[1]
+    mov             tmpd, lend          ; bytes of the current out block still to be written
+
+.inner_loop16:                          ; two rows (j, j+1) of four columns per iteration
+    movaps          m0, [in0q]          ; L[0][j  ][c..c+3]   (movaps: assumes L and out are 16-byte aligned - TODO confirm)
+    movaps          m1, [in1q]          ; L[1][j  ][c..c+3]
+    movaps          m2, [in0q+lenq]     ; L[0][j+1][c..c+3]
+    movaps          m3, [in1q+lenq]     ; L[1][j+1][c..c+3]
+    TRANSPOSE4x4PS  0, 1, 2, 3, 4       ; macro defined outside this patch; expected to leave one column per register as (L[0],L[1]) pairs for rows j and j+1, with m4 presumably as scratch
+    movaps          [outq], m0          ; -> out[c  ][j..j+1]
+    movaps          [outq+lenq], m1     ; -> out[c+1][j..j+1]
+    movaps          [outq+lenq*2], m2   ; -> out[c+2][j..j+1]
+    movaps          [outq+3*32*2*4], m3 ; -> out[c+3][j..j+1]
+    lea             in0q, [in0q+lenq*2] ; advance both planes by two rows
+    lea             in1q, [in1q+lenq*2]
+    add             outq, mmsize
+    sub             tmpd, mmsize
+    jg .inner_loop16
+    add             inq, 16             ; next four columns (4 floats)
+    add             outq, 3*32*2*4      ; the inner loop already advanced outq by one block; skip the other three just written
+    sub             id, 4
+    jg .loop16
+    RET
+
+align 16
+.loop8:                                 ; two columns per pass; only entered with an even count whose bit 1 is set
+    mov             in0q, inq           ; in0 walks plane L[0]
+    mov             in1q, 38*64*4       ; byte size of one L plane
+    add             in1q, in0q          ; in1 walks plane L[1]
+    mov             tmpd, lend          ; bytes of the current out block still to be written
+
+.inner_loop8:                           ; two rows (j, j+1) of two columns per iteration
+    movlps          m0, [in0q]          ; low half  = L[0][j  ][c..c+1]
+    movlps          m1, [in1q]          ; low half  = L[1][j  ][c..c+1]
+    movhps          m0, [in0q+lenq]     ; high half = L[0][j+1][c..c+1]
+    movhps          m1, [in1q+lenq]     ; high half = L[1][j+1][c..c+1]
+    SBUTTERFLYPS    0, 1, 2             ; macros defined outside this patch; the pair is expected to interleave L[0]/L[1]
+    SBUTTERFLYPD    0, 1, 2             ; and then group by column, with m2 presumably as scratch
+    movaps          [outq], m0          ; -> out[c  ][j..j+1]
+    movaps          [outq+lenq], m1     ; -> out[c+1][j..j+1]
+    lea             in0q, [in0q+lenq*2] ; advance both planes by two rows
+    lea             in1q, [in1q+lenq*2]
+    add             outq, mmsize
+    sub             tmpd, mmsize
+    jg .inner_loop8
+    add             inq, 8              ; next two columns (2 floats)
+    add             outq, lenq          ; the inner loop already advanced outq by one block; skip the second one just written
+    sub             id, 2
+    jg .loop16                          ; the remaining count is now a multiple of 4
+    RET
+
+align 16
+.loop4:                                 ; one column per pass; only entered with an odd count
+    mov             in0q, inq           ; in0 walks plane L[0]
+    mov             in1q, 38*64*4       ; byte size of one L plane
+    add             in1q, in0q          ; in1 walks plane L[1]
+    mov             tmpd, lend          ; bytes of the current out block still to be written
+
+.inner_loop4:                           ; two rows (j, j+1) of one column per iteration
+    movss           m0, [in0q]          ; L[0][j  ][c]
+    movss           m1, [in1q]          ; L[1][j  ][c]
+    movss           m2, [in0q+lenq]     ; L[0][j+1][c]
+    movss           m3, [in1q+lenq]     ; L[1][j+1][c]
+    movlhps         m0, m1              ; m0 = L[0][j][c], -, L[1][j][c], -
+    movlhps         m2, m3              ; m2 = L[0][j+1][c], -, L[1][j+1][c], -
+    shufps          m0, m2, q2020       ; q2020 is defined outside this patch; presumably picks elements 0 and 2 of each source, giving L[0][j][c], L[1][j][c], L[0][j+1][c], L[1][j+1][c]
+    movaps          [outq], m0          ; -> out[c][j..j+1]
+    lea             in0q, [in0q+lenq*2] ; advance both planes by two rows
+    lea             in1q, [in1q+lenq*2]
+    add             outq, mmsize
+    sub             tmpd, mmsize
+    jg .inner_loop4
+    add             inq, 4              ; next column (1 float); outq already points at the next out block
+    sub             id, 1
+    test            id, 2
+    jne .loop8                          ; remaining count % 4 == 2: handle a pair next
+    cmp             id, 4
+    jge .loop16                         ; otherwise continue four at a time while at least four columns remain
+    RET
+
 ;***********************************************************
 ;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
 ;                                    float (*in)[32][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index 25e089c3952e048d933f8946261fbb6d09182544..056e23e59e34e753a40ca1f3ca8f95255b487c85 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -44,6 +44,8 @@ void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
                                         int i, int len);
 void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
                                         int i, int len);
+void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2], float L[2][38][64],
+                                      int i, int len);
 
 av_cold void ff_psdsp_init_x86(PSDSPContext *s)
 {
@@ -52,6 +54,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
     if (EXTERNAL_SSE(cpu_flags)) {
         s->add_squares = ff_ps_add_squares_sse;
         s->mul_pair_single = ff_ps_mul_pair_single_sse;
+        s->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_sse;
         s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
         s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
     }