diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 182063ca2b16926707ea3d4eae36d0f9c7ec63a5..91238578b6b2bc2d1209a88fab8ac9c0127fddb7 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2,
     }
 }
 
+static void butterflies_float_interleave_c(float *dst, const float *src0,
+                                           const float *src1, int len)
+{
+    int i;
+    for (i = 0; i < len; i++) {
+        float f1 = src0[i];
+        float f2 = src1[i];
+        dst[2*i    ] = f1 + f2;
+        dst[2*i + 1] = f1 - f2;
+    }
+}
+
 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
 {
     float p = 0.0;
@@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_clip_int32 = vector_clip_int32_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
+    c->butterflies_float_interleave = butterflies_float_interleave_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
     c->vector_fmac_scalar = vector_fmac_scalar_c;
 
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index acb204146017657608721dfde7b078feebd26615..98b7b1eeaaa5e9664b90a934e9d3d29519f3a64e 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -453,6 +453,23 @@ typedef struct DSPContext {
      */
     void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
 
+    /**
+     * Calculate the sum and difference of two vectors of floats and interleave
+     * results into a separate output vector of floats, with each sum
+     * positioned before the corresponding difference.
+     *
+     * @param dst  output vector
+     *             constraints: 16-byte aligned
+     * @param src0 first input vector
+     *             constraints: 32-byte aligned
+     * @param src1 second input vector
+     *             constraints: 32-byte aligned
+     * @param len  number of elements in the input
+     *             constraints: multiple of 8
+     */
+    void (*butterflies_float_interleave)(float *dst, const float *src0,
+                                         const float *src1, int len);
+
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
     void (*fdct248)(DCTELEM *block/* align 16*/);
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index 73eb7c14992d6a6674c705856c63c05abdfc9a1a..a2851562eeec0e545b8c34fd8f25cd7e3eb62eed 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
                          float *out)
 {
     const ModeTab *mtab = tctx->mtab;
+    int size1, size2;
     float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
-    int i, j;
+    int i;
 
     for (i = 0; i < tctx->avctx->channels; i++) {
         imdct_and_window(tctx, ftype, wtype,
@@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype,
                          i);
     }
 
+    size2 = tctx->last_block_pos[0];
+    size1 = mtab->size - size2;
     if (tctx->avctx->channels == 2) {
-        for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
-            float f1 = prev_buf[               i];
-            float f2 = prev_buf[2*mtab->size + i];
-            out[2*i    ] = f1 + f2;
-            out[2*i + 1] = f1 - f2;
-        }
-        for (j = 0; i < mtab->size; j++,i++) {
-            float f1 = tctx->curr_frame[               j];
-            float f2 = tctx->curr_frame[2*mtab->size + j];
-            out[2*i    ] = f1 + f2;
-            out[2*i + 1] = f1 - f2;
-        }
+        tctx->dsp.butterflies_float_interleave(out, prev_buf,
+                                               &prev_buf[2*mtab->size],
+                                               size1);
+
+        out += 2 * size1;
+
+        tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
+                                               &tctx->curr_frame[2*mtab->size],
+                                               size2);
     } else {
-        memcpy(out, prev_buf,
-               (mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
+        memcpy(out, prev_buf, size1 * sizeof(*out));
 
-        out +=  mtab->size - tctx->last_block_pos[0];
+        out += size1;
 
-        memcpy(out, tctx->curr_frame,
-               (tctx->last_block_pos[0]) * sizeof(*out));
+        memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
     }
 
 }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index dd6cbf51ad15d534bb1c7c3e5ba36dcc2f30e6cf..f0de05a763b1b3213c5e6206c6f87c8168ec5fc5 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min
 void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
                                    int32_t max, unsigned int len);
 
+extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
+                                                const float *src1, int len);
+extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
+                                                const float *src1, int len);
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
@@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->vector_clipf = vector_clipf_sse;
 #if HAVE_YASM
             c->scalarproduct_float = ff_scalarproduct_float_sse;
+            c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
 #endif
         }
         if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
@@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
                 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
             }
+            c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
         }
 #endif
     }
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 8e3cbdc03df9f8066bee61bff2e5ff7e30a12b8c..f2894cd50137b20cb4187ba5accf7df0f7b34f90 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif
+
+;-----------------------------------------------------------------------------
+; void ff_butterflies_float_interleave(float *dst, const float *src0,
+;                                      const float *src1, int len);
+;-----------------------------------------------------------------------------
+
+%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
+cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
+%ifdef ARCH_X86_64
+    movsxd    lenq, lend
+%endif
+    test      lenq, lenq
+    jz .end
+    shl       lenq, 2
+    lea      src0q, [src0q +   lenq]
+    lea      src1q, [src1q +   lenq]
+    lea       dstq, [ dstq + 2*lenq]
+    neg       lenq
+.loop:
+    mova        m0, [src0q + lenq]
+    mova        m1, [src1q + lenq]
+    subps       m2, m0, m1
+    addps       m0, m0, m1
+    unpcklps    m1, m0, m2
+    unpckhps    m0, m0, m2
+%if cpuflag(avx)
+    vextractf128 [dstq + 2*lenq     ], m1, 0
+    vextractf128 [dstq + 2*lenq + 16], m0, 0
+    vextractf128 [dstq + 2*lenq + 32], m1, 1
+    vextractf128 [dstq + 2*lenq + 48], m0, 1
+%else
+    mova [dstq + 2*lenq         ], m1
+    mova [dstq + 2*lenq + mmsize], m0
+%endif
+    add       lenq, mmsize
+    jl .loop
+%if mmsize == 32
+    vzeroupper
+    RET
+%endif
+.end:
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+BUTTERFLIES_FLOAT_INTERLEAVE
+INIT_YMM avx
+BUTTERFLIES_FLOAT_INTERLEAVE