From b385c4c6a316f798b6a14418f09e545cda7fef7f Mon Sep 17 00:00:00 2001
From: James Almer <jamrial@gmail.com>
Date: Thu, 6 Nov 2014 20:43:06 -0300
Subject: [PATCH] x86/swr: replace sse4 instructions in pack_6ch with sse ones

There's no benefit from using blendps here except on CPUs with AVX, where
it's faster than shufps according to Intel's documentation.
As such, rename the sse4 functions to sse/sse2 and use shufps instead.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
---
 libswresample/x86/audio_convert.asm    | 31 +++++++++++++++++---------
 libswresample/x86/audio_convert_init.c | 23 ++++++++++---------
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/libswresample/x86/audio_convert.asm b/libswresample/x86/audio_convert.asm
index b6e9e5d79d5..d77e93439b9 100644
--- a/libswresample/x86/audio_convert.asm
+++ b/libswresample/x86/audio_convert.asm
@@ -245,15 +245,27 @@ pack_6ch_%2_to_%1_u_int %+ SUFFIX
     mov%3     m4, [srcq+src4q]
     mov%3     m5, [srcq+src5q]
     %7 x,x,x,x,m7,x
-%if cpuflag(sse4)
+%if cpuflag(sse)
     SBUTTERFLYPS 0, 1, 6
     SBUTTERFLYPS 2, 3, 6
     SBUTTERFLYPS 4, 5, 6
 
+%if cpuflag(avx)
     blendps   m6, m4, m0, 1100b
+%else
+    movaps    m6, m4
+    shufps    m4, m0, q3210
+    SWAP 4,6
+%endif
     movlhps   m0, m2
     movhlps   m4, m2
+%if cpuflag(avx)
     blendps   m2, m5, m1, 1100b
+%else
+    movaps    m2, m5
+    shufps    m5, m1, q3210
+    SWAP 2,5
+%endif
     movlhps   m1, m3
     movhlps   m5, m3
 
@@ -380,6 +392,10 @@ CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
 PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
 PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
 
+INIT_XMM sse
+PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
+PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
+
 INIT_XMM sse2
 CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
 CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
@@ -431,6 +447,10 @@ UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
 UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
 UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
 
+PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
 
 INIT_XMM ssse3
 UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
@@ -440,15 +460,6 @@ UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
 UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
 UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
 
-INIT_XMM sse4
-PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
-PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
-
-PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
-PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
-PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
-PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
-
 %if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
diff --git a/libswresample/x86/audio_convert_init.c b/libswresample/x86/audio_convert_init.c
index a26cdf6ea6b..769575d0fc7 100644
--- a/libswresample/x86/audio_convert_init.c
+++ b/libswresample/x86/audio_convert_init.c
@@ -58,7 +58,12 @@ MULTI_CAPS_FUNC(SSE2, sse2)
                 ac->simd_f =  ff_pack_6ch_float_to_float_a_mmx;
         }
     }
-
+    if(EXTERNAL_SSE(mm_flags)) {
+        if(channels == 6) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_6ch_float_to_float_a_sse;
+        }
+    }
     if(EXTERNAL_SSE2(mm_flags)) {
         if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
             ac->simd_f =  ff_int32_to_float_a_sse2;
@@ -105,6 +110,12 @@ MULTI_CAPS_FUNC(SSE2, sse2)
             if(   out_fmt == AV_SAMPLE_FMT_S16P  && in_fmt == AV_SAMPLE_FMT_FLT)
                 ac->simd_f =  ff_unpack_2ch_float_to_int16_a_sse2;
         }
+        if(channels == 6) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_6ch_int32_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_6ch_float_to_int32_a_sse2;
+        }
     }
     if(EXTERNAL_SSSE3(mm_flags)) {
         if(channels == 2) {
@@ -116,16 +127,6 @@ MULTI_CAPS_FUNC(SSE2, sse2)
                 ac->simd_f =  ff_unpack_2ch_int16_to_float_a_ssse3;
         }
     }
-    if(EXTERNAL_SSE4(mm_flags)) {
-        if(channels == 6) {
-            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
-                ac->simd_f =  ff_pack_6ch_float_to_float_a_sse4;
-            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
-                ac->simd_f =  ff_pack_6ch_int32_to_float_a_sse4;
-            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
-                ac->simd_f =  ff_pack_6ch_float_to_int32_a_sse4;
-        }
-    }
     if(EXTERNAL_AVX(mm_flags)) {
         if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
             ac->simd_f =  ff_int32_to_float_a_avx;
-- 
GitLab