From 388e0d2515bc6bbc9d0c9af1d230bd16cf945fe7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Sat, 17 Dec 2016 13:14:38 +0200
Subject: [PATCH] aarch64: vp9mc: Calculate less unused data in the 4 pixel
 wide horizontal filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

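For the 4 pixel wide case, do the multiply/accumulate and saturating
add steps on .4h vectors instead of .8h, since only the low four
lanes of the result are used.

No measurable speedup on a Cortex-A53, but other cores might benefit.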

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/vp9mc_neon.S | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 99f18092709..95ed26c2325 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -202,9 +202,12 @@ endfunc
         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
         mla             \dst2\().8h, v21.8h, v0.h[\offset]
         mla             \dst4\().8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
         mla             \dst1\().8h, v20.8h, v0.h[\offset]
         mla             \dst3\().8h, v22.8h, v0.h[\offset]
+.else
+        mla             \dst1\().4h, v20.4h, v0.h[\offset]
+        mla             \dst3\().4h, v22.4h, v0.h[\offset]
 .endif
 .endm
 // The same as above, but don't accumulate straight into the
@@ -219,16 +222,24 @@ endfunc
         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
         mul             v21.8h, v21.8h, v0.h[\offset]
         mul             v23.8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
         mul             v20.8h, v20.8h, v0.h[\offset]
         mul             v22.8h, v22.8h, v0.h[\offset]
+.else
+        mul             v20.4h, v20.4h, v0.h[\offset]
+        mul             v22.4h, v22.4h, v0.h[\offset]
 .endif
+.if \size == 4
+        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
+        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
+.else
         sqadd           \dst1\().8h, \dst1\().8h, v20.8h
         sqadd           \dst3\().8h, \dst3\().8h, v22.8h
 .if \size >= 16
         sqadd           \dst2\().8h, \dst2\().8h, v21.8h
         sqadd           \dst4\().8h, \dst4\().8h, v23.8h
 .endif
+.endif
 .endm
 
 
-- 
GitLab