diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 483ae7b0484dbfdbdad8cdb363a4adcbc0efc08b..de0f22c2dcbb0ccf070fd8a2128471f783cb1fb9 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -428,7 +428,7 @@ DECLARE_REG 14, R15, 120
     %assign %%i xmm_regs_used
     %rep (xmm_regs_used-6)
         %assign %%i %%i-1
-        movdqa [rsp + (%%i-6)*16 + stack_size], xmm %+ %%i
+        movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
     %endrep
 %endmacro
 
@@ -436,8 +436,7 @@ DECLARE_REG 14, R15, 120
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
     %if xmm_regs_used > 6
-        %assign stack_size_padded (xmm_regs_used-6)*16+16-gprsize-(stack_offset&15)
-        SUB rsp, stack_size_padded
+        SUB rsp, (xmm_regs_used-6)*16+16
         WIN64_PUSH_XMM
     %endif
 %endmacro
@@ -447,8 +446,11 @@ DECLARE_REG 14, R15, 120
         %assign %%i xmm_regs_used
         %rep (xmm_regs_used-6)
             %assign %%i %%i-1
-            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size]
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
         %endrep
+        %if stack_size_padded == 0
+            add %1, (xmm_regs_used-6)*16+16
+        %endif
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
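
For reviewers, a short note on the arithmetic: on Win64, rsp is 16*n+8 at function
entry (the call pushed the return address), and x86inc's stack_offset counts the
bytes pushed since then, always a multiple of 8. SUB rsp, (xmm_regs_used-6)*16+16
preserves rsp modulo 16, so the xmm save area would be misaligned by 8 exactly when
stack_offset is a multiple of 16; the (~stack_offset&8) term adds those 8 bytes back
so the movdqa stores and loads stay 16-byte aligned. A minimal C sketch of that
invariant (illustrative only, not part of the patch; the entry address, the
8-register count and stack_size == 0 are assumptions made for the example):

    /* Checks that rsp + (~stack_offset & 8) is 16-byte aligned for every
     * plausible prologue, mirroring the addresses used by WIN64_PUSH_XMM
     * in the stack_size == 0 case. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (uint64_t stack_offset = 0; stack_offset <= 64; stack_offset += 8) {
            uint64_t rsp = 0x7ffffff8;    /* 16-byte aligned minus 8 at entry */
            rsp -= stack_offset;          /* GPR pushes done in the prologue  */
            rsp -= (8 - 6) * 16 + 16;     /* SUB rsp, (xmm_regs_used-6)*16+16,
                                             here with xmm_regs_used == 8     */
            assert((rsp + (~stack_offset & 8)) % 16 == 0);
        }
        return 0;
    }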