From e4b36d443428f86d6f9db8af1897553818e76fa0 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Wed, 1 Jun 2005 08:43:40 +0000
Subject: [PATCH] avoid one transpose (730->680 dezicycles on duron)

Originally committed as revision 4332 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/h264.c          | 22 ++++++++++++++++++----
 libavcodec/i386/idct_mmx.c | 11 ++++-------
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index d8dbc3fa0bc..fa419e89810 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -333,6 +333,8 @@ typedef struct H264Context{
     uint8_t     *direct_table;
     uint8_t     direct_cache[5*8];
 
+    uint8_t zigzag_scan[16];
+    uint8_t field_scan[16];
 }H264Context;
 
 static VLC coeff_token_vlc[4];
@@ -2721,6 +2723,18 @@ static int decode_init(AVCodecContext *avctx){
     s->low_delay= 1;
     avctx->pix_fmt= PIX_FMT_YUV420P;
 
+    if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
+        memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
+        memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
+    }else{
+        int i;
+        for(i=0; i<16; i++){
+#define T(x) (((x)>>2) | (((x)<<2) & 0xF)) /* swap the two 2-bit halves of a 4-bit index (x in 0..15): transposes a 4x4 position */
+            h->zigzag_scan[i] = T(zigzag_scan[i]);
+            h-> field_scan[i] = T( field_scan[i]);
+        }
+    }
+
     decode_init_vlc(h);
     
     if(avctx->extradata_size > 0 && avctx->extradata &&
@@ -4591,10 +4605,10 @@ decode_intra_mb:
 //        fill_non_zero_count_cache(h);
 
         if(IS_INTERLACED(mb_type)){
-            scan= field_scan;
+            scan= h->field_scan;
             dc_scan= luma_dc_field_scan;
         }else{
-            scan= zigzag_scan;
+            scan= h->zigzag_scan;
             dc_scan= luma_dc_zigzag_scan;
         }
 
@@ -5575,10 +5589,10 @@ decode_intra_mb:
         int dqp;
 
         if(IS_INTERLACED(mb_type)){
-            scan= field_scan;
+            scan= h->field_scan;
             dc_scan= luma_dc_field_scan;
         }else{
-            scan= zigzag_scan;
+            scan= h->zigzag_scan;
             dc_scan= luma_dc_zigzag_scan;
         }
 
diff --git a/libavcodec/i386/idct_mmx.c b/libavcodec/i386/idct_mmx.c
index 6ce73ae2c32..7e50f4c86bd 100644
--- a/libavcodec/i386/idct_mmx.c
+++ b/libavcodec/i386/idct_mmx.c
@@ -673,14 +673,11 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
         /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
         IDCT4_1D( %%mm3, %%mm2, %%mm1, %%mm0, %%mm4, %%mm5 )
 
-        /* in: 2,4,1,3  out: 2,3,0,1 */
-        TRANSPOSE4( %%mm2, %%mm4, %%mm1, %%mm3, %%mm0 )
-
         "pxor %%mm7, %%mm7    \n\t"
     :: "m"(ff_pw_32));
 
-    STORE_DIFF_4P( %%mm2, %%mm4, %%mm7, &dst[0*stride] );
-    STORE_DIFF_4P( %%mm3, %%mm4, %%mm7, &dst[1*stride] );
-    STORE_DIFF_4P( %%mm0, %%mm4, %%mm7, &dst[2*stride] );
-    STORE_DIFF_4P( %%mm1, %%mm4, %%mm7, &dst[3*stride] );
+    STORE_DIFF_4P( %%mm2, %%mm0, %%mm7, &dst[0*stride] );
+    STORE_DIFF_4P( %%mm4, %%mm0, %%mm7, &dst[1*stride] );
+    STORE_DIFF_4P( %%mm1, %%mm0, %%mm7, &dst[2*stride] );
+    STORE_DIFF_4P( %%mm3, %%mm0, %%mm7, &dst[3*stride] );
 }
-- 
GitLab