diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a972a2779afd750cea99adf4c9da5edc03bf6c12..adf4843a6198aad373e08f7d1eaef6e6eb1b6409 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -50,13 +50,11 @@ OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ - x86/fpel_mmx.o \ x86/idct_mmx_xvid.o \ x86/idct_sse2_xvid.o \ x86/simple_idct.o MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o -MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \ - x86/hpeldsp_mmx.o +MMX-OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_mmx.o MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index b45126cf011f553fc06efb40d4c81273aa968cf5..78d4689ba0ea75d66da9466303ce09e63164229e 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -461,7 +461,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t * #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */ -#if HAVE_MMX_INLINE +#if HAVE_MMX_EXTERNAL static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride) { @@ -485,19 +485,23 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, { ff_avg_pixels16_mmx(dst, src, stride, 16); } +#endif static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx) { +#if HAVE_MMX_EXTERNAL c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx; c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx; c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx; c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx; +#endif +#if HAVE_MMX_INLINE c->cavs_idct8_add = cavs_idct8_add_mmx; c->idct_perm = FF_TRANSPOSE_IDCT_PERM; -} #endif /* HAVE_MMX_INLINE */ +} #define DSPFUNC(PFX, IDX, NUM, EXT) \ c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \ @@ -545,12 +549,9 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c, av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx) { -#if HAVE_MMX_INLINE int cpu_flags = av_get_cpu_flags(); - if (INLINE_MMX(cpu_flags)) - cavsdsp_init_mmx(c, avctx); -#endif /* HAVE_MMX_INLINE */ + cavsdsp_init_mmx(c, avctx); #if HAVE_AMD3DNOW_INLINE if (INLINE_AMD3DNOW(cpu_flags)) cavsdsp_init_3dnow(c, avctx); diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c index 6c7b5218c46e9b439bd0d58ed238c78d65803b48..414da1411849d5cefa0d1650237b871bb68c9687 100644 --- a/libavcodec/x86/dsputil_init.c +++ b/libavcodec/x86/dsputil_init.c @@ -73,8 +73,8 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride); void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride); -#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext -#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext +#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx +#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2, int order); @@ -112,8 +112,11 @@ void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src, #if HAVE_YASM -CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8) -CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8) +#define ff_put_pixels16_mmxext ff_put_pixels16_mmx +#define ff_put_pixels8_mmxext ff_put_pixels8_mmx + +void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); #define QPEL_OP(OPNAME, RND, MMX) \ static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, uint8_t *src, \ diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 86100ba6ff86e9c6791364b3b4b00be1fea03b41..42e25c4ff6ad292c19fff89a9e9968b23c6d7282 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -554,13 +554,12 @@ void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[ }\ } -#if HAVE_MMX_INLINE -CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8) +#if HAVE_YASM +void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); + DIRAC_PIXOP(put, ff_put, mmx) DIRAC_PIXOP(avg, ff_avg, mmx) -#endif - -#if HAVE_YASM DIRAC_PIXOP(avg, ff_avg, mmxext) void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm index de6b1d6c1b5cb0f84b01b3fec978decbf2f170ff..0e3b444e2af01b0e7e2aa176aca340ad635d4667 100644 --- a/libavcodec/x86/fpel.asm +++ b/libavcodec/x86/fpel.asm @@ -25,85 +25,83 @@ SECTION .text -INIT_MMX mmxext +%macro PAVGB_MMX 4 + LOAD %3, %1 + por %3, %2 + pxor %2, %1 + pand %2, %4 + psrlq %2, 1 + psubb %3, %2 + SWAP %2, %3 +%endmacro + ; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels, ; ptrdiff_t line_size, int h) -%macro PIXELS48 2 -%if %2 == 4 -%define OP movh +%macro OP_PIXELS 2 +%if %2 == mmsize/2 +%define LOAD movh +%define SAVE movh +%define LEN mmsize %else -%define OP mova +%define LOAD movu +%define SAVE mova +%define LEN %2 %endif -cglobal %1_pixels%2, 4,5 +cglobal %1_pixels%2, 4,5,4 movsxdifnidn r2, r2d lea r4, [r2*3] +%ifidn %1, avg +%if notcpuflag(mmxext) + pcmpeqd m6, m6 + paddb m6, m6 +%endif +%endif .loop: - OP m0, [r1] - OP m1, [r1+r2] - OP m2, [r1+r2*2] - OP m3, [r1+r4] - lea r1, [r1+r2*4] +%assign %%i 0 +%rep LEN/mmsize + LOAD m0, [r1 + %%i] + LOAD m1, [r1+r2 + %%i] + LOAD m2, [r1+r2*2 + %%i] + LOAD m3, [r1+r4 + %%i] %ifidn %1, avg - pavgb m0, [r0] - pavgb m1, [r0+r2] - pavgb m2, [r0+r2*2] - pavgb m3, [r0+r4] +%if notcpuflag(mmxext) + PAVGB_MMX [r0 + %%i], m0, m4, m6 + PAVGB_MMX [r0+r2 + %%i], m1, m5, m6 + PAVGB_MMX [r0+r2*2 + %%i], m2, m4, m6 + PAVGB_MMX [r0+r4 + %%i], m3, m5, m6 +%else + pavgb m0, [r0 + %%i] + pavgb m1, [r0+r2 + %%i] + pavgb m2, [r0+r2*2 + %%i] + pavgb m3, [r0+r4 + %%i] +%endif %endif - OP [r0], m0 - OP [r0+r2], m1 - OP [r0+r2*2], m2 - OP [r0+r4], m3 + SAVE [r0 + %%i], m0 + SAVE [r0+r2 + %%i], m1 + SAVE [r0+r2*2 + %%i], m2 + SAVE [r0+r4 + %%i], m3 +%assign %%i %%i+mmsize +%endrep sub r3d, 4 + lea r1, [r1+r2*4] lea r0, [r0+r2*4] jne .loop RET %endmacro -PIXELS48 put, 4 -PIXELS48 avg, 4 -PIXELS48 put, 8 -PIXELS48 avg, 8 +INIT_MMX mmx +OP_PIXELS put, 4 +OP_PIXELS avg, 4 +OP_PIXELS put, 8 +OP_PIXELS avg, 8 +OP_PIXELS put, 16 +OP_PIXELS avg, 16 +INIT_MMX mmxext +OP_PIXELS avg, 4 +OP_PIXELS avg, 8 +OP_PIXELS avg, 16 INIT_XMM sse2 -; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, -; ptrdiff_t line_size, int h) -cglobal put_pixels16, 4,5,4 - lea r4, [r2*3] -.loop: - movu m0, [r1] - movu m1, [r1+r2] - movu m2, [r1+r2*2] - movu m3, [r1+r4] - lea r1, [r1+r2*4] - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+r2*2], m2 - mova [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jnz .loop - REP_RET - -; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, -; ptrdiff_t line_size, int h) -cglobal avg_pixels16, 4,5,4 - lea r4, [r2*3] -.loop: - movu m0, [r1] - movu m1, [r1+r2] - movu m2, [r1+r2*2] - movu m3, [r1+r4] - lea r1, [r1+r2*4] - pavgb m0, [r0] - pavgb m1, [r0+r2] - pavgb m2, [r0+r2*2] - pavgb m3, [r0+r4] - mova [r0], m0 - mova [r0+r2], m1 - mova [r0+r2*2], m2 - mova [r0+r4], m3 - sub r3d, 4 - lea r0, [r0+r2*4] - jnz .loop - REP_RET +OP_PIXELS put, 16 +OP_PIXELS avg, 16 diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c deleted file mode 100644 index 45cc57e6d7175bbef705945786f982f50317cb9c..0000000000000000000000000000000000000000 --- a/libavcodec/x86/fpel_mmx.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * MMX-optimized avg/put pixel routines - * - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> -#include <stdint.h> - -#include "config.h" -#include "fpel.h" -#include "inline_asm.h" - -#if HAVE_MMX_INLINE - -// in case more speed is needed - unrolling would certainly help -void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - __asm__ volatile ( - "lea (%3, %3), %%"REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1 ), %%mm0 \n\t" - "movq 8(%1 ), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r"(pixels), "+r"(block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -#endif /* HAVE_MMX_INLINE */ diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c index 8edaeacde62a9cf50acb267ae47d3b0e761856ac..1a06acab2edcf2d89fdbcd62f3677583c4f69a52 100644 --- a/libavcodec/x86/h264_qpel.c +++ b/libavcodec/x86/h264_qpel.c @@ -29,8 +29,8 @@ #include "fpel.h" #if HAVE_YASM -void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); +void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, @@ -49,9 +49,12 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, #define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext +#define ff_put_pixels16_mmxext ff_put_pixels16_mmx +#define ff_put_pixels8_mmxext ff_put_pixels8_mmx +#define ff_put_pixels4_mmxext ff_put_pixels4_mmx -CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8) -CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8) +void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); #define DEF_QPEL(OPNAME)\ void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index f860533f7e5d923341faafc3ac7746b5abfb0e7d..e9878cf916605285fab1d028e05c29fa14ebebd8 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -165,15 +165,17 @@ HPELDSP_AVG_PIXELS16(_mmxext) #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ do { \ + if (HAVE_MMX_EXTERNAL) \ c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ + if (HAVE_MMX_INLINE) { \ c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ + } \ } while (0) static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags) { -#if HAVE_MMX_INLINE SET_HPEL_FUNCS(put, [0], 16, mmx); SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); SET_HPEL_FUNCS(avg, [0], 16, mmx); @@ -181,7 +183,6 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags) SET_HPEL_FUNCS(put, [1], 8, mmx); SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); SET_HPEL_FUNCS(avg, [1], 8, mmx); -#endif /* HAVE_MMX_INLINE */ } static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index 87e4638ca79d58ecb4b3790f691754b43b34e1e8..77a8e359ff76f9700b33b39a2c4fd8a12ef7f835 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -728,6 +728,7 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, ); } +#if HAVE_MMX_EXTERNAL static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd) { @@ -748,6 +749,7 @@ static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src, { ff_avg_pixels16_mmx(dst, src, stride, 16); } +#endif #define FN_ASSIGN(OP, X, Y, INSN) \ dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \ @@ -755,7 +757,10 @@ static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src, av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) { +#if HAVE_MMX_EXTERNAL FN_ASSIGN(put_, 0, 0, _mmx); + FN_ASSIGN(avg_, 0, 0, _mmx); +#endif FN_ASSIGN(put_, 0, 1, _mmx); FN_ASSIGN(put_, 0, 2, _mmx); FN_ASSIGN(put_, 0, 3, _mmx); @@ -774,8 +779,6 @@ av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) FN_ASSIGN(put_, 3, 1, _mmx); FN_ASSIGN(put_, 3, 2, _mmx); FN_ASSIGN(put_, 3, 3, _mmx); - - FN_ASSIGN(avg_, 0, 0, _mmx); } av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)