Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
F
FFmpeg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
libremedia
Tethys
FFmpeg
Commits
6204feb1
Commit
6204feb1
authored
13 years ago
by
Vitor Sessak
Committed by
Reinhard Tartler
13 years ago
Browse files
Options
Downloads
Patches
Plain Diff
dct32: Add AVX implementation of 32-point DCT
parent
4e653b98
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
libavcodec/mpegaudiodec.c
+2
-2
2 additions, 2 deletions
libavcodec/mpegaudiodec.c
libavcodec/x86/dct32_sse.asm
+218
-116
218 additions, 116 deletions
libavcodec/x86/dct32_sse.asm
libavcodec/x86/fft.c
+3
-1
3 additions, 1 deletion
libavcodec/x86/fft.c
libavcodec/x86/fft.h
+1
-0
1 addition, 0 deletions
libavcodec/x86/fft.h
with
224 additions
and
119 deletions
libavcodec/mpegaudiodec.c
+
2
−
2
View file @
6204feb1
...
...
@@ -69,9 +69,9 @@ typedef struct MPADecodeContext {
uint32_t
free_format_next_header
;
GetBitContext
gb
;
GetBitContext
in_gb
;
DECLARE_ALIGNED
(
16
,
MPA_INT
,
synth_buf
)[
MPA_MAX_CHANNELS
][
512
*
2
];
DECLARE_ALIGNED
(
32
,
MPA_INT
,
synth_buf
)[
MPA_MAX_CHANNELS
][
512
*
2
];
int
synth_buf_offset
[
MPA_MAX_CHANNELS
];
DECLARE_ALIGNED
(
16
,
INTFLOAT
,
sb_samples
)[
MPA_MAX_CHANNELS
][
36
][
SBLIMIT
];
DECLARE_ALIGNED
(
32
,
INTFLOAT
,
sb_samples
)[
MPA_MAX_CHANNELS
][
36
][
SBLIMIT
];
INTFLOAT
mdct_buf
[
MPA_MAX_CHANNELS
][
SBLIMIT
*
18
];
/* previous samples, for layer 3 MDCT */
GranuleDef
granules
[
2
][
2
];
/* Used in Layer 3 */
#ifdef DEBUG
...
...
This diff is collapsed.
Click to expand it.
libavcodec/x86/dct32_sse.asm
+
218
−
116
View file @
6204feb1
...
...
@@ -20,31 +20,41 @@
;******************************************************************************
%include "x86inc.asm"
%include "config.asm"
SECTION
_RODATA
32
align
32
ps_cos_vec:
dd
0.500603
,
0.505471
,
0.515447
,
0.531043
dd
0.553104
,
0.582935
,
0.622504
,
0.674808
dd
-
1.169440
,
-
0.972568
,
-
0.839350
,
-
0.744536
dd
-
10.190008
,
-
3.407609
,
-
2.057781
,
-
1.484165
dd
-
1.169440
,
-
0.972568
,
-
0.839350
,
-
0.744536
dd
0.502419
,
0.522499
,
0.566944
,
0.646822
dd
0.788155
,
1.060678
,
1.722447
,
5.101149
dd
0.509796
,
0.601345
,
0.899976
,
2.562916
dd
0.509796
,
0.601345
,
0.899976
,
2.562916
dd
1.000000
,
1.000000
,
1.306563
,
0.541196
dd
1.000000
,
1.000000
,
1.306563
,
0.541196
dd
1.000000
,
0.707107
,
1.000000
,
-
0.707107
dd
1.000000
,
0.707107
,
1.000000
,
-
0.707107
ps_p1p1m1m1:
dd
0
,
0
,
0x80000000
,
0x80000000
ps_p1p1m1m1:
dd
0
,
0
,
0x80000000
,
0x80000000
,
0
,
0
,
0x80000000
,
0x80000000
%macro BUTTERFLY 4
%macro BUTTERFLY
_SSE
4
movaps
%
4
,
%
1
subps
%
1
,
%
2
addps
%
2
,
%
4
mulps
%
1
,
%
3
%endmacro
%macro BUTTERFLY0 5
%macro BUTTERFLY_AVX 4
vsubps
%
4
,
%
1
,
%
2
vaddps
%
2
,
%
2
,
%
1
vmulps
%
1
,
%
4
,
%
3
%endmacro
%macro BUTTERFLY0_SSE 5
movaps
%
4
,
%
1
shufps
%
1
,
%
1
,
%
5
xorps
%
4
,
%
2
...
...
@@ -52,6 +62,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
mulps
%
1
,
%
3
%endmacro
%macro BUTTERFLY0_AVX 5
vshufps
%
4
,
%
1
,
%
1
,
%
5
vxorps
%
1
,
%
1
,
%
2
vaddps
%
4
,
%
4
,
%
1
vmulps
%
1
,
%
4
,
%
3
%endmacro
%macro BUTTERFLY2 4
BUTTERFLY0
%
1
,
%
2
,
%
3
,
%
4
,
0x1b
%endmacro
...
...
@@ -60,8 +77,199 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
BUTTERFLY0
%
1
,
%
2
,
%
3
,
%
4
,
0xb1
%endmacro
INIT_XMM
%macro PASS6_AND_PERMUTE 0
mov
tmpd
,
[
outq
+
4
]
movss
m7
,
[
outq
+
72
]
addss
m7
,
[
outq
+
76
]
movss
m3
,
[
outq
+
56
]
addss
m3
,
[
outq
+
60
]
addss
m4
,
m3
movss
m2
,
[
outq
+
52
]
addss
m2
,
m3
movss
m3
,
[
outq
+
104
]
addss
m3
,
[
outq
+
108
]
addss
m1
,
m3
addss
m5
,
m4
movss
[
outq
+
16
],
m1
movss
m1
,
[
outq
+
100
]
addss
m1
,
m3
movss
m3
,
[
outq
+
40
]
movss
[
outq
+
48
],
m1
addss
m3
,
[
outq
+
44
]
movss
m1
,
[
outq
+
100
]
addss
m4
,
m3
addss
m3
,
m2
addss
m1
,
[
outq
+
108
]
movss
[
outq
+
40
],
m3
addss
m2
,
[
outq
+
36
]
movss
m3
,
[
outq
+
8
]
movss
[
outq
+
56
],
m2
addss
m3
,
[
outq
+
12
]
movss
[
outq
+
32
],
m3
movss
m3
,
[
outq
+
80
]
movss
[
outq
+
8
],
m5
movss
[
outq
+
80
],
m1
movss
m2
,
[
outq
+
52
]
movss
m5
,
[
outq
+
120
]
addss
m5
,
[
outq
+
124
]
movss
m1
,
[
outq
+
64
]
addss
m2
,
[
outq
+
60
]
addss
m0
,
m5
addss
m5
,
[
outq
+
116
]
mov
[
outq
+
64
],
tmpd
addss
m6
,
m0
addss
m1
,
m6
mov
tmpd
,
[
outq
+
12
]
mov
[
outq
+
96
],
tmpd
movss
[
outq
+
4
],
m1
movss
m1
,
[
outq
+
24
]
movss
[
outq
+
24
],
m4
movss
m4
,
[
outq
+
88
]
addss
m4
,
[
outq
+
92
]
addss
m3
,
m4
addss
m4
,
[
outq
+
84
]
mov
tmpd
,
[
outq
+
108
]
addss
m1
,
[
outq
+
28
]
addss
m0
,
m1
addss
m1
,
m5
addss
m6
,
m3
addss
m3
,
m0
addss
m0
,
m7
addss
m5
,
[
outq
+
20
]
addss
m7
,
m1
movss
[
outq
+
12
],
m6
mov
[
outq
+
112
],
tmpd
movss
m6
,
[
outq
+
28
]
movss
[
outq
+
28
],
m0
movss
m0
,
[
outq
+
36
]
movss
[
outq
+
36
],
m7
addss
m1
,
m4
movss
m7
,
[
outq
+
116
]
addss
m0
,
m2
addss
m7
,
[
outq
+
124
]
movss
[
outq
+
72
],
m0
movss
m0
,
[
outq
+
44
]
addss
m2
,
m0
movss
[
outq
+
44
],
m1
movss
[
outq
+
88
],
m2
addss
m0
,
[
outq
+
60
]
mov
tmpd
,
[
outq
+
60
]
mov
[
outq
+
120
],
tmpd
movss
[
outq
+
104
],
m0
addss
m4
,
m5
addss
m5
,
[
outq
+
68
]
movss
[
outq
+
52
],
m4
movss
[
outq
+
60
],
m5
movss
m4
,
[
outq
+
68
]
movss
m5
,
[
outq
+
20
]
movss
[
outq
+
20
],
m3
addss
m5
,
m7
addss
m7
,
m6
addss
m4
,
m5
movss
m2
,
[
outq
+
84
]
addss
m2
,
[
outq
+
92
]
addss
m5
,
m2
movss
[
outq
+
68
],
m4
addss
m2
,
m7
movss
m4
,
[
outq
+
76
]
movss
[
outq
+
84
],
m2
movss
[
outq
+
76
],
m5
addss
m7
,
m4
addss
m6
,
[
outq
+
124
]
addss
m4
,
m6
addss
m6
,
[
outq
+
92
]
movss
[
outq
+
100
],
m4
movss
[
outq
+
108
],
m6
movss
m6
,
[
outq
+
92
]
movss
[
outq
+
92
],
m7
addss
m6
,
[
outq
+
124
]
movss
[
outq
+
116
],
m6
%endmacro
%define BUTTERFLY BUTTERFLY_AVX
%define BUTTERFLY0 BUTTERFLY0_AVX
INIT_YMM
section
.text
al
ign
=
16
%ifdef HAVE_AVX
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal
dct32_float_avx
,
2
,
3
,
8
,
out
,
in
,
tmp
; pass 1
vmovaps
m4
,
[
inq
+
0
]
vinsertf128
m5
,
m5
,
[
inq
+
96
],
1
vinsertf128
m5
,
m5
,
[
inq
+
112
],
0
vshufps
m5
,
m5
,
m5
,
0x1b
BUTTERFLY
m4
,
m5
,
[
ps_cos_vec
],
m6
vmovaps
m2
,
[
inq
+
64
]
vinsertf128
m6
,
m6
,
[
inq
+
32
],
1
vinsertf128
m6
,
m6
,
[
inq
+
48
],
0
vshufps
m6
,
m6
,
m6
,
0x1b
BUTTERFLY
m2
,
m6
,
[
ps_cos_vec
+
32
],
m0
; pass 2
BUTTERFLY
m5
,
m6
,
[
ps_cos_vec
+
64
],
m0
BUTTERFLY
m4
,
m2
,
[
ps_cos_vec
+
64
],
m7
; pass 3
vperm2f128
m3
,
m6
,
m4
,
0x31
vperm2f128
m1
,
m6
,
m4
,
0x20
vshufps
m3
,
m3
,
m3
,
0x1b
BUTTERFLY
m1
,
m3
,
[
ps_cos_vec
+
96
],
m6
vperm2f128
m4
,
m5
,
m2
,
0x20
vperm2f128
m5
,
m5
,
m2
,
0x31
vshufps
m5
,
m5
,
m5
,
0x1b
BUTTERFLY
m4
,
m5
,
[
ps_cos_vec
+
96
],
m6
; pass 4
vmovaps
m6
,
[
ps_p1p1m1m1
+
0
]
vmovaps
m2
,
[
ps_cos_vec
+
128
]
BUTTERFLY2
m5
,
m6
,
m2
,
m7
BUTTERFLY2
m4
,
m6
,
m2
,
m7
BUTTERFLY2
m1
,
m6
,
m2
,
m7
BUTTERFLY2
m3
,
m6
,
m2
,
m7
; pass 5
vshufps
m6
,
m6
,
m6
,
0xcc
vmovaps
m2
,
[
ps_cos_vec
+
160
]
BUTTERFLY3
m5
,
m6
,
m2
,
m7
BUTTERFLY3
m4
,
m6
,
m2
,
m7
BUTTERFLY3
m1
,
m6
,
m2
,
m7
BUTTERFLY3
m3
,
m6
,
m2
,
m7
vperm2f128
m6
,
m3
,
m3
,
0x31
vmovaps
[
outq
],
m3
vextractf128
[
outq
+
64
],
m5
,
1
vextractf128
[
outq
+
32
],
m5
,
0
vextractf128
[
outq
+
80
],
m4
,
1
vextractf128
[
outq
+
48
],
m4
,
0
vperm2f128
m0
,
m1
,
m1
,
0x31
vmovaps
[
outq
+
96
],
m1
vzeroupper
; pass 6, no SIMD...
INIT_XMM
PASS6_AND_PERMUTE
RET
%endif
%define BUTTERFLY BUTTERFLY_SSE
%define BUTTERFLY0 BUTTERFLY0_SSE
INIT_XMM
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
cglobal
dct32_float_sse
,
2
,
3
,
8
,
out
,
in
,
tmp
; pass 1
...
...
@@ -74,8 +282,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps
m7
,
[
inq
+
64
]
movaps
m4
,
[
inq
+
48
]
shufps
m4
,
m4
,
0x1b
BUTTERFLY
m7
,
m4
,
[
ps_cos_vec
+
48
],
m3
BUTTERFLY
m7
,
m4
,
[
ps_cos_vec
+
32
],
m3
; pass 2
movaps
m2
,
[
ps_cos_vec
+
64
]
...
...
@@ -92,7 +299,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps
m4
,
[
inq
+
80
]
movaps
m5
,
[
inq
+
32
]
shufps
m5
,
m5
,
0x1b
BUTTERFLY
m4
,
m5
,
[
ps_cos_vec
+
32
],
m3
BUTTERFLY
m4
,
m5
,
[
ps_cos_vec
+
48
],
m3
; pass 2
BUTTERFLY
m0
,
m7
,
m2
,
m3
...
...
@@ -123,7 +330,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 4
movaps
m3
,
[
ps_p1p1m1m1
+
0
]
movaps
m2
,
[
ps_cos_vec
+
1
1
2
]
movaps
m2
,
[
ps_cos_vec
+
12
8
]
BUTTERFLY2
m5
,
m3
,
m2
,
m1
...
...
@@ -148,7 +355,7 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
BUTTERFLY2
m0
,
m3
,
m2
,
m1
; pass 5
movaps
m2
,
[
ps_cos_vec
+
1
28
]
movaps
m2
,
[
ps_cos_vec
+
1
60
]
shufps
m3
,
m3
,
0xcc
BUTTERFLY3
m5
,
m3
,
m2
,
m1
...
...
@@ -180,110 +387,5 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 6, no SIMD...
mov
tmpd
,
[
outq
+
4
]
movss
m7
,
[
outq
+
72
]
addss
m7
,
[
outq
+
76
]
movss
m3
,
[
outq
+
56
]
addss
m3
,
[
outq
+
60
]
addss
m4
,
m3
movss
m2
,
[
outq
+
52
]
addss
m2
,
m3
movss
m3
,
[
outq
+
104
]
addss
m3
,
[
outq
+
108
]
addss
m1
,
m3
addss
m5
,
m4
movss
[
outq
+
16
],
m1
movss
m1
,
[
outq
+
100
]
addss
m1
,
m3
movss
m3
,
[
outq
+
40
]
movss
[
outq
+
48
],
m1
addss
m3
,
[
outq
+
44
]
movss
m1
,
[
outq
+
100
]
addss
m4
,
m3
addss
m3
,
m2
addss
m1
,
[
outq
+
108
]
movss
[
outq
+
40
],
m3
addss
m2
,
[
outq
+
36
]
movss
m3
,
[
outq
+
8
]
movss
[
outq
+
56
],
m2
addss
m3
,
[
outq
+
12
]
movss
[
outq
+
32
],
m3
movss
m3
,
[
outq
+
80
]
movss
[
outq
+
8
],
m5
movss
[
outq
+
80
],
m1
movss
m2
,
[
outq
+
52
]
movss
m5
,
[
outq
+
120
]
addss
m5
,
[
outq
+
124
]
movss
m1
,
[
outq
+
64
]
addss
m2
,
[
outq
+
60
]
addss
m0
,
m5
addss
m5
,
[
outq
+
116
]
mov
[
outq
+
64
],
tmpd
addss
m6
,
m0
addss
m1
,
m6
mov
tmpd
,
[
outq
+
12
]
mov
[
outq
+
96
],
tmpd
movss
[
outq
+
4
],
m1
movss
m1
,
[
outq
+
24
]
movss
[
outq
+
24
],
m4
movss
m4
,
[
outq
+
88
]
addss
m4
,
[
outq
+
92
]
addss
m3
,
m4
addss
m4
,
[
outq
+
84
]
mov
tmpd
,
[
outq
+
108
]
addss
m1
,
[
outq
+
28
]
addss
m0
,
m1
addss
m1
,
m5
addss
m6
,
m3
addss
m3
,
m0
addss
m0
,
m7
addss
m5
,
[
outq
+
20
]
addss
m7
,
m1
movss
[
outq
+
12
],
m6
mov
[
outq
+
112
],
tmpd
movss
m6
,
[
outq
+
28
]
movss
[
outq
+
28
],
m0
movss
m0
,
[
outq
+
36
]
movss
[
outq
+
36
],
m7
addss
m1
,
m4
movss
m7
,
[
outq
+
116
]
addss
m0
,
m2
addss
m7
,
[
outq
+
124
]
movss
[
outq
+
72
],
m0
movss
m0
,
[
outq
+
44
]
addss
m2
,
m0
movss
[
outq
+
44
],
m1
movss
[
outq
+
88
],
m2
addss
m0
,
[
outq
+
60
]
mov
tmpd
,
[
outq
+
60
]
mov
[
outq
+
120
],
tmpd
movss
[
outq
+
104
],
m0
addss
m4
,
m5
addss
m5
,
[
outq
+
68
]
movss
[
outq
+
52
],
m4
movss
[
outq
+
60
],
m5
movss
m4
,
[
outq
+
68
]
movss
m5
,
[
outq
+
20
]
movss
[
outq
+
20
],
m3
addss
m5
,
m7
addss
m7
,
m6
addss
m4
,
m5
movss
m2
,
[
outq
+
84
]
addss
m2
,
[
outq
+
92
]
addss
m5
,
m2
movss
[
outq
+
68
],
m4
addss
m2
,
m7
movss
m4
,
[
outq
+
76
]
movss
[
outq
+
84
],
m2
movss
[
outq
+
76
],
m5
addss
m7
,
m4
addss
m6
,
[
outq
+
124
]
addss
m4
,
m6
addss
m6
,
[
outq
+
92
]
movss
[
outq
+
100
],
m4
movss
[
outq
+
108
],
m6
movss
m6
,
[
outq
+
92
]
movss
[
outq
+
92
],
m7
addss
m6
,
[
outq
+
124
]
movss
[
outq
+
116
],
m6
PASS6_AND_PERMUTE
RET
This diff is collapsed.
Click to expand it.
libavcodec/x86/fft.c
+
3
−
1
View file @
6204feb1
...
...
@@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
av_cold
void
ff_dct_init_mmx
(
DCTContext
*
s
)
{
int
has_vectors
=
av_get_cpu_flags
();
if
(
has_vectors
&
AV_CPU_FLAG_SSE
&&
HAVE_SSE
)
if
(
has_vectors
&
AV_CPU_FLAG_AVX
&&
HAVE_AVX
)
s
->
dct32
=
ff_dct32_float_avx
;
else
if
(
has_vectors
&
AV_CPU_FLAG_SSE
&&
HAVE_SSE
)
s
->
dct32
=
ff_dct32_float_sse
;
}
#endif
...
...
This diff is collapsed.
Click to expand it.
libavcodec/x86/fft.h
+
1
−
0
View file @
6204feb1
...
...
@@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
void
ff_imdct_half_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_avx
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_dct32_float_sse
(
FFTSample
*
out
,
const
FFTSample
*
in
);
void
ff_dct32_float_avx
(
FFTSample
*
out
,
const
FFTSample
*
in
);
#endif
/* AVCODEC_X86_FFT_H */
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment