Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
F
FFmpeg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
libremedia
Tethys
FFmpeg
Commits
57b5b84e
Commit
57b5b84e
authored
11 years ago
by
Diego Biurrun
Browse files
Options
Downloads
Patches
Plain Diff
x86: dsputil: Move ff_apply_window_int16_* bits to ac3dsp, where they belong
parent
c2c5be57
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
libavcodec/x86/ac3dsp.asm
+131
-0
131 additions, 0 deletions
libavcodec/x86/ac3dsp.asm
libavcodec/x86/dsputil.asm
+0
-130
0 additions, 130 deletions
libavcodec/x86/dsputil.asm
with
131 additions
and
130 deletions
libavcodec/x86/ac3dsp.asm
+
131
−
0
View file @
57b5b84e
...
@@ -35,6 +35,10 @@ pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
...
@@ -35,6 +35,10 @@ pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
pd_1:
times
4
dd
1
pd_1:
times
4
dd
1
pd_151:
times
4
dd
151
pd_151:
times
4
dd
151
; used in ff_apply_window_int16()
pb_revwords:
SHUFFLE_MASK_W
7
,
6
,
5
,
4
,
3
,
2
,
1
,
0
pd_16384:
times
4
dd
16384
SECTION
.text
SECTION
.text
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
...
@@ -419,3 +423,130 @@ AC3_EXTRACT_EXPONENTS
...
@@ -419,3 +423,130 @@ AC3_EXTRACT_EXPONENTS
INIT_XMM
ss
se3
INIT_XMM
ss
se3
AC3_EXTRACT_EXPONENTS
AC3_EXTRACT_EXPONENTS
%endif
%endif
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
pshufb
%
1
,
%
2
%elif cpuflag(sse2)
pshuflw
%
1
,
%
1
,
0x1B
pshufhw
%
1
,
%
1
,
0x1B
pshufd
%
1
,
%
1
,
0x4E
%elif cpuflag(mmxext)
pshufw
%
1
,
%
1
,
0x1B
%endif
%endmacro
%macro MUL16FIXED 3
%if cpuflag(ssse3)
; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
pmulhrsw
%
1
,
%
2
%elif cpuflag(mmxext)
; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
mova
%
3
,
%
1
pmulhw
%
1
,
%
2
pmullw
%
3
,
%
2
psrlw
%
3
,
15
psllw
%
1
,
1
por
%
1
,
%
3
%endif
%endmacro
%macro APPLY_WINDOW_INT16 1
; %1 bitexact version
%if %1
cglobal
apply_window_int16
,
4
,
5
,
6
,
output
,
input
,
window
,
offset
,
offset2
%else
cglobal
apply_window_int16_round
,
4
,
5
,
6
,
output
,
input
,
window
,
offset
,
offset2
%endif
lea
offset2q
,
[
offsetq
-
mmsize
]
%if cpuflag(ssse3) && notcpuflag(atom)
mova
m5
,
[
pb_revwords
]
ALIGN
16
%elif %1
mova
m5
,
[
pd_16384
]
%endif
.loop:
%if cpuflag(ssse3)
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova
m0
,
[
windowq
+
offset2q
]
mova
m1
,
[
inputq
+
offset2q
]
pmulhrsw
m1
,
m0
REVERSE_WORDS
m0
,
m5
pmulhrsw
m0
,
[
inputq
+
offsetq
]
mova
[
outputq
+
offset2q
],
m1
mova
[
outputq
+
offsetq
],
m0
%elif %1
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
mova
m3
,
[
windowq
+
offset2q
]
mova
m4
,
[
inputq
+
offset2q
]
pxor
m0
,
m0
punpcklwd
m0
,
m3
punpcklwd
m1
,
m4
pmaddwd
m0
,
m1
paddd
m0
,
m5
psrad
m0
,
15
pxor
m2
,
m2
punpckhwd
m2
,
m3
punpckhwd
m1
,
m4
pmaddwd
m2
,
m1
paddd
m2
,
m5
psrad
m2
,
15
packssdw
m0
,
m2
mova
[
outputq
+
offset2q
],
m0
REVERSE_WORDS
m3
mova
m4
,
[
inputq
+
offsetq
]
pxor
m0
,
m0
punpcklwd
m0
,
m3
punpcklwd
m1
,
m4
pmaddwd
m0
,
m1
paddd
m0
,
m5
psrad
m0
,
15
pxor
m2
,
m2
punpckhwd
m2
,
m3
punpckhwd
m1
,
m4
pmaddwd
m2
,
m1
paddd
m2
,
m5
psrad
m2
,
15
packssdw
m0
,
m2
mova
[
outputq
+
offsetq
],
m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova
m0
,
[
windowq
+
offset2q
]
mova
m1
,
[
inputq
+
offset2q
]
mova
m2
,
[
inputq
+
offsetq
]
MUL16FIXED
m1
,
m0
,
m3
REVERSE_WORDS
m0
MUL16FIXED
m2
,
m0
,
m3
mova
[
outputq
+
offset2q
],
m1
mova
[
outputq
+
offsetq
],
m2
%endif
add
offsetd
,
mmsize
sub
offset2d
,
mmsize
jae
.loop
REP_RET
%endmacro
INIT_MMX
mmxext
APPLY_WINDOW_INT16
0
INIT_XMM
ss
e2
APPLY_WINDOW_INT16
0
INIT_MMX
mmxext
APPLY_WINDOW_INT16
1
INIT_XMM
ss
e2
APPLY_WINDOW_INT16
1
INIT_XMM
ss
se3
APPLY_WINDOW_INT16
1
INIT_XMM
ss
se3
,
atom
APPLY_WINDOW_INT16
1
This diff is collapsed.
Click to expand it.
libavcodec/x86/dsputil.asm
+
0
−
130
View file @
57b5b84e
...
@@ -27,8 +27,6 @@ pb_zzzzzzzz77777777: times 8 db -1
...
@@ -27,8 +27,6 @@ pb_zzzzzzzz77777777: times 8 db -1
pb_7:
times
8
db
7
pb_7:
times
8
db
7
pb_zzzz3333zzzzbbbb:
db
-
1
,
-
1
,
-
1
,
-
1
,
3
,
3
,
3
,
3
,
-
1
,
-
1
,
-
1
,
-
1
,
11
,
11
,
11
,
11
pb_zzzz3333zzzzbbbb:
db
-
1
,
-
1
,
-
1
,
-
1
,
3
,
3
,
3
,
3
,
-
1
,
-
1
,
-
1
,
-
1
,
11
,
11
,
11
,
11
pb_zz11zz55zz99zzdd:
db
-
1
,
-
1
,
1
,
1
,
-
1
,
-
1
,
5
,
5
,
-
1
,
-
1
,
9
,
9
,
-
1
,
-
1
,
13
,
13
pb_zz11zz55zz99zzdd:
db
-
1
,
-
1
,
1
,
1
,
-
1
,
-
1
,
5
,
5
,
-
1
,
-
1
,
9
,
9
,
-
1
,
-
1
,
13
,
13
pb_revwords:
SHUFFLE_MASK_W
7
,
6
,
5
,
4
,
3
,
2
,
1
,
0
pd_16384:
times
4
dd
16384
pb_bswap32:
db
3
,
2
,
1
,
0
,
7
,
6
,
5
,
4
,
11
,
10
,
9
,
8
,
15
,
14
,
13
,
12
pb_bswap32:
db
3
,
2
,
1
,
0
,
7
,
6
,
5
,
4
,
11
,
10
,
9
,
8
,
15
,
14
,
13
,
12
SECTION
_TEXT
SECTION
_TEXT
...
@@ -205,134 +203,6 @@ SCALARPRODUCT_LOOP 0
...
@@ -205,134 +203,6 @@ SCALARPRODUCT_LOOP 0
RET
RET
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
pshufb
%
1
,
%
2
%elif cpuflag(sse2)
pshuflw
%
1
,
%
1
,
0x1B
pshufhw
%
1
,
%
1
,
0x1B
pshufd
%
1
,
%
1
,
0x4E
%elif cpuflag(mmxext)
pshufw
%
1
,
%
1
,
0x1B
%endif
%endmacro
%macro MUL16FIXED 3
%if cpuflag(ssse3)
; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
pmulhrsw
%
1
,
%
2
%elif cpuflag(mmxext)
; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
mova
%
3
,
%
1
pmulhw
%
1
,
%
2
pmullw
%
3
,
%
2
psrlw
%
3
,
15
psllw
%
1
,
1
por
%
1
,
%
3
%endif
%endmacro
%macro APPLY_WINDOW_INT16 1
; %1 bitexact version
%if %1
cglobal
apply_window_int16
,
4
,
5
,
6
,
output
,
input
,
window
,
offset
,
offset2
%else
cglobal
apply_window_int16_round
,
4
,
5
,
6
,
output
,
input
,
window
,
offset
,
offset2
%endif
lea
offset2q
,
[
offsetq
-
mmsize
]
%if cpuflag(ssse3) && notcpuflag(atom)
mova
m5
,
[
pb_revwords
]
ALIGN
16
%elif %1
mova
m5
,
[
pd_16384
]
%endif
.loop:
%if cpuflag(ssse3)
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova
m0
,
[
windowq
+
offset2q
]
mova
m1
,
[
inputq
+
offset2q
]
pmulhrsw
m1
,
m0
REVERSE_WORDS
m0
,
m5
pmulhrsw
m0
,
[
inputq
+
offsetq
]
mova
[
outputq
+
offset2q
],
m1
mova
[
outputq
+
offsetq
],
m0
%elif %1
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
mova
m3
,
[
windowq
+
offset2q
]
mova
m4
,
[
inputq
+
offset2q
]
pxor
m0
,
m0
punpcklwd
m0
,
m3
punpcklwd
m1
,
m4
pmaddwd
m0
,
m1
paddd
m0
,
m5
psrad
m0
,
15
pxor
m2
,
m2
punpckhwd
m2
,
m3
punpckhwd
m1
,
m4
pmaddwd
m2
,
m1
paddd
m2
,
m5
psrad
m2
,
15
packssdw
m0
,
m2
mova
[
outputq
+
offset2q
],
m0
REVERSE_WORDS
m3
mova
m4
,
[
inputq
+
offsetq
]
pxor
m0
,
m0
punpcklwd
m0
,
m3
punpcklwd
m1
,
m4
pmaddwd
m0
,
m1
paddd
m0
,
m5
psrad
m0
,
15
pxor
m2
,
m2
punpckhwd
m2
,
m3
punpckhwd
m1
,
m4
pmaddwd
m2
,
m1
paddd
m2
,
m5
psrad
m2
,
15
packssdw
m0
,
m2
mova
[
outputq
+
offsetq
],
m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova
m0
,
[
windowq
+
offset2q
]
mova
m1
,
[
inputq
+
offset2q
]
mova
m2
,
[
inputq
+
offsetq
]
MUL16FIXED
m1
,
m0
,
m3
REVERSE_WORDS
m0
MUL16FIXED
m2
,
m0
,
m3
mova
[
outputq
+
offset2q
],
m1
mova
[
outputq
+
offsetq
],
m2
%endif
add
offsetd
,
mmsize
sub
offset2d
,
mmsize
jae
.loop
REP_RET
%endmacro
INIT_MMX
mmxext
APPLY_WINDOW_INT16
0
INIT_XMM
ss
e2
APPLY_WINDOW_INT16
0
INIT_MMX
mmxext
APPLY_WINDOW_INT16
1
INIT_XMM
ss
e2
APPLY_WINDOW_INT16
1
INIT_XMM
ss
se3
APPLY_WINDOW_INT16
1
INIT_XMM
ss
se3
,
atom
APPLY_WINDOW_INT16
1
; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
; const uint8_t *diff, int w,
; const uint8_t *diff, int w,
; int *left, int *left_top)
; int *left, int *left_top)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment