Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
F
FFmpeg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
libremedia
Tethys
FFmpeg
Commits
c83f44db
Commit
c83f44db
authored
12 years ago
by
Ronald S. Bultje
Browse files
Options
Downloads
Patches
Plain Diff
h264_idct_10bit: port x86 assembly to cpuflags.
parent
f8d8fe25
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
libavcodec/x86/h264_idct_10bit.asm
+127
-127
127 additions, 127 deletions
libavcodec/x86/h264_idct_10bit.asm
with
127 additions
and
127 deletions
libavcodec/x86/h264_idct_10bit.asm
+
127
−
127
View file @
c83f44db
...
@@ -72,25 +72,25 @@ SECTION .text
...
@@ -72,25 +72,25 @@ SECTION .text
STORE_DIFFx2
m2
,
m3
,
m4
,
m5
,
%
1
,
%
3
STORE_DIFFx2
m2
,
m3
,
m4
,
m5
,
%
1
,
%
3
%endmacro
%endmacro
%macro IDCT_ADD_10
1
%macro IDCT_ADD_10
0
cglobal
h264_idct_add_10
_
%
1
,
3
,
3
cglobal
h264_idct_add_10
,
3
,
3
IDCT4_ADD_10
r0
,
r1
,
r2
IDCT4_ADD_10
r0
,
r1
,
r2
RET
RET
%endmacro
%endmacro
INIT_XMM
INIT_XMM
ss
e2
IDCT_ADD_10
ss
e2
IDCT_ADD_10
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
IDCT_ADD_10
avx
IDCT_ADD_10
%endif
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro ADD4x4IDCT
1
%macro ADD4x4IDCT
0
add4x4_idct
_
%
1
:
add4x4_idct
%+
SUFFIX
:
add
r5
,
r0
add
r5
,
r0
mova
m0
,
[
r2
+
0
]
mova
m0
,
[
r2
+
0
]
mova
m1
,
[
r2
+
16
]
mova
m1
,
[
r2
+
16
]
...
@@ -107,52 +107,52 @@ add4x4_idct_%1:
...
@@ -107,52 +107,52 @@ add4x4_idct_%1:
ret
ret
%endmacro
%endmacro
INIT_XMM
INIT_XMM
ss
e2
ALIGN
16
ALIGN
16
ADD4x4IDCT
ss
e2
ADD4x4IDCT
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
ALIGN
16
ALIGN
16
ADD4x4IDCT
avx
ADD4x4IDCT
%endif
%endif
%macro ADD16_OP
3
%macro ADD16_OP
2
cmp
byte
[
r4
+%
3
],
0
cmp
byte
[
r4
+%
2
],
0
jz
.skipblock
%
2
jz
.skipblock
%
1
mov
r5d
,
[
r1
+%
2
*
4
]
mov
r5d
,
[
r1
+%
1
*
4
]
call
add4x4_idct
_
%
1
call
add4x4_idct
%+
SUFFIX
.skipblock
%
2
:
.skipblock
%
1
:
%if %
2
<15
%if %
1
<15
add
r2
,
64
add
r2
,
64
%endif
%endif
%endmacro
%endmacro
%macro IDCT_ADD16_10
1
%macro IDCT_ADD16_10
0
cglobal
h264_idct_add16_10
_
%
1
,
5
,
6
cglobal
h264_idct_add16_10
,
5
,
6
ADD16_OP
%
1
,
0
,
4
+
1
*
8
ADD16_OP
0
,
4
+
1
*
8
ADD16_OP
%
1
,
1
,
5
+
1
*
8
ADD16_OP
1
,
5
+
1
*
8
ADD16_OP
%
1
,
2
,
4
+
2
*
8
ADD16_OP
2
,
4
+
2
*
8
ADD16_OP
%
1
,
3
,
5
+
2
*
8
ADD16_OP
3
,
5
+
2
*
8
ADD16_OP
%
1
,
4
,
6
+
1
*
8
ADD16_OP
4
,
6
+
1
*
8
ADD16_OP
%
1
,
5
,
7
+
1
*
8
ADD16_OP
5
,
7
+
1
*
8
ADD16_OP
%
1
,
6
,
6
+
2
*
8
ADD16_OP
6
,
6
+
2
*
8
ADD16_OP
%
1
,
7
,
7
+
2
*
8
ADD16_OP
7
,
7
+
2
*
8
ADD16_OP
%
1
,
8
,
4
+
3
*
8
ADD16_OP
8
,
4
+
3
*
8
ADD16_OP
%
1
,
9
,
5
+
3
*
8
ADD16_OP
9
,
5
+
3
*
8
ADD16_OP
%
1
,
10
,
4
+
4
*
8
ADD16_OP
10
,
4
+
4
*
8
ADD16_OP
%
1
,
11
,
5
+
4
*
8
ADD16_OP
11
,
5
+
4
*
8
ADD16_OP
%
1
,
12
,
6
+
3
*
8
ADD16_OP
12
,
6
+
3
*
8
ADD16_OP
%
1
,
13
,
7
+
3
*
8
ADD16_OP
13
,
7
+
3
*
8
ADD16_OP
%
1
,
14
,
6
+
4
*
8
ADD16_OP
14
,
6
+
4
*
8
ADD16_OP
%
1
,
15
,
7
+
4
*
8
ADD16_OP
15
,
7
+
4
*
8
REP_RET
REP_RET
%endmacro
%endmacro
INIT_XMM
INIT_XMM
ss
e2
IDCT_ADD16_10
ss
e2
IDCT_ADD16_10
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
IDCT_ADD16_10
avx
IDCT_ADD16_10
%endif
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
...
@@ -185,8 +185,8 @@ IDCT_ADD16_10 avx
...
@@ -185,8 +185,8 @@ IDCT_ADD16_10 avx
mova
[
%
1
+%
3
],
m4
mova
[
%
1
+%
3
],
m4
%endmacro
%endmacro
INIT_MMX
INIT_MMX
mmx2
cglobal
h264_idct_dc_add_10
_mmx2
,
3
,
3
cglobal
h264_idct_dc_add_10
,
3
,
3
movd
m0
,
[
r1
]
movd
m0
,
[
r1
]
paddd
m0
,
[
pd_32
]
paddd
m0
,
[
pd_32
]
psrad
m0
,
6
psrad
m0
,
6
...
@@ -199,8 +199,8 @@ cglobal h264_idct_dc_add_10_mmx2,3,3
...
@@ -199,8 +199,8 @@ cglobal h264_idct_dc_add_10_mmx2,3,3
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD
1
%macro IDCT8_DC_ADD
0
cglobal
h264_idct8_dc_add_10
_
%
1
,
3
,
3
,
7
cglobal
h264_idct8_dc_add_10
,
3
,
3
,
7
mov
r1d
,
[
r1
]
mov
r1d
,
[
r1
]
add
r1
,
32
add
r1
,
32
sar
r1
,
6
sar
r1
,
6
...
@@ -214,45 +214,45 @@ cglobal h264_idct8_dc_add_10_%1,3,3,7
...
@@ -214,45 +214,45 @@ cglobal h264_idct8_dc_add_10_%1,3,3,7
RET
RET
%endmacro
%endmacro
INIT_XMM
INIT_XMM
ss
e2
IDCT8_DC_ADD
ss
e2
IDCT8_DC_ADD
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
IDCT8_DC_ADD
avx
IDCT8_DC_ADD
%endif
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro AC
2
%macro AC
1
.ac
%
2
.ac
%
1
mov
r5d
,
[
r1
+
(
%
2
+
0
)
*
4
]
mov
r5d
,
[
r1
+
(
%
1
+
0
)
*
4
]
call
add4x4_idct
_
%
1
call
add4x4_idct
%+
SUFFIX
mov
r5d
,
[
r1
+
(
%
2
+
1
)
*
4
]
mov
r5d
,
[
r1
+
(
%
1
+
1
)
*
4
]
add
r2
,
64
add
r2
,
64
call
add4x4_idct
_
%
1
call
add4x4_idct
%+
SUFFIX
add
r2
,
64
add
r2
,
64
jmp
.skipadd
%
2
jmp
.skipadd
%
1
%endmacro
%endmacro
%assign last_block 16
%assign last_block 16
%macro ADD16_OP_INTRA
3
%macro ADD16_OP_INTRA
2
cmp
word
[
r4
+%
3
],
0
cmp
word
[
r4
+%
2
],
0
jnz
.ac
%
2
jnz
.ac
%
1
mov
r5d
,
[
r2
+
0
]
mov
r5d
,
[
r2
+
0
]
or
r5d
,
[
r2
+
64
]
or
r5d
,
[
r2
+
64
]
jz
.skipblock
%
2
jz
.skipblock
%
1
mov
r5d
,
[
r1
+
(
%
2
+
0
)
*
4
]
mov
r5d
,
[
r1
+
(
%
1
+
0
)
*
4
]
call
idct_dc_add
_
%
1
call
idct_dc_add
%+
SUFFIX
.skipblock
%
2
:
.skipblock
%
1
:
%if %
2
<last_block-2
%if %
1
<last_block-2
add
r2
,
128
add
r2
,
128
%endif
%endif
.skipadd
%
2
:
.skipadd
%
1
:
%endmacro
%endmacro
%macro IDCT_ADD16INTRA_10
1
%macro IDCT_ADD16INTRA_10
0
idct_dc_add
_
%
1
:
idct_dc_add
%+
SUFFIX
:
add
r5
,
r0
add
r5
,
r0
movq
m0
,
[
r2
+
0
]
movq
m0
,
[
r2
+
0
]
movhps
m0
,
[
r2
+
64
]
movhps
m0
,
[
r2
+
64
]
...
@@ -265,46 +265,46 @@ idct_dc_add_%1:
...
@@ -265,46 +265,46 @@ idct_dc_add_%1:
IDCT_DC_ADD_OP_10
r5
,
r3
,
r6
IDCT_DC_ADD_OP_10
r5
,
r3
,
r6
ret
ret
cglobal
h264_idct_add16intra_10
_
%
1
,
5
,
7
,
8
cglobal
h264_idct_add16intra_10
,
5
,
7
,
8
ADD16_OP_INTRA
%
1
,
0
,
4
+
1
*
8
ADD16_OP_INTRA
0
,
4
+
1
*
8
ADD16_OP_INTRA
%
1
,
2
,
4
+
2
*
8
ADD16_OP_INTRA
2
,
4
+
2
*
8
ADD16_OP_INTRA
%
1
,
4
,
6
+
1
*
8
ADD16_OP_INTRA
4
,
6
+
1
*
8
ADD16_OP_INTRA
%
1
,
6
,
6
+
2
*
8
ADD16_OP_INTRA
6
,
6
+
2
*
8
ADD16_OP_INTRA
%
1
,
8
,
4
+
3
*
8
ADD16_OP_INTRA
8
,
4
+
3
*
8
ADD16_OP_INTRA
%
1
,
10
,
4
+
4
*
8
ADD16_OP_INTRA
10
,
4
+
4
*
8
ADD16_OP_INTRA
%
1
,
12
,
6
+
3
*
8
ADD16_OP_INTRA
12
,
6
+
3
*
8
ADD16_OP_INTRA
%
1
,
14
,
6
+
4
*
8
ADD16_OP_INTRA
14
,
6
+
4
*
8
REP_RET
REP_RET
AC
%
1
,
8
AC
8
AC
%
1
,
10
AC
10
AC
%
1
,
12
AC
12
AC
%
1
,
14
AC
14
AC
%
1
,
0
AC
0
AC
%
1
,
2
AC
2
AC
%
1
,
4
AC
4
AC
%
1
,
6
AC
6
%endmacro
%endmacro
INIT_XMM
INIT_XMM
ss
e2
IDCT_ADD16INTRA_10
ss
e2
IDCT_ADD16INTRA_10
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
IDCT_ADD16INTRA_10
avx
IDCT_ADD16INTRA_10
%endif
%endif
%assign last_block 36
%assign last_block 36
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro IDCT_ADD8
1
%macro IDCT_ADD8
0
cglobal
h264_idct_add8_10
_
%
1
,
5
,
8
,
7
cglobal
h264_idct_add8_10
,
5
,
8
,
7
%if ARCH_X86_64
%if ARCH_X86_64
mov
r7
,
r0
mov
r7
,
r0
%endif
%endif
add
r2
,
1024
add
r2
,
1024
mov
r0
,
[
r0
]
mov
r0
,
[
r0
]
ADD16_OP_INTRA
%
1
,
16
,
4
+
6
*
8
ADD16_OP_INTRA
16
,
4
+
6
*
8
ADD16_OP_INTRA
%
1
,
18
,
4
+
7
*
8
ADD16_OP_INTRA
18
,
4
+
7
*
8
add
r2
,
1024
-
128
*
2
add
r2
,
1024
-
128
*
2
%if ARCH_X86_64
%if ARCH_X86_64
mov
r0
,
[
r7
+
gprsize
]
mov
r0
,
[
r7
+
gprsize
]
...
@@ -312,21 +312,21 @@ cglobal h264_idct_add8_10_%1,5,8,7
...
@@ -312,21 +312,21 @@ cglobal h264_idct_add8_10_%1,5,8,7
mov
r0
,
r0m
mov
r0
,
r0m
mov
r0
,
[
r0
+
gprsize
]
mov
r0
,
[
r0
+
gprsize
]
%endif
%endif
ADD16_OP_INTRA
%
1
,
32
,
4
+
11
*
8
ADD16_OP_INTRA
32
,
4
+
11
*
8
ADD16_OP_INTRA
%
1
,
34
,
4
+
12
*
8
ADD16_OP_INTRA
34
,
4
+
12
*
8
REP_RET
REP_RET
AC
%
1
,
16
AC
16
AC
%
1
,
18
AC
18
AC
%
1
,
32
AC
32
AC
%
1
,
34
AC
34
%endmacro
; IDCT_ADD8
%endmacro
; IDCT_ADD8
INIT_XMM
INIT_XMM
ss
e2
IDCT_ADD8
ss
e2
IDCT_ADD8
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
IDCT_ADD8
avx
IDCT_ADD8
%endif
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
...
@@ -432,19 +432,19 @@ IDCT_ADD8 avx
...
@@ -432,19 +432,19 @@ IDCT_ADD8 avx
STORE_DIFFx2
m0
,
m1
,
m6
,
m7
,
%
1
,
%
3
STORE_DIFFx2
m0
,
m1
,
m6
,
m7
,
%
1
,
%
3
%endmacro
%endmacro
%macro IDCT8_ADD
1
%macro IDCT8_ADD
0
cglobal
h264_idct8_add_10
_
%
1
,
3
,
4
,
16
cglobal
h264_idct8_add_10
,
3
,
4
,
16
%if UNIX64 == 0
%if UNIX64 == 0
%assign pad 16-gprsize-(stack_offset&15)
%assign pad 16-gprsize-(stack_offset&15)
sub
rsp
,
pad
sub
rsp
,
pad
call
h264_idct8_add1_10
_
%
1
call
h264_idct8_add1_10
%+
SUFFIX
add
rsp
,
pad
add
rsp
,
pad
RET
RET
%endif
%endif
ALIGN
16
ALIGN
16
; TODO: does not need to use stack
; TODO: does not need to use stack
h264_idct8_add1_10
_
%
1
:
h264_idct8_add1_10
%+
SUFFIX
:
%assign pad 256+16-gprsize
%assign pad 256+16-gprsize
sub
rsp
,
pad
sub
rsp
,
pad
add
dword
[
r1
],
32
add
dword
[
r1
],
32
...
@@ -499,31 +499,31 @@ h264_idct8_add1_10_%1:
...
@@ -499,31 +499,31 @@ h264_idct8_add1_10_%1:
ret
ret
%endmacro
%endmacro
INIT_XMM
INIT_XMM
ss
e2
IDCT8_ADD
ss
e2
IDCT8_ADD
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
IDCT8_ADD
avx
IDCT8_ADD
%endif
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro IDCT8_ADD4_OP
3
%macro IDCT8_ADD4_OP
2
cmp
byte
[
r4
+%
3
],
0
cmp
byte
[
r4
+%
2
],
0
jz
.skipblock
%
2
jz
.skipblock
%
1
mov
r0d
,
[
r6
+%
2
*
4
]
mov
r0d
,
[
r6
+%
1
*
4
]
add
r0
,
r5
add
r0
,
r5
call
h264_idct8_add1_10
_
%
1
call
h264_idct8_add1_10
%+
SUFFIX
.skipblock
%
2
:
.skipblock
%
1
:
%if %
2
<12
%if %
1
<12
add
r1
,
256
add
r1
,
256
%endif
%endif
%endmacro
%endmacro
%macro IDCT8_ADD4
1
%macro IDCT8_ADD4
0
cglobal
h264_idct8_add4_10
_
%
1
,
0
,
7
,
16
cglobal
h264_idct8_add4_10
,
0
,
7
,
16
%assign pad 16-gprsize-(stack_offset&15)
%assign pad 16-gprsize-(stack_offset&15)
SUB
rsp
,
pad
SUB
rsp
,
pad
mov
r5
,
r0mp
mov
r5
,
r0mp
...
@@ -531,17 +531,17 @@ cglobal h264_idct8_add4_10_%1, 0,7,16
...
@@ -531,17 +531,17 @@ cglobal h264_idct8_add4_10_%1, 0,7,16
mov
r1
,
r2mp
mov
r1
,
r2mp
mov
r2d
,
r3m
mov
r2d
,
r3m
movifnidn
r4
,
r4mp
movifnidn
r4
,
r4mp
IDCT8_ADD4_OP
%
1
,
0
,
4
+
1
*
8
IDCT8_ADD4_OP
0
,
4
+
1
*
8
IDCT8_ADD4_OP
%
1
,
4
,
6
+
1
*
8
IDCT8_ADD4_OP
4
,
6
+
1
*
8
IDCT8_ADD4_OP
%
1
,
8
,
4
+
3
*
8
IDCT8_ADD4_OP
8
,
4
+
3
*
8
IDCT8_ADD4_OP
%
1
,
12
,
6
+
3
*
8
IDCT8_ADD4_OP
12
,
6
+
3
*
8
ADD
rsp
,
pad
ADD
rsp
,
pad
RET
RET
%endmacro
; IDCT8_ADD4
%endmacro
; IDCT8_ADD4
INIT_XMM
INIT_XMM
ss
e2
IDCT8_ADD4
ss
e2
IDCT8_ADD4
%if HAVE_AVX
%if HAVE_AVX
INIT_
AVX
INIT_
XMM
avx
IDCT8_ADD4
avx
IDCT8_ADD4
%endif
%endif
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment