Newer
Older
Diego Biurrun
committed
* Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "libavutil/x86/asm.h"
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/* A single TEMPLATE_PP_* should be defined (to 1) when this template is
* included. The following macros will define its dependencies to 1 as well
* (like MMX2 depending on MMX), and will define to 0 all the others. Every
* TEMPLATE_PP_* need to be undef at the end. */
#ifdef TEMPLATE_PP_C
# define RENAME(a) a ## _C
#else
# define TEMPLATE_PP_C 0
#endif
#ifdef TEMPLATE_PP_ALTIVEC
# define RENAME(a) a ## _altivec
#else
# define TEMPLATE_PP_ALTIVEC 0
#endif
#ifdef TEMPLATE_PP_MMX
# define RENAME(a) a ## _MMX
#else
# define TEMPLATE_PP_MMX 0
#endif
#ifdef TEMPLATE_PP_MMXEXT
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# define RENAME(a) a ## _MMX2
#else
# define TEMPLATE_PP_MMXEXT 0
#endif
#ifdef TEMPLATE_PP_3DNOW
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# define RENAME(a) a ## _3DNow
#else
# define TEMPLATE_PP_3DNOW 0
#endif
#ifdef TEMPLATE_PP_SSE2
# undef TEMPLATE_PP_MMX
# define TEMPLATE_PP_MMX 1
# undef TEMPLATE_PP_MMXEXT
# define TEMPLATE_PP_MMXEXT 1
# define RENAME(a) a ## _SSE2
#else
# define TEMPLATE_PP_SSE2 0
#endif
#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB
Aurelien Jacobs
committed
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
Aurelien Jacobs
committed
#define PAVGB(a,b) REAL_PAVGB(a,b)
"movq " #a ", " #t " \n\t"\
"psubusb " #b ", " #t " \n\t"\
"psubb " #t ", " #a " \n\t"
"psubusb " #a ", " #b " \n\t"\
"paddb " #a ", " #b " \n\t"
* Check if the middle 8x8 Block in the given 8x16 block is flat
Michael Niedermayer
committed
static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
int numEq= 0, dcOk;
src+= stride*4; // src points to begin of the 8x8 Block
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
: : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
);
"lea (%2, %3), %%"FF_REG_a" \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
"movq (%2), %%mm0 \n\t"
"movq (%%"FF_REG_a"), %%mm1 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm0, %%mm4 \n\t"
PMAXUB(%%mm1, %%mm4)
PMINUB(%%mm1, %%mm3, %%mm5)
"psubb %%mm1, %%mm0 \n\t" // mm0 = difference
"paddb %%mm7, %%mm0 \n\t"
"pcmpgtb %%mm6, %%mm0 \n\t"
"movq (%%"FF_REG_a",%3), %%mm2 \n\t"
PMAXUB(%%mm2, %%mm4)
PMINUB(%%mm2, %%mm3, %%mm5)
"psubb %%mm2, %%mm1 \n\t"
"paddb %%mm7, %%mm1 \n\t"
"pcmpgtb %%mm6, %%mm1 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
PMAXUB(%%mm1, %%mm4)
PMINUB(%%mm1, %%mm3, %%mm5)
"psubb %%mm1, %%mm2 \n\t"
"paddb %%mm7, %%mm2 \n\t"
"pcmpgtb %%mm6, %%mm2 \n\t"
"paddb %%mm2, %%mm0 \n\t"
"lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
"movq (%2, %3, 4), %%mm2 \n\t"
PMAXUB(%%mm2, %%mm4)
PMINUB(%%mm2, %%mm3, %%mm5)
"psubb %%mm2, %%mm1 \n\t"
"paddb %%mm7, %%mm1 \n\t"
"pcmpgtb %%mm6, %%mm1 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq (%%"FF_REG_a"), %%mm1 \n\t"
PMAXUB(%%mm1, %%mm4)
PMINUB(%%mm1, %%mm3, %%mm5)
"psubb %%mm1, %%mm2 \n\t"
"paddb %%mm7, %%mm2 \n\t"
"pcmpgtb %%mm6, %%mm2 \n\t"
"paddb %%mm2, %%mm0 \n\t"
"movq (%%"FF_REG_a", %3), %%mm2 \n\t"
PMAXUB(%%mm2, %%mm4)
PMINUB(%%mm2, %%mm3, %%mm5)
"psubb %%mm2, %%mm1 \n\t"
"paddb %%mm7, %%mm1 \n\t"
"pcmpgtb %%mm6, %%mm1 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
PMAXUB(%%mm1, %%mm4)
PMINUB(%%mm1, %%mm3, %%mm5)
"psubb %%mm1, %%mm2 \n\t"
"paddb %%mm7, %%mm2 \n\t"
"pcmpgtb %%mm6, %%mm2 \n\t"
"paddb %%mm2, %%mm0 \n\t"
"psubusb %%mm3, %%mm4 \n\t"
" \n\t"
"pxor %%mm7, %%mm7 \n\t"
"psadbw %%mm7, %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"psrlw $8, %%mm0 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"psrlq $16, %%mm0 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"psrlq $32, %%mm0 \n\t"
"paddb %%mm1, %%mm0 \n\t"
"movq %4, %%mm7 \n\t" // QP,..., QP
"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
"psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
"packssdw %%mm4, %%mm4 \n\t"
"movd %%mm0, %0 \n\t"
"movd %%mm4, %1 \n\t"
: "=r" (numEq), "=r" (dcOk)
: "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
);
numEq= (-numEq) &0xFF;
if(numEq > c->ppMode.flatnessThreshold){
if(dcOk) return 0;
else return 1;
}else{
return 2;
}
* Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src+= stride*3;
__asm__ volatile( //"movv %0 %1 %2\n\t"
"movq %2, %%mm0 \n\t" // QP,..., QP
"pxor %%mm4, %%mm4 \n\t"
"movq (%0), %%mm6 \n\t"
"movq (%0, %1), %%mm5 \n\t"
"movq %%mm5, %%mm1 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psubusb %%mm6, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
"pand %%mm2, %%mm6 \n\t"
"pandn %%mm1, %%mm2 \n\t"
"por %%mm2, %%mm6 \n\t"// First Line to Filter
"movq (%0, %1, 8), %%mm5 \n\t"
"lea (%0, %1, 4), %%"FF_REG_a" \n\t"
"lea (%0, %1, 8), %%"FF_REG_c" \n\t"
"sub %1, %%"FF_REG_c" \n\t"
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
"add %1, %0 \n\t" // %0 points to line 1 not 0
"movq (%0, %1, 8), %%mm7 \n\t"
"movq %%mm5, %%mm1 \n\t"
"movq %%mm7, %%mm2 \n\t"
"psubusb %%mm7, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
"pand %%mm2, %%mm7 \n\t"
"pandn %%mm1, %%mm2 \n\t"
"por %%mm2, %%mm7 \n\t" // First Line to Filter
// 1 2 3 4 5 6 7 8
// %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
// 6 4 2 2 1 1
// 6 4 4 2
// 6 8 2
"movq (%0, %1), %%mm0 \n\t" // 1
"movq %%mm0, %%mm1 \n\t" // 1
PAVGB(%%mm6, %%mm0) //1 1 /2
PAVGB(%%mm6, %%mm0) //3 1 /4
"movq (%0, %1, 4), %%mm2 \n\t" // 1
"movq %%mm2, %%mm5 \n\t" // 1
PAVGB((%%FF_REGa), %%mm2) // 11 /2
PAVGB((%0, %1, 2), %%mm2) // 211 /4
"movq %%mm2, %%mm3 \n\t" // 211 /4
"movq (%0), %%mm4 \n\t" // 1
PAVGB(%%mm4, %%mm3) // 4 211 /8
PAVGB(%%mm0, %%mm3) //642211 /16
"movq %%mm3, (%0) \n\t" // X
// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
"movq %%mm1, %%mm0 \n\t" // 1
PAVGB(%%mm6, %%mm0) //1 1 /2
"movq %%mm4, %%mm3 \n\t" // 1
PAVGB((%0,%1,2), %%mm3) // 1 1 /2
PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2
PAVGB((%%FF_REGa), %%mm5) // 211 /4
PAVGB(%%mm5, %%mm3) // 2 2211 /8
PAVGB(%%mm0, %%mm3) //4242211 /16
"movq %%mm3, (%0,%1) \n\t" // X
// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
PAVGB(%%mm4, %%mm6) //11 /2
"movq (%%"FF_REG_c"), %%mm0 \n\t" // 1
PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2
"movq %%mm0, %%mm3 \n\t" // 11/2
PAVGB(%%mm1, %%mm0) // 2 11/4
PAVGB(%%mm6, %%mm0) //222 11/8
PAVGB(%%mm2, %%mm0) //22242211/16
"movq (%0, %1, 2), %%mm2 \n\t" // 1
"movq %%mm0, (%0, %1, 2) \n\t" // X
// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
"movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
PAVGB((%%FF_REGc), %%mm0) // 11 /2
PAVGB(%%mm0, %%mm6) //11 11 /4
PAVGB(%%mm1, %%mm4) // 11 /2
PAVGB(%%mm2, %%mm1) // 11 /2
PAVGB(%%mm1, %%mm6) //1122 11 /8
PAVGB(%%mm5, %%mm6) //112242211 /16
"movq (%%"FF_REG_a"), %%mm5 \n\t" // 1
"movq %%mm6, (%%"FF_REG_a") \n\t" // X
// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
"movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1
PAVGB(%%mm7, %%mm6) // 11 /2
PAVGB(%%mm4, %%mm6) // 11 11 /4
PAVGB(%%mm3, %%mm6) // 11 2211 /8
PAVGB(%%mm5, %%mm2) // 11 /2
"movq (%0, %1, 4), %%mm4 \n\t" // 1
PAVGB(%%mm4, %%mm2) // 112 /4
PAVGB(%%mm2, %%mm6) // 112242211 /16
"movq %%mm6, (%0, %1, 4) \n\t" // X
// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
PAVGB(%%mm7, %%mm1) // 11 2 /4
PAVGB(%%mm4, %%mm5) // 11 /2
PAVGB(%%mm5, %%mm0) // 11 11 /4
"movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1
PAVGB(%%mm6, %%mm1) // 11 4 2 /8
PAVGB(%%mm0, %%mm1) // 11224222 /16
"movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X
// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
PAVGB((%%FF_REGc), %%mm2) // 112 4 /8
"movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
PAVGB(%%mm0, %%mm6) // 1 1 /2
PAVGB(%%mm7, %%mm6) // 1 12 /4
PAVGB(%%mm2, %%mm6) // 1122424 /4
"movq %%mm6, (%%"FF_REG_c") \n\t" // X
// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
PAVGB(%%mm7, %%mm5) // 11 2 /4
PAVGB(%%mm7, %%mm5) // 11 6 /8
PAVGB(%%mm3, %%mm0) // 112 /4
PAVGB(%%mm0, %%mm5) // 112246 /16
"movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X
"sub %1, %0 \n\t"
:
: "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
: "%"FF_REG_a, "%"FF_REG_c
#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= stride + l3;
const int l5= stride + l4;
const int l6= stride + l5;
const int l7= stride + l6;
const int l8= stride + l7;
const int l9= stride + l8;
int x;
src+= stride*3;
for(x=0; x<BLOCK_SIZE; x++){
const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
int sums[10];
sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
sums[1] = sums[0] - first + src[l4];
sums[2] = sums[1] - first + src[l5];
sums[3] = sums[2] - first + src[l6];
sums[4] = sums[3] - first + src[l7];
sums[5] = sums[4] - src[l1] + src[l8];
sums[6] = sums[5] - src[l2] + last;
sums[7] = sums[6] - src[l3] + last;
sums[8] = sums[7] - src[l4] + last;
sums[9] = sums[8] - src[l5] + last;
src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
src++;
}
#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
* will not damage linear gradients
* Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
* can only smooth blocks at the expected locations (it cannot smooth them if they did move)
* MMX2 version does correct clipping C version does not
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
src+= stride*3;
"pxor %%mm7, %%mm7 \n\t" // 0
"lea (%0, %1), %%"FF_REG_a" \n\t"
"lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
"movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
"movq (%0, %1, 4), %%mm1 \n\t" // line 4
"movq %%mm1, %%mm2 \n\t" // line 4
"psubusb %%mm0, %%mm1 \n\t"
"psubusb %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm0 \n\t" // |l2 - l3|
"movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
"movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
"movq %%mm3, %%mm5 \n\t" // line 5
"psubusb %%mm4, %%mm3 \n\t"
"psubusb %%mm5, %%mm4 \n\t"
"por %%mm4, %%mm3 \n\t" // |l5 - l6|
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
"movq %%mm2, %%mm1 \n\t" // line 4
"psubusb %%mm5, %%mm2 \n\t"
"movq %%mm2, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
"psubusb %%mm1, %%mm5 \n\t"
"por %%mm5, %%mm4 \n\t" // |l4 - l5|
"psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
"movq %%mm4, %%mm3 \n\t" // d
"movq %2, %%mm0 \n\t"
"paddusb %%mm0, %%mm0 \n\t"
"psubusb %%mm0, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0
"psubusb "MANGLE(b01)", %%mm3 \n\t"
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0
PAVGB(%%mm7, %%mm3) // d/2
"movq %%mm3, %%mm1 \n\t" // d/2
PAVGB(%%mm7, %%mm3) // d/4
PAVGB(%%mm1, %%mm3) // 3*d/8
"movq (%0, %1, 4), %%mm0 \n\t" // line 4
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%0, %1, 4) \n\t" // line 4
"movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm3, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
PAVGB(%%mm7, %%mm1) // d/4
"movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
"psubusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
"movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
"paddusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
PAVGB(%%mm7, %%mm1) // d/8
"movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
"psubusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
"movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
"paddusb %%mm1, %%mm0 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
:
: "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
NAMED_CONSTRAINTS_ADD(b01)
: "%"FF_REG_a, "%"FF_REG_c
#else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= stride + l3;
const int l5= stride + l4;
const int l6= stride + l5;
const int l7= stride + l6;
// const int l8= stride + l7;
// const int l9= stride + l8;
int x;
src+= stride*3;
for(x=0; x<BLOCK_SIZE; x++){
int a= src[l3] - src[l4];
int b= src[l4] - src[l5];
int c= src[l5] - src[l6];
int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
d= FFMAX(d, 0);
if(d < co->QP*2){
int v = d * FFSIGN(-b);
src[l2] +=v>>3;
src[l3] +=v>>2;
src[l4] +=(3*v)>>3;
src[l5] -=(3*v)>>3;
src[l6] -=v>>2;
src[l7] -=v>>3;
src++;
}
#endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
#if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
uint8_t tmp[16];
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= (int)tmp - (int)src - stride*3;
const int l5= (int)tmp - (int)src - stride*3 + 8;
const int l6= stride*3 + l3;
const int l7= stride + l6;
const int l8= stride + l7;
memcpy(tmp, src+stride*7, 8);
memcpy(tmp+8, src+stride*8, 8);
src+= stride*4;
#if 0 //slightly more accurate and slightly slower
"pxor %%mm7, %%mm7 \n\t" // 0
"lea (%0, %1), %%"FF_REG_a" \n\t"
"lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
// 0 1 2 3 4 5 6 7
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
"movq (%0, %1, 2), %%mm0 \n\t" // l2
"movq (%0), %%mm1 \n\t" // l0
"movq %%mm0, %%mm2 \n\t" // l2
PAVGB(%%mm7, %%mm0) // ~l2/2
PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
"movq (%%"FF_REG_a"), %%mm1 \n\t" // l1
"movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3
"movq %%mm1, %%mm4 \n\t" // l1
PAVGB(%%mm7, %%mm1) // ~l1/2
PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
"movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
"psubusb %%mm1, %%mm0 \n\t"
"psubusb %%mm4, %%mm1 \n\t"
"por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
"movq (%0, %1, 4), %%mm0 \n\t" // l4
"movq %%mm0, %%mm4 \n\t" // l4
PAVGB(%%mm7, %%mm0) // ~l4/2
PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
"movq (%%"FF_REG_c"), %%mm2 \n\t" // l5
"movq %%mm3, %%mm5 \n\t" // l3
PAVGB(%%mm7, %%mm3) // ~l3/2
PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
"movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
"psubusb %%mm3, %%mm0 \n\t"
"psubusb %%mm6, %%mm3 \n\t"
"por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
"pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
"movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6
"movq %%mm6, %%mm5 \n\t" // l6
PAVGB(%%mm7, %%mm6) // ~l6/2
PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
"movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7
"movq %%mm2, %%mm4 \n\t" // l5
PAVGB(%%mm7, %%mm2) // ~l5/2
PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
"movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
"psubusb %%mm2, %%mm6 \n\t"
"psubusb %%mm4, %%mm2 \n\t"
"por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
"movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
"paddusb "MANGLE(b01)", %%mm4 \n\t"
"pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
"psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
"pand %%mm4, %%mm3 \n\t"
"movq %%mm3, %%mm1 \n\t"
// "psubusb "MANGLE(b01)", %%mm3 \n\t"
PAVGB(%%mm7, %%mm3)
PAVGB(%%mm7, %%mm3)
"paddusb %%mm1, %%mm3 \n\t"
// "paddusb "MANGLE(b01)", %%mm3 \n\t"
"movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3
"movq (%0, %1, 4), %%mm5 \n\t" //l4
"movq (%0, %1, 4), %%mm4 \n\t" //l4
"psubusb %%mm6, %%mm5 \n\t"
"psubusb %%mm4, %%mm6 \n\t"
"por %%mm6, %%mm5 \n\t" // |l3-l4|
"pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
"pxor %%mm6, %%mm0 \n\t"
"pand %%mm0, %%mm3 \n\t"
PMINUB(%%mm5, %%mm3, %%mm0)
"psubusb "MANGLE(b01)", %%mm3 \n\t"
PAVGB(%%mm7, %%mm3)
"movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
"movq (%0, %1, 4), %%mm2 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"psubb %%mm3, %%mm0 \n\t"
"paddb %%mm3, %%mm2 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
"movq %%mm2, (%0, %1, 4) \n\t"
#endif //0
"lea (%0, %1), %%"FF_REG_a" \n\t"
"pcmpeqb %%mm6, %%mm6 \n\t" // -1
// 0 1 2 3 4 5 6 7
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
"movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3
"movq (%0, %1, 4), %%mm0 \n\t" // l4
"pxor %%mm6, %%mm1 \n\t" // -l3-1
PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
"movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5
"movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2
"pxor %%mm6, %%mm2 \n\t" // -l5-1
"movq %%mm2, %%mm5 \n\t" // -l5-1
"movq "MANGLE(b80)", %%mm4 \n\t" // 128
"lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
"movq (%%"FF_REG_a"), %%mm2 \n\t" // l1
"pxor %%mm6, %%mm2 \n\t" // -l1-1
PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
PAVGB((%0), %%mm1) // (l0-l3+256)/2
"movq "MANGLE(b80)", %%mm3 \n\t" // 128
PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2
"movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7
"pxor %%mm6, %%mm1 \n\t" // -l7-1
PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
"movq "MANGLE(b80)", %%mm2 \n\t" // 128
PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
"movq "MANGLE(b00)", %%mm1 \n\t" // 0
"movq "MANGLE(b00)", %%mm5 \n\t" // 0
"psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
"psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
"movq "MANGLE(b00)", %%mm7 \n\t" // 0
"movq %2, %%mm2 \n\t" // QP
PAVGB(%%mm6, %%mm2) // 128 + QP/2
"psubb %%mm6, %%mm2 \n\t"
"movq %%mm4, %%mm1 \n\t"
"pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
"pxor %%mm1, %%mm4 \n\t"
"psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
"pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
"psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
"movq %%mm4, %%mm3 \n\t" // d
"psubusb "MANGLE(b01)", %%mm4 \n\t"
PAVGB(%%mm7, %%mm4) // d/32
PAVGB(%%mm7, %%mm4) // (d + 32)/64
"paddb %%mm3, %%mm4 \n\t" // 5d/64
"pand %%mm2, %%mm4 \n\t"
"movq "MANGLE(b80)", %%mm5 \n\t" // 128
"psubb %%mm0, %%mm5 \n\t" // q
"paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
"pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
"pxor %%mm7, %%mm5 \n\t"
PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
"pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
"pand %%mm7, %%mm4 \n\t"
"movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
"movq (%0, %1, 4), %%mm2 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"pxor %%mm1, %%mm2 \n\t"
"paddb %%mm4, %%mm0 \n\t"
"psubb %%mm4, %%mm2 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"pxor %%mm1, %%mm2 \n\t"
"movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
"movq %%mm2, (%0, %1, 4) \n\t"
:
: "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
NAMED_CONSTRAINTS_ADD(b80,b00,b01)
: "%"FF_REG_a, "%"FF_REG_c
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
{
int x;
src-= stride;
for(x=0; x<BLOCK_SIZE; x++){
const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
if(FFABS(middleEnergy)< 8*QP){
const int q=(src[l4] - src[l5])/2;
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
d= FFMAX(d, 0);
d= (5*d + 32) >> 6;
d*= FFSIGN(-middleEnergy);
if(q>0){
d= d<0 ? 0 : d;
d= d>q ? q : d;
}else{
d= d>0 ? 0 : d;
d= d<q ? q : d;
}
src[l4]-= d;
src[l5]+= d;
src++;
}
src-=8;
for(x=0; x<8; x++){
int y;
for(y=4; y<6; y++){
int d= src[x+y*stride] - tmp[x+(y-4)*8];
int ad= FFABS(d);
static int max=0;
static int sum=0;
static int num=0;
static int bias=0;
if(max<ad) max=ad;
sum+= ad>3 ? 1 : 0;
if(ad>3){
src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
}
if(y==4) bias+=d;
num++;
if(num%1000000 == 0){
av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
}
DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
src+= stride*4;
"pxor %%mm7, %%mm7 \n\t"
// 0 1 2 3 4 5 6 7
// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
"movq (%0), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
"punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
"movq (%0, %1), %%mm2 \n\t"
"lea (%0, %1, 2), %%"FF_REG_a" \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
"punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
"movq (%%"FF_REG_a"), %%mm4 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
"punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
"paddw %%mm0, %%mm0 \n\t" // 2L0
"paddw %%mm1, %%mm1 \n\t" // 2H0
"psubw %%mm4, %%mm2 \n\t" // L1 - L2
"psubw %%mm5, %%mm3 \n\t" // H1 - H2
"psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
"psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
"psllw $2, %%mm2 \n\t" // 4L1 - 4L2
"psllw $2, %%mm3 \n\t" // 4H1 - 4H2
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
"movq (%%"FF_REG_a", %1), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L3
"punpckhbw %%mm7, %%mm3 \n\t" // H3
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
"psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
"movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
"movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" // L4
"punpckhbw %%mm7, %%mm1 \n\t" // H4
"psubw %%mm0, %%mm2 \n\t" // L3 - L4
"psubw %%mm1, %%mm3 \n\t" // H3 - H4
"movq %%mm2, 16(%3) \n\t" // L3 - L4
"movq %%mm3, 24(%3) \n\t" // H3 - H4
"paddw %%mm4, %%mm4 \n\t" // 2L2
"paddw %%mm5, %%mm5 \n\t" // 2H2
"psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
"psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
"lea (%%"FF_REG_a", %1), %0 \n\t"
"psllw $2, %%mm2 \n\t" // 4L3 - 4L4
"psllw $2, %%mm3 \n\t" // 4H3 - 4H4
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
"movq (%0, %1, 2), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L5
"punpckhbw %%mm7, %%mm3 \n\t" // H5
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
"psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
"psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
"movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t" // L6
"psubw %%mm6, %%mm2 \n\t" // L5 - L6
"movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
"punpckhbw %%mm7, %%mm6 \n\t" // H6
"psubw %%mm6, %%mm3 \n\t" // H5 - H6
"paddw %%mm0, %%mm0 \n\t" // 2L4
"paddw %%mm1, %%mm1 \n\t" // 2H4
"psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
"psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
"psllw $2, %%mm2 \n\t" // 4L5 - 4L6
"psllw $2, %%mm3 \n\t" // 4H5 - 4H6
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
"movq (%0, %1, 4), %%mm2 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" // L7
"punpckhbw %%mm7, %%mm3 \n\t" // H7
"paddw %%mm2, %%mm2 \n\t" // 2L7
"paddw %%mm3, %%mm3 \n\t" // 2H7
"psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
"psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
"movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
"movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm0, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm1, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm2, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
"movq %%mm7, %%mm6 \n\t" // 0
"psubw %%mm3, %%mm6 \n\t"
"pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm0, %%mm6 \n\t"
"pxor %%mm6, %%mm0 \n\t"
"psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm1, %%mm6 \n\t"
"pxor %%mm6, %%mm1 \n\t"
"psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm2, %%mm6 \n\t"
"pxor %%mm6, %%mm2 \n\t"
"psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm3, %%mm6 \n\t"
"pxor %%mm6, %%mm3 \n\t"
"psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
"pminsw %%mm2, %%mm0 \n\t"
"pminsw %%mm3, %%mm1 \n\t"
"movq %%mm0, %%mm6 \n\t"
"psubusw %%mm2, %%mm6 \n\t"
"psubw %%mm6, %%mm0 \n\t"
"movq %%mm1, %%mm6 \n\t"
"psubusw %%mm3, %%mm6 \n\t"
"psubw %%mm6, %%mm1 \n\t"
"movd %2, %%mm2 \n\t" // QP
"punpcklbw %%mm7, %%mm2 \n\t"
"movq %%mm7, %%mm6 \n\t" // 0
"pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
"pxor %%mm6, %%mm4 \n\t"
"psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
"pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
"pxor %%mm7, %%mm5 \n\t"
"psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
"psllw $3, %%mm2 \n\t" // 8QP
"movq %%mm2, %%mm3 \n\t" // 8QP
"pcmpgtw %%mm4, %%mm2 \n\t"
"pcmpgtw %%mm5, %%mm3 \n\t"
"pand %%mm2, %%mm4 \n\t"
"pand %%mm3, %%mm5 \n\t"
"psubusw %%mm0, %%mm4 \n\t" // hd
"psubusw %%mm1, %%mm5 \n\t" // ld
"movq "MANGLE(w05)", %%mm2 \n\t" // 5
"pmullw %%mm2, %%mm4 \n\t"
"pmullw %%mm2, %%mm5 \n\t"
"movq "MANGLE(w20)", %%mm2 \n\t" // 32
"paddw %%mm2, %%mm4 \n\t"
"paddw %%mm2, %%mm5 \n\t"
"psrlw $6, %%mm4 \n\t"
"psrlw $6, %%mm5 \n\t"