Newer
Older
* Copyright (c) 2000, 2001 Fabrice Bellard
Michael Niedermayer
committed
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
*
Diego Biurrun
committed
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
Diego Biurrun
committed
* version 2.1 of the License, or (at your option) any later version.
Diego Biurrun
committed
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public
Diego Biurrun
committed
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
* @file libavcodec/dsputil.c
#include "simple_idct.h"
Aurelien Jacobs
committed
#include "mathops.h"
#include "h263.h"
/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
const uint8_t ff_zigzag_direct[64] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
0, 8, 1, 9, 16, 24, 2, 10,
17, 25, 32, 40, 48, 56, 33, 41,
18, 26, 3, 11, 4, 12, 19, 27,
34, 42, 49, 57, 50, 58, 35, 43,
20, 28, 5, 13, 6, 14, 21, 29,
36, 44, 51, 59, 52, 60, 37, 45,
22, 30, 7, 15, 23, 31, 38, 46,
53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
const uint8_t ff_alternate_horizontal_scan[64] = {
13, 12, 19, 18, 24, 25, 32, 33,
30, 31, 34, 35, 40, 41, 48, 49,
46, 47, 50, 51, 56, 57, 58, 59,
52, 53, 54, 55, 60, 61, 62, 63,
};
const uint8_t ff_alternate_vertical_scan[64] = {
41, 33, 26, 18, 3, 11, 4, 12,
51, 59, 20, 28, 5, 13, 6, 14,
53, 61, 22, 30, 7, 15, 23, 31,
38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
Alexander Strange
committed
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
int i;
int end;
st->scantable= src_scantable;
for(i=0; i<64; i++){
int j;
j = src_scantable[i];
st->permutated[i] = permutation[j];
st->inverse[j] = i;
#endif
}
end=-1;
for(i=0; i<64; i++){
int j;
j = st->permutated[i];
if(j>end) end=j;
st->raster_end[i]= end;
}
}
static int pix_sum_c(uint8_t * pix, int line_size)
{
int s, i, j;
s = 0;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
s += pix[0];
s += pix[1];
s += pix[2];
s += pix[3];
s += pix[4];
s += pix[5];
s += pix[6];
s += pix[7];
pix += 8;
}
pix += line_size - 16;
static int pix_norm1_c(uint8_t * pix, int line_size)
for (j = 0; j < 16; j += 8) {
Felix von Leitner
committed
#if 0
s += sq[pix[0]];
s += sq[pix[1]];
s += sq[pix[2]];
s += sq[pix[3]];
s += sq[pix[4]];
s += sq[pix[5]];
s += sq[pix[6]];
s += sq[pix[7]];
Felix von Leitner
committed
#else
#if LONG_MAX > 2147483647
register uint64_t x=*(uint64_t*)pix;
s += sq[x&0xff];
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
Felix von Leitner
committed
s += sq[(x>>32)&0xff];
s += sq[(x>>40)&0xff];
s += sq[(x>>48)&0xff];
s += sq[(x>>56)&0xff];
#else
register uint32_t x=*(uint32_t*)pix;
s += sq[x&0xff];
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
Felix von Leitner
committed
x=*(uint32_t*)(pix+4);
s += sq[x&0xff];
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
#endif
#endif
pix += 8;
}
pix += line_size - 16;
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
for(i=0; i+8<=w; i+=8){
dst[i+0]= bswap_32(src[i+0]);
dst[i+1]= bswap_32(src[i+1]);
dst[i+2]= bswap_32(src[i+2]);
dst[i+3]= bswap_32(src[i+3]);
dst[i+4]= bswap_32(src[i+4]);
dst[i+5]= bswap_32(src[i+5]);
dst[i+6]= bswap_32(src[i+6]);
dst[i+7]= bswap_32(src[i+7]);
}
for(;i<w; i++){
dst[i+0]= bswap_32(src[i+0]);
}
}
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
int s, i;
s = 0;
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
pix1 += line_size;
pix2 += line_size;
}
return s;
}
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
s += sq[pix1[4] - pix2[4]];
s += sq[pix1[5] - pix2[5]];
s += sq[pix1[6] - pix2[6]];
s += sq[pix1[7] - pix2[7]];
pix1 += line_size;
pix2 += line_size;
}
return s;
}
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
int s, i;
s += sq[pix1[ 0] - pix2[ 0]];
s += sq[pix1[ 1] - pix2[ 1]];
s += sq[pix1[ 2] - pix2[ 2]];
s += sq[pix1[ 3] - pix2[ 3]];
s += sq[pix1[ 4] - pix2[ 4]];
s += sq[pix1[ 5] - pix2[ 5]];
s += sq[pix1[ 6] - pix2[ 6]];
s += sq[pix1[ 7] - pix2[ 7]];
s += sq[pix1[ 8] - pix2[ 8]];
s += sq[pix1[ 9] - pix2[ 9]];
s += sq[pix1[10] - pix2[10]];
s += sq[pix1[11] - pix2[11]];
s += sq[pix1[12] - pix2[12]];
s += sq[pix1[13] - pix2[13]];
s += sq[pix1[14] - pix2[14]];
s += sq[pix1[15] - pix2[15]];
Felix von Leitner
committed
pix1 += line_size;
pix2 += line_size;
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
int s, i, j;
const int dec_count= w==8 ? 3 : 4;
Loren Merritt
committed
int tmp[32*32];
static const int scale[2][2][4][4]={
Loren Merritt
committed
// 9/7 8x8 dec=3
{268, 239, 239, 213},
{ 0, 224, 224, 152},
{ 0, 135, 135, 110},
},{
Loren Merritt
committed
// 9/7 16x16 or 32x32 dec=4
{344, 310, 310, 280},
{ 0, 320, 320, 228},
{ 0, 175, 175, 136},
{ 0, 129, 129, 102},
}
},{
Loren Merritt
committed
{
// 5/3 8x8 dec=3
{275, 245, 245, 218},
{ 0, 230, 230, 156},
{ 0, 138, 138, 113},
},{
Loren Merritt
committed
// 5/3 16x16 or 32x32 dec=4
{352, 317, 317, 286},
{ 0, 328, 328, 233},
{ 0, 180, 180, 140},
{ 0, 132, 132, 105},
}
}
};
for (i = 0; i < h; i++) {
for (j = 0; j < w; j+=4) {
Loren Merritt
committed
tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
}
pix1 += line_size;
pix2 += line_size;
}
Loren Merritt
committed
ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
Loren Merritt
committed
assert(w==h);
for(level=0; level<dec_count; level++){
for(ori= level ? 1 : 0; ori<4; ori++){
Loren Merritt
committed
int size= w>>(dec_count-level);
int sx= (ori&1) ? size : 0;
int stride= 32<<(dec_count-level);
for(i=0; i<size; i++){
for(j=0; j<size; j++){
int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
Loren Merritt
committed
return s>>9;
}
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
Loren Merritt
committed
return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
Loren Merritt
committed
return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
Loren Merritt
committed
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
uint8_t *ptr, *last_line;
int i;
last_line = buf + (height - 1) * wrap;
for(i=0;i<w;i++) {
/* top and bottom */
memcpy(buf - (i + 1) * wrap, buf, width);
memcpy(last_line + (i + 1) * wrap, last_line, width);
}
/* left and right */
ptr = buf;
for(i=0;i<height;i++) {
memset(ptr - w, ptr[0], w);
memset(ptr + width, ptr[width-1], w);
ptr += wrap;
}
/* corners */
for(i=0;i<w;i++) {
memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
}
}
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
/**
* Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
* @param buf destination buffer
* @param src source buffer
* @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
* @param block_w width of block
* @param block_h height of block
* @param src_x x coordinate of the top left sample of the block in the source buffer
* @param src_y y coordinate of the top left sample of the block in the source buffer
* @param w width of the source buffer
* @param h height of the source buffer
*/
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h){
int x, y;
int start_y, start_x, end_y, end_x;
if(src_y>= h){
src+= (h-1-src_y)*linesize;
src_y=h-1;
}else if(src_y<=-block_h){
src+= (1-block_h-src_y)*linesize;
src_y=1-block_h;
}
if(src_x>= w){
src+= (w-1-src_x);
src_x=w-1;
}else if(src_x<=-block_w){
src+= (1-block_w-src_x);
src_x=1-block_w;
}
start_y= FFMAX(0, -src_y);
start_x= FFMAX(0, -src_x);
end_y= FFMIN(block_h, h-src_y);
end_x= FFMIN(block_w, w-src_x);
// copy existing part
for(y=start_y; y<end_y; y++){
for(x=start_x; x<end_x; x++){
buf[x + y*linesize]= src[x + y*linesize];
}
}
//top
for(y=0; y<start_y; y++){
for(x=start_x; x<end_x; x++){
buf[x + y*linesize]= buf[x + start_y*linesize];
}
}
//bottom
for(y=end_y; y<block_h; y++){
for(x=start_x; x<end_x; x++){
buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
}
}
for(y=0; y<block_h; y++){
//left
for(x=0; x<start_x; x++){
buf[x + y*linesize]= buf[start_x + y*linesize];
}
//right
for(x=end_x; x<block_w; x++){
buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
}
}
}
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
int i;
/* read the pixels */
for(i=0;i<8;i++) {
block[0] = pixels[0];
block[1] = pixels[1];
block[2] = pixels[2];
block[3] = pixels[3];
block[4] = pixels[4];
block[5] = pixels[5];
block[6] = pixels[6];
block[7] = pixels[7];
pixels += line_size;
block += 8;
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride){
int i;
/* read the pixels */
for(i=0;i<8;i++) {
block[0] = s1[0] - s2[0];
block[1] = s1[1] - s2[1];
block[2] = s1[2] - s2[2];
block[3] = s1[3] - s2[3];
block[4] = s1[4] - s2[4];
block[5] = s1[5] - s2[5];
block[6] = s1[6] - s2[6];
block[7] = s1[7] - s2[7];
s1 += stride;
s2 += stride;
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
pixels[4] = cm[block[4]];
pixels[5] = cm[block[5]];
pixels[6] = cm[block[6]];
pixels[7] = cm[block[7]];
pixels += line_size;
block += 8;
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
/* read the pixels */
for(i=0;i<4;i++) {
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
pixels += line_size;
block += 8;
}
}
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
/* read the pixels */
for(i=0;i<2;i++) {
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels += line_size;
block += 8;
}
}
static void put_signed_pixels_clamped_c(const DCTELEM *block,
uint8_t *restrict pixels,
int line_size)
{
int i, j;
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
if (*block < -128)
*pixels = 0;
else if (*block > 127)
*pixels = 255;
else
*pixels = (uint8_t)(*block + 128);
block++;
pixels++;
}
pixels += (line_size - 8);
}
}
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
pixels[4] = cm[pixels[4] + block[4]];
pixels[5] = cm[pixels[5] + block[5]];
pixels[6] = cm[pixels[6] + block[6]];
pixels[7] = cm[pixels[7] + block[7]];
pixels += line_size;
block += 8;
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
int line_size)
{
int i;
/* read the pixels */
for(i=0;i<4;i++) {
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
pixels += line_size;
block += 8;
}
}
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
int line_size)
{
int i;
/* read the pixels */
for(i=0;i<2;i++) {
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels += line_size;
block += 8;
}
}
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
int i;
for(i=0;i<8;i++) {
pixels[0] += block[0];
pixels[1] += block[1];
pixels[2] += block[2];
pixels[3] += block[3];
pixels[4] += block[4];
pixels[5] += block[5];
pixels[6] += block[6];
pixels[7] += block[7];
pixels += line_size;
block += 8;
}
}
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
int i;
for(i=0;i<4;i++) {
pixels[0] += block[0];
pixels[1] += block[1];
pixels[2] += block[2];
pixels[3] += block[3];
pixels += line_size;
block += 4;
}
}
static int sum_abs_dctelem_c(DCTELEM *block)
{
int sum=0, i;
for(i=0; i<64; i++)
sum+= FFABS(block[i]);
return sum;
}
#if 0
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
OP(*((uint64_t*)block), AV_RN64(pixels));\
pixels+=line_size;\
block +=line_size;\
}\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= AV_RN64(pixels );\
const uint64_t b= AV_RN64(pixels+1);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
pixels+=line_size;\
block +=line_size;\
}\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= AV_RN64(pixels );\
const uint64_t b= AV_RN64(pixels+1);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
pixels+=line_size;\
block +=line_size;\
}\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= AV_RN64(pixels );\
const uint64_t b= AV_RN64(pixels+line_size);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
pixels+=line_size;\
block +=line_size;\
}\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= AV_RN64(pixels );\
const uint64_t b= AV_RN64(pixels+line_size);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
pixels+=line_size;\
block +=line_size;\
}\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= AV_RN64(pixels );\
const uint64_t b= AV_RN64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
uint64_t l1,h1;\
\
pixels+=line_size;\
for(i=0; i<h; i+=2){\
uint64_t a= AV_RN64(pixels );\
uint64_t b= AV_RN64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
pixels+=line_size;\
block +=line_size;\
a= AV_RN64(pixels );\
b= AV_RN64(pixels+1);\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
pixels+=line_size;\
block +=line_size;\
}\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= AV_RN64(pixels );\
const uint64_t b= AV_RN64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
uint64_t l1,h1;\
\
pixels+=line_size;\
for(i=0; i<h; i+=2){\
uint64_t a= AV_RN64(pixels );\
uint64_t b= AV_RN64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
pixels+=line_size;\
block +=line_size;\
a= AV_RN64(pixels );\
b= AV_RN64(pixels+1);\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
pixels+=line_size;\
block +=line_size;\
}\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
#define PIXOP2(OPNAME, OP) \
Michael Niedermayer
committed
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
int i;\
for(i=0; i<h; i++){\
OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
Michael Niedermayer
committed
pixels+=line_size;\
block +=line_size;\
}\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
int i;\
for(i=0; i<h; i++){\
OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
pixels+=line_size;\
block +=line_size;\
}\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
pixels+=line_size;\
block +=line_size;\
}\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= AV_RN32(&src1[i*src_stride1 ]);\
b= AV_RN32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
a= AV_RN32(&src1[i*src_stride1+4]);\
b= AV_RN32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= AV_RN32(&src1[i*src_stride1 ]);\
b= AV_RN32(&src2[i*src_stride2 ]);\
a= AV_RN32(&src1[i*src_stride1+4]);\
b= AV_RN32(&src2[i*src_stride2+4]);\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
int i;\
for(i=0; i<h; i++){\
uint32_t a,b;\
a= AV_RN32(&src1[i*src_stride1 ]);\
b= AV_RN32(&src2[i*src_stride2 ]);\
Michael Niedermayer
committed
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
int i;\
for(i=0; i<h; i++){\
uint32_t a,b;\
a= AV_RN16(&src1[i*src_stride1 ]);\
b= AV_RN16(&src2[i*src_stride2 ]);\
Michael Niedermayer
committed
OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
}\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\