Newer
Older
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of Libav.
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#define _SVID_SOURCE // needed for MAP_ANONYMOUS
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#if HAVE_SYS_MMAN_H
#include <sys/mman.h>
#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif
#if HAVE_VIRTUALALLOC
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/x86/asm.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
/**
 * Report the libswscale version this file was compiled as.
 *
 * @return the value of LIBSWSCALE_VERSION_INT
 */
unsigned swscale_version(void)
{
    const unsigned version = LIBSWSCALE_VERSION_INT;

    return version;
}
/**
 * Report the build-time configuration string.
 *
 * @return the LIBAV_CONFIGURATION string
 */
const char *swscale_configuration(void)
{
    const char *configuration = LIBAV_CONFIGURATION;

    return configuration;
}
const char *swscale_license(void)
{
#define LICENSE_PREFIX "libswscale license: "
return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
}
/* Byte stored after each code fragment emitted by initMMX2HScaler(). */
#define RET 0xC3 // near return opcode for x86
/** Per-pixel-format capability flags; one entry per format in format_entries[]. */
typedef struct FormatEntry {
    int is_supported_in;  /**< non-zero if the format is accepted as input  */
    int is_supported_out; /**< non-zero if the format is accepted as output */
} FormatEntry;
static const FormatEntry format_entries[PIX_FMT_NB] = {
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
[PIX_FMT_YUV420P] = { 1, 1 },
[PIX_FMT_YUYV422] = { 1, 1 },
[PIX_FMT_RGB24] = { 1, 1 },
[PIX_FMT_BGR24] = { 1, 1 },
[PIX_FMT_YUV422P] = { 1, 1 },
[PIX_FMT_YUV444P] = { 1, 1 },
[PIX_FMT_YUV410P] = { 1, 1 },
[PIX_FMT_YUV411P] = { 1, 1 },
[PIX_FMT_GRAY8] = { 1, 1 },
[PIX_FMT_MONOWHITE] = { 1, 1 },
[PIX_FMT_MONOBLACK] = { 1, 1 },
[PIX_FMT_PAL8] = { 1, 0 },
[PIX_FMT_YUVJ420P] = { 1, 1 },
[PIX_FMT_YUVJ422P] = { 1, 1 },
[PIX_FMT_YUVJ444P] = { 1, 1 },
[PIX_FMT_UYVY422] = { 1, 1 },
[PIX_FMT_UYYVYY411] = { 0, 0 },
[PIX_FMT_BGR8] = { 1, 1 },
[PIX_FMT_BGR4] = { 0, 1 },
[PIX_FMT_BGR4_BYTE] = { 1, 1 },
[PIX_FMT_RGB8] = { 1, 1 },
[PIX_FMT_RGB4] = { 0, 1 },
[PIX_FMT_RGB4_BYTE] = { 1, 1 },
[PIX_FMT_NV12] = { 1, 1 },
[PIX_FMT_NV21] = { 1, 1 },
[PIX_FMT_ARGB] = { 1, 1 },
[PIX_FMT_RGBA] = { 1, 1 },
[PIX_FMT_ABGR] = { 1, 1 },
[PIX_FMT_BGRA] = { 1, 1 },
[PIX_FMT_GRAY16BE] = { 1, 1 },
[PIX_FMT_GRAY16LE] = { 1, 1 },
[PIX_FMT_YUV440P] = { 1, 1 },
[PIX_FMT_YUVJ440P] = { 1, 1 },
[PIX_FMT_YUVA420P] = { 1, 1 },
[PIX_FMT_RGB48BE] = { 1, 1 },
[PIX_FMT_RGB48LE] = { 1, 1 },
[PIX_FMT_RGB565BE] = { 1, 1 },
[PIX_FMT_RGB565LE] = { 1, 1 },
[PIX_FMT_RGB555BE] = { 1, 1 },
[PIX_FMT_RGB555LE] = { 1, 1 },
[PIX_FMT_BGR565BE] = { 1, 1 },
[PIX_FMT_BGR565LE] = { 1, 1 },
[PIX_FMT_BGR555BE] = { 1, 1 },
[PIX_FMT_BGR555LE] = { 1, 1 },
[PIX_FMT_YUV420P16LE] = { 1, 1 },
[PIX_FMT_YUV420P16BE] = { 1, 1 },
[PIX_FMT_YUV422P16LE] = { 1, 1 },
[PIX_FMT_YUV422P16BE] = { 1, 1 },
[PIX_FMT_YUV444P16LE] = { 1, 1 },
[PIX_FMT_YUV444P16BE] = { 1, 1 },
[PIX_FMT_RGB444LE] = { 1, 1 },
[PIX_FMT_RGB444BE] = { 1, 1 },
[PIX_FMT_BGR444LE] = { 1, 1 },
[PIX_FMT_BGR444BE] = { 1, 1 },
[PIX_FMT_Y400A] = { 1, 0 },
[PIX_FMT_BGR48BE] = { 1, 1 },
[PIX_FMT_BGR48LE] = { 1, 1 },
[PIX_FMT_YUV420P9BE] = { 1, 1 },
[PIX_FMT_YUV420P9LE] = { 1, 1 },
[PIX_FMT_YUV420P10BE] = { 1, 1 },
[PIX_FMT_YUV420P10LE] = { 1, 1 },
[PIX_FMT_YUV422P9BE] = { 1, 1 },
[PIX_FMT_YUV422P9LE] = { 1, 1 },
[PIX_FMT_YUV422P10BE] = { 1, 1 },
[PIX_FMT_YUV422P10LE] = { 1, 1 },
[PIX_FMT_YUV444P9BE] = { 1, 1 },
[PIX_FMT_YUV444P9LE] = { 1, 1 },
[PIX_FMT_YUV444P10BE] = { 1, 1 },
[PIX_FMT_YUV444P10LE] = { 1, 1 },
[PIX_FMT_GBRP] = { 1, 0 },
[PIX_FMT_GBRP9LE] = { 1, 0 },
[PIX_FMT_GBRP9BE] = { 1, 0 },
[PIX_FMT_GBRP10LE] = { 1, 0 },
[PIX_FMT_GBRP10BE] = { 1, 0 },
[PIX_FMT_GBRP16LE] = { 1, 0 },
[PIX_FMT_GBRP16BE] = { 1, 0 },
/**
 * Tell whether a pixel format is accepted as scaler input.
 *
 * @param pix_fmt pixel format to look up
 * @return the is_supported_in flag from format_entries[], or 0 when pix_fmt
 *         lies outside [0, PIX_FMT_NB)
 */
int sws_isSupportedInput(enum PixelFormat pix_fmt)
{
    /* The unsigned cast makes negative values fail the range check too. */
    if ((unsigned)pix_fmt >= PIX_FMT_NB)
        return 0;

    return format_entries[pix_fmt].is_supported_in;
}
/**
 * Tell whether a pixel format is accepted as scaler output.
 *
 * @param pix_fmt pixel format to look up
 * @return the is_supported_out flag from format_entries[], or 0 when pix_fmt
 *         lies outside [0, PIX_FMT_NB)
 */
int sws_isSupportedOutput(enum PixelFormat pix_fmt)
{
    /* The unsigned cast makes negative values fail the range check too. */
    if ((unsigned)pix_fmt >= PIX_FMT_NB)
        return 0;

    return format_entries[pix_fmt].is_supported_out;
}
/* Declaration only: this 8x4 table is defined outside the visible part of this file. */
extern const int32_t ff_yuv2rgb_coeffs[8][4];
/**
 * Look up the printable name of a pixel format.
 *
 * @param format pixel format to name
 * @return the name stored in av_pix_fmt_descriptors[], or "Unknown format"
 *         when format is out of range or its descriptor has no name
 */
const char *sws_format_name(enum PixelFormat format)
{
    /* Range check first so the descriptor table is never indexed out of bounds. */
    if ((unsigned)format >= PIX_FMT_NB)
        return "Unknown format";

    if (!av_pix_fmt_descriptors[format].name)
        return "Unknown format";

    return av_pix_fmt_descriptors[format].name;
}
/**
 * Evaluate a piecewise cubic at distance dist.
 *
 * For dist <= 1 the value is a + b*dist + c*dist^2 + d*dist^3 (Horner form).
 * For larger distances the coefficients are transformed once per unit step
 * and dist is reduced by 1 until it falls into the first segment.
 *
 * @param a    constant coefficient of the first segment
 * @param b    linear coefficient of the first segment
 * @param c    quadratic coefficient of the first segment
 * @param d    cubic coefficient of the first segment
 * @param dist distance at which to evaluate
 * @return the value of the piecewise cubic at dist
 */
static double getSplineCoeff(double a, double b, double c, double d,
                             double dist)
{
    /* Written as !(dist <= 1.0) so the comparison is exactly the one that
     * decides between evaluating and stepping to the next segment. */
    while (!(dist <= 1.0)) {
        const double next_b =  b + 2.0 * c + 3.0 * d;
        const double next_c =  c + 3.0 * d;
        const double next_d = -b - 3.0 * c - 6.0 * d;

        a     = 0.0;
        b     = next_b;
        c     = next_c;
        d     = next_d;
        dist -= 1.0;
    }

    return ((d * dist + c) * dist + b) * dist + a;
}
/**
 * Build the filter coefficients and source positions for one scaling
 * direction.
 *
 * Steps, in order:
 *  1. compute 64-bit coefficients (scaled by fone = 1 << 54) for the scaling
 *     algorithm selected in flags,
 *  2. convolve them with srcFilter when one is given,
 *  3. trim near-zero taps on both sides and round the tap count up to
 *     filterAlign,
 *  4. for horizontal filters, fold taps that would read outside [0, srcW)
 *     into the edge taps,
 *  5. normalize every row so that its coefficients sum to roughly `one` and
 *     store the result as int16_t.
 *
 * @param[out] outFilter     receives a newly allocated array of
 *                           *outFilterSize * (dstW + 3) coefficients
 * @param[out] filterPos     receives a newly allocated array of dstW + 3
 *                           source start positions
 * @param[out] outFilterSize receives the number of taps per output sample
 * @param xInc               source step per output sample in 16.16 fixed
 *                           point (0x10000 means unscaled)
 * @param srcW               source length in samples
 * @param dstW               destination length in samples
 * @param filterAlign        tap-count alignment; the rounding uses a bit
 *                           mask, so this has to be a power of two
 * @param one                integer value the normalized coefficients of a
 *                           row sum to
 * @param flags              SWS_* flags selecting algorithm and options
 * @param cpu_flags          AV_CPU_FLAG_* bits used to tune filterAlign
 * @param srcFilter          optional vector convolved with the filter,
 *                           may be NULL
 * @param dstFilter          optional vector; only its length is used here
 *                           (it enlarges the intermediate filter), may be NULL
 * @param param              algorithm tuning values; SWS_PARAM_DEFAULT
 *                           selects the built-in default
 * @param is_horizontal      non-zero to clamp positions to the source width
 * @return 0 on success, -1 on failure. The temporary buffers are always
 *         released here; *filterPos and *outFilter are not released on
 *         failure (presumably the caller's job -- confirm against callers).
 */
static int initFilter(int16_t **outFilter, int32_t **filterPos,
                      int *outFilterSize, int xInc, int srcW, int dstW,
                      int filterAlign, int one, int flags, int cpu_flags,
                      SwsVector *srcFilter, SwsVector *dstFilter,
                      double param[2], int is_horizontal)
{
    int i;
    int filterSize;
    int filter2Size;
    int minFilterSize;
    int64_t *filter    = NULL;
    int64_t *filter2   = NULL;
    const int64_t fone = 1LL << 54;
    int ret            = -1;

    emms_c(); // FIXME should not be required but IS (even for non-MMX versions)

    // NOTE: the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end
    FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW + 3) * sizeof(**filterPos), fail);

    if (FFABS(xInc - 0x10000) < 10) { // unscaled
        int i;
        filterSize = 1;
        FF_ALLOCZ_OR_GOTO(NULL, filter,
                          dstW * sizeof(*filter) * filterSize, fail);

        for (i = 0; i < dstW; i++) {
            filter[i * filterSize] = fone;
            (*filterPos)[i]        = i;
        }
    } else if (flags & SWS_POINT) { // lame looking point sampling mode
        int i;
        int xDstInSrc;
        filterSize = 1;
        FF_ALLOC_OR_GOTO(NULL, filter,
                         dstW * sizeof(*filter) * filterSize, fail);

        xDstInSrc = xInc / 2 - 0x8000;
        for (i = 0; i < dstW; i++) {
            int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;

            (*filterPos)[i] = xx;
            filter[i]       = fone;
            xDstInSrc      += xInc;
        }
    } else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||
               (flags & SWS_FAST_BILINEAR)) { // bilinear upscale
        int i;
        int xDstInSrc;
        filterSize = 2;
        FF_ALLOC_OR_GOTO(NULL, filter,
                         dstW * sizeof(*filter) * filterSize, fail);

        xDstInSrc = xInc / 2 - 0x8000;
        for (i = 0; i < dstW; i++) {
            int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
            int j;

            (*filterPos)[i] = xx;
            // bilinear upscale / linear interpolate / area averaging
            for (j = 0; j < filterSize; j++) {
                int64_t coeff = fone - FFABS((xx << 16) - xDstInSrc) *
                                (fone >> 16);
                if (coeff < 0)
                    coeff = 0;
                filter[i * filterSize + j] = coeff;
                xx++;
            }
            /* advance to the next output sample, as the point-sampling
             * loop above does */
            xDstInSrc += xInc;
        }
    } else {
        int64_t xDstInSrc;
        int sizeFactor;

        if (flags & SWS_BICUBIC)
            sizeFactor = 4;
        else if (flags & SWS_X)
            sizeFactor = 8;
        else if (flags & SWS_AREA)
            sizeFactor = 1;     // downscale only, for upscale it is bilinear
        else if (flags & SWS_GAUSS)
            sizeFactor = 8;     // infinite ;)
        else if (flags & SWS_LANCZOS)
            sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6;
        else if (flags & SWS_SINC)
            sizeFactor = 20;    // infinite ;)
        else if (flags & SWS_SPLINE)
            sizeFactor = 20;    // infinite ;)
        else if (flags & SWS_BILINEAR)
            sizeFactor = 2;
        else {
            sizeFactor = 0;     // GCC warning killer
            assert(0);
        }

        if (xInc <= 1 << 16)
            filterSize = 1 + sizeFactor;    // upscale
        else
            filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;

        filterSize = FFMIN(filterSize, srcW - 2);
        filterSize = FFMAX(filterSize, 1);

        FF_ALLOC_OR_GOTO(NULL, filter,
                         dstW * sizeof(*filter) * filterSize, fail);

        /* In this branch positions are tracked at twice the 16.16
         * resolution: note the << 17 and / (1 << 17) below. */
        xDstInSrc = xInc - 0x10000;
        for (i = 0; i < dstW; i++) {
            int xx = (xDstInSrc - ((filterSize - 2) << 16)) / (1 << 17);
            int j;
            (*filterPos)[i] = xx;
            for (j = 0; j < filterSize; j++) {
                /* d is the tap distance in 2.30 fixed point */
                int64_t d = (FFABS(((int64_t)xx << 17) - xDstInSrc)) << 13;
                double floatd;
                int64_t coeff;

                if (xInc > 1 << 16)
                    d = d * dstW / srcW;
                floatd = d * (1.0 / (1 << 30));

                if (flags & SWS_BICUBIC) {
                    int64_t B = (param[0] != SWS_PARAM_DEFAULT ? param[0] :   0) * (1 << 24);
                    int64_t C = (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1 << 24);

                    if (d >= 1LL << 31) {
                        /* distance >= 2: outside the kernel support */
                        coeff = 0.0;
                    } else {
                        int64_t dd  = (d  * d) >> 30;
                        int64_t ddd = (dd * d) >> 30;

                        if (d < 1LL << 30)
                            coeff =  (12 * (1 << 24) - 9 * B - 6 * C) * ddd +
                                    (-18 * (1 << 24) + 12 * B + 6 * C) *  dd +
                                      (6 * (1 << 24) -  2 * B)         * (1 << 30);
                        else
                            coeff =      (-B -  6 * C) * ddd +
                                      (6 * B + 30 * C) * dd  +
                                    (-12 * B - 48 * C) * d   +
                                      (8 * B + 24 * C) * (1 << 30);
                    }
                    /* polynomial is in 30+24 fractional bits; rescale to fone */
                    coeff *= fone >> (30 + 24);
                }
#if 0
                else if (flags & SWS_X) {
                    double p = param ? param * 0.01 : 0.3;
                    coeff = d ? sin(d * M_PI) / (d * M_PI) : 1.0;
                    coeff *= pow(2.0, -p * d * d);
                }
#endif
                else if (flags & SWS_X) {
                    double A = param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
                    double c;

                    if (floatd < 1.0)
                        c = cos(floatd * M_PI);
                    else
                        c = -1.0;
                    if (c < 0.0)
                        c = -pow(-c, A);
                    else
                        c = pow(c, A);
                    coeff = (c * 0.5 + 0.5) * fone;
                } else if (flags & SWS_AREA) {
                    int64_t d2 = d - (1 << 29);
                    if (d2 * xInc < -(1LL << (29 + 16)))
                        coeff = 1.0 * (1LL << (30 + 16));
                    else if (d2 * xInc < (1LL << (29 + 16)))
                        coeff = -d2 * xInc + (1LL << (29 + 16));
                    else
                        coeff = 0.0;
                    coeff *= fone >> (30 + 16);
                } else if (flags & SWS_GAUSS) {
                    double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
                    coeff = (pow(2.0, -p * floatd * floatd)) * fone;
                } else if (flags & SWS_SINC) {
                    coeff = (d ? sin(floatd * M_PI) / (floatd * M_PI) : 1.0) * fone;
                } else if (flags & SWS_LANCZOS) {
                    double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
                    coeff = (d ? sin(floatd * M_PI) * sin(floatd * M_PI / p) /
                             (floatd * floatd * M_PI * M_PI / p) : 1.0) * fone;
                    if (floatd > p)
                        coeff = 0;
                } else if (flags & SWS_BILINEAR) {
                    coeff = (1 << 30) - d;
                    if (coeff < 0)
                        coeff = 0;
                    coeff *= fone >> 30;
                } else if (flags & SWS_SPLINE) {
                    double p = -2.196152422706632;
                    coeff = getSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;
                } else {
                    coeff = 0.0; // GCC warning killer
                    assert(0);
                }

                filter[i * filterSize + j] = coeff;
                xx++;
            }
            /* advance to the next output sample; doubled because of the
             * doubled position resolution used in this branch */
            xDstInSrc += 2 * xInc;
        }
    }

    /* apply src & dst Filter to filter -> filter2
     * av_free(filter);
     */
    assert(filterSize > 0);
    filter2Size = filterSize;
    if (srcFilter)
        filter2Size += srcFilter->length - 1;
    if (dstFilter)
        filter2Size += dstFilter->length - 1;
    assert(filter2Size > 0);
    FF_ALLOCZ_OR_GOTO(NULL, filter2, filter2Size * dstW * sizeof(*filter2), fail);

    for (i = 0; i < dstW; i++) {
        int j, k;

        if (srcFilter) {
            for (k = 0; k < srcFilter->length; k++) {
                for (j = 0; j < filterSize; j++)
                    filter2[i * filter2Size + k + j] +=
                        srcFilter->coeff[k] * filter[i * filterSize + j];
            }
        } else {
            for (j = 0; j < filterSize; j++)
                filter2[i * filter2Size + j] = filter[i * filterSize + j];
        }
        // FIXME dstFilter is not applied, only its length is accounted for

        (*filterPos)[i] += (filterSize - 1) / 2 - (filter2Size - 1) / 2;
    }
    av_freep(&filter);

    /* try to reduce the filter-size (step1 find size and shift left) */
    // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
    minFilterSize = 0;
    for (i = dstW - 1; i >= 0; i--) {
        int min = filter2Size;
        int j;
        int64_t cutOff = 0; /* accumulated magnitude of the dropped taps */

        /* get rid of near zero elements on the left by shifting left */
        for (j = 0; j < filter2Size; j++) {
            int k;
            cutOff += FFABS(filter2[i * filter2Size]);

            if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
                break;

            /* preserve monotonicity because the core can't handle the
             * filter otherwise */
            if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
                break;

            // move filter coefficients left
            for (k = 1; k < filter2Size; k++)
                filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
            filter2[i * filter2Size + k - 1] = 0;
            (*filterPos)[i]++;
        }

        /* the right-hand side gets its own cut-off budget */
        cutOff = 0;
        /* count near zeros on the right */
        for (j = filter2Size - 1; j > 0; j--) {
            cutOff += FFABS(filter2[i * filter2Size + j]);

            if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
                break;
            min--;
        }

        if (min > minFilterSize)
            minFilterSize = min;
    }

    if (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) {
        // we can handle the special case 4, so we don't want to go the full 8
        if (minFilterSize < 5)
            filterAlign = 4;

        /* We really don't want to waste our time doing useless computation, so
         * fall back on the scalar C code for very small filters.
         * Vectorizing is worth it only if you have a decent-sized vector. */
        if (minFilterSize < 3)
            filterAlign = 1;
    }

    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
        // special case for unscaled vertical filtering
        if (minFilterSize == 1 && filterAlign == 2)
            filterAlign = 1;
    }

    assert(minFilterSize > 0);
    /* round up to a multiple of filterAlign */
    filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
    assert(filterSize > 0);
    filter = av_malloc(filterSize * dstW * sizeof(*filter));
    if (filterSize >= MAX_FILTER_SIZE * 16 /
                      ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)
        goto fail;
    *outFilterSize = filterSize;

    if (flags & SWS_PRINT_INFO)
        av_log(NULL, AV_LOG_VERBOSE,
               "SwScaler: reducing / aligning filtersize %d -> %d\n",
               filter2Size, filterSize);
    /* try to reduce the filter-size (step2 reduce it) */
    for (i = 0; i < dstW; i++) {
        int j;

        for (j = 0; j < filterSize; j++) {
            if (j >= filter2Size)
                filter[i * filterSize + j] = 0;
            else
                filter[i * filterSize + j] = filter2[i * filter2Size + j];
            if ((flags & SWS_BITEXACT) && j >= minFilterSize)
                filter[i * filterSize + j] = 0;
        }
    }

    // FIXME try to align filterPos if possible

    // fix borders
    if (is_horizontal) {
        for (i = 0; i < dstW; i++) {
            int j;
            if ((*filterPos)[i] < 0) {
                // move filter coefficients left to compensate for filterPos
                for (j = 1; j < filterSize; j++) {
                    int left = FFMAX(j + (*filterPos)[i], 0);
                    filter[i * filterSize + left] += filter[i * filterSize + j];
                    filter[i * filterSize + j]     = 0;
                }
                (*filterPos)[i] = 0;
            }

            if ((*filterPos)[i] + filterSize > srcW) {
                int shift = (*filterPos)[i] + filterSize - srcW;
                // move filter coefficients right to compensate for filterPos
                for (j = filterSize - 2; j >= 0; j--) {
                    int right = FFMIN(j + shift, filterSize - 1);
                    filter[i * filterSize + right] += filter[i * filterSize + j];
                    filter[i * filterSize + j]      = 0;
                }
                (*filterPos)[i] = srcW - filterSize;
            }
        }
    }

    // Note the +3 rows are for the MMX/SSE scaler which reads over the end
    /* align at 16 for AltiVec (needed by hScale_altivec_real) */
    FF_ALLOCZ_OR_GOTO(NULL, *outFilter,
                      *outFilterSize * (dstW + 3) * sizeof(int16_t), fail);

    /* normalize & store in outFilter */
    for (i = 0; i < dstW; i++) {
        int j;
        int64_t error = 0; /* rounding residue carried to the next tap */
        int64_t sum   = 0;

        for (j = 0; j < filterSize; j++) {
            sum += filter[i * filterSize + j];
        }
        sum = (sum + one / 2) / one;
        for (j = 0; j < *outFilterSize; j++) {
            int64_t v = filter[i * filterSize + j] + error;
            int intV  = ROUNDED_DIV(v, sum);
            (*outFilter)[i * (*outFilterSize) + j] = intV;
            error                                  = v - intV * sum;
        }
    }

    (*filterPos)[dstW + 0] =
    (*filterPos)[dstW + 1] =
    (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1]; /* the MMX/SSE scaler will
                                                      * read over the end */
    for (i = 0; i < *outFilterSize; i++) {
        int k = (dstW - 1) * (*outFilterSize) + i;
        (*outFilter)[k + 1 * (*outFilterSize)] =
        (*outFilter)[k + 2 * (*outFilterSize)] =
        (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
    }

    /* everything succeeded; the cleanup below is shared with the error path */
    ret = 0;

fail:
    av_free(filter);
    av_free(filter2);
    return ret;
}
#if HAVE_MMXEXT && HAVE_INLINE_ASM
/**
 * Generate (or just measure) the MMX2 horizontal fast-bilinear scaler code.
 *
 * Two template fragments are located with inline assembly; for every group
 * of four output pixels one of them is copied into filterCode and the
 * immediate bytes of its two pshufw instructions are patched.
 *
 * @param dstW       destination width in pixels
 * @param xInc       source step per output pixel, 16.16 fixed point
 * @param filterCode buffer receiving the generated code, or NULL to only
 *                   compute the required size; nothing is written through
 *                   filterCode, filter or filterPos in that case
 * @param filter     receives the per-pixel blend weights
 * @param filterPos  receives the source position for each group
 * @param numSplits  only dstW / numSplits output pixels are processed
 * @return number of code bytes needed, including the trailing RET byte
 */
static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,
                           int16_t *filter, int32_t *filterPos, int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMX2 code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */

    /* Fragment A. The template between labels 0 and 9 is jumped over, never
     * executed here. Outputs: %0 = address of label 0, %1 / %2 = offset from
     * label 0 of the byte just before label 1 / 2 (the pshufw immediates),
     * %3 = fragment length (label 9 - label 0). */
    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        // End
        "9: \n\t"
        // "int $3 \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"
        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
        );

    /* Fragment B: same layout and outputs as fragment A, but with a single
     * movd load. */
    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm0, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        // End
        "9: \n\t"
        // "int $3 \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"
        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
        );

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        /* one code fragment is emitted per group of four output pixels */
        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = (d + 1 < 4) ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = (d + 1 < 4) ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = (d + 1 < 4) ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = (d + 1 < 4) ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                /* each 2-bit field of the immediate selects one source word */
                filterCode[fragmentPos + imm8OfPShufW1] =  (a + inc)       |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] =  a       |
                                                          (b << 2) |
                                                          (c << 4) |
                                                          (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift;               // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3;   // align

                if (shift && i >= shift) {
                    /* 0x55 adds `shift` to all four 2-bit selectors at once */
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            /* terminate the code so far; overwritten by the next fragment */
            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        /* step to the source position of the next output pixel */
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16;  // needed to jump to the next part

    return fragmentPos + 1;
}
#endif /* HAVE_MMXEXT && HAVE_INLINE_ASM */
/**
 * Fetch the chroma subsampling shifts of a pixel format.
 *
 * @param[out] h      receives log2_chroma_w of the format's descriptor
 * @param[out] v      receives log2_chroma_h of the format's descriptor
 * @param      format pixel format; used unchecked as an index into
 *                    av_pix_fmt_descriptors[]
 */
static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
{
    /* horizontal factor is stored first, then the vertical one */
    (*h) = (av_pix_fmt_descriptors + format)->log2_chroma_w;
    (*v) = (av_pix_fmt_descriptors + format)->log2_chroma_h;
}
/**
 * Store colorspace parameters in the context and rebuild the YUV->RGB tables.
 *
 * The tables, ranges and brightness/contrast/saturation values are always
 * copied into the context, even when the function then returns -1.
 *
 * @param c          context to update
 * @param inv_table  four coefficients copied to c->srcColorspaceTable
 * @param srcRange   stored as c->srcRange
 * @param table      four coefficients copied to c->dstColorspaceTable
 * @param dstRange   stored as c->dstRange
 * @param brightness stored as c->brightness
 * @param contrast   stored as c->contrast
 * @param saturation stored as c->saturation
 * @return -1 if the destination format is YUV or gray (no tables are built
 *         in that case), 0 otherwise
 */
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
                             int srcRange, const int table[4], int dstRange,
                             int brightness, int contrast, int saturation)
{
    memcpy(c->srcColorspaceTable, inv_table, sizeof(int) * 4);
    memcpy(c->dstColorspaceTable, table, sizeof(int) * 4);

    c->brightness = brightness;
    c->contrast   = contrast;
    c->saturation = saturation;
    c->srcRange   = srcRange;
    c->dstRange   = dstRange;
    if (isYUV(c->dstFormat) || isGray(c->dstFormat))
        return -1;

    c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->dstFormat]);
    c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->srcFormat]);

    ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness,
                             contrast, saturation);
    // FIXME factorize

    if (HAVE_ALTIVEC && av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)
        ff_yuv2rgb_init_tables_altivec(c, inv_table, brightness,
                                       contrast, saturation);
    return 0;
}
/**
 * Read back the colorspace parameters stored in the context.
 *
 * The table pointers handed out refer to the arrays inside the context;
 * nothing is copied.
 *
 * @param c               context to query
 * @param[out] inv_table  receives a pointer to c->srcColorspaceTable
 * @param[out] srcRange   receives c->srcRange
 * @param[out] table      receives a pointer to c->dstColorspaceTable
 * @param[out] dstRange   receives c->dstRange
 * @param[out] brightness receives c->brightness
 * @param[out] contrast   receives c->contrast
 * @param[out] saturation receives c->saturation
 * @return -1 (outputs untouched) if the destination format is YUV or gray,
 *         0 otherwise
 */
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
                             int *srcRange, int **table, int *dstRange,
                             int *brightness, int *contrast, int *saturation)
{
    if (!isYUV(c->dstFormat) && !isGray(c->dstFormat)) {
        *inv_table  = c->srcColorspaceTable;
        *table      = c->dstColorspaceTable;
        *srcRange   = c->srcRange;
        *dstRange   = c->dstRange;
        *brightness = c->brightness;
        *contrast   = c->contrast;
        *saturation = c->saturation;

        return 0;
    }

    return -1;
}
static int handle_jpeg(enum PixelFormat *format)
{
switch (*format) {
case PIX_FMT_YUVJ420P:
*format = PIX_FMT_YUV420P;
return 1;
case PIX_FMT_YUVJ422P:
*format = PIX_FMT_YUV422P;
return 1;
case PIX_FMT_YUVJ444P:
*format = PIX_FMT_YUV444P;
return 1;
case PIX_FMT_YUVJ440P:
*format = PIX_FMT_YUV440P;
return 1;
default:
return 0;
}
}
SwsContext *sws_alloc_context(void)
{
SwsContext *c = av_mallocz(sizeof(SwsContext));
Michael Niedermayer
committed
c->av_class = &sws_context_class;
av_opt_set_defaults(c);
Michael Niedermayer
committed
return c;
}
av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
SwsFilter *dstFilter)
int i;
int usesVFilter, usesHFilter;
int unscaled;
SwsFilter dummyFilter = { NULL, NULL, NULL, NULL };
int srcW = c->srcW;
int srcH = c->srcH;
int dstW = c->dstW;
int dstH = c->dstH;
int dst_stride = FFALIGN(dstW * sizeof(int16_t) + 16, 16);
int dst_stride_px = dst_stride >> 1;
int flags, cpu_flags;
enum PixelFormat srcFormat = c->srcFormat;
enum PixelFormat dstFormat = c->dstFormat;
Michael Niedermayer
committed
cpu_flags = av_get_cpu_flags();
flags = c->flags;
if (!rgb15to16)
sws_rgb2rgb_init();
unscaled = (srcW == dstW && srcH == dstH);
if (!sws_isSupportedInput(srcFormat)) {
av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n",
sws_format_name(srcFormat));
Michael Niedermayer
committed
return AVERROR(EINVAL);
}
if (!sws_isSupportedOutput(dstFormat)) {
av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n",
sws_format_name(dstFormat));
Michael Niedermayer
committed
return AVERROR(EINVAL);
}
i = flags & (SWS_POINT |
SWS_AREA |
SWS_BILINEAR |
SWS_FAST_BILINEAR |
SWS_BICUBIC |
SWS_X |
SWS_GAUSS |
SWS_LANCZOS |
SWS_SINC |
SWS_SPLINE |
SWS_BICUBLIN);
if (!i || (i & (i - 1))) {
av_log(c, AV_LOG_ERROR,
"Exactly one scaler algorithm must be chosen\n");
Michael Niedermayer
committed
return AVERROR(EINVAL);
}
/* sanity check */
if (srcW < 4 || srcH < 1 || dstW < 8 || dstH < 1) {
/* FIXME check if these are enough and try to lower them after
* fixing the relevant parts of the code */
av_log(c, AV_LOG_ERROR, "%dx%d -> %dx%d is invalid scaling dimension\n",
srcW, srcH, dstW, dstH);
Michael Niedermayer
committed
return AVERROR(EINVAL);
}
if (!dstFilter)
dstFilter = &dummyFilter;
if (!srcFilter)
srcFilter = &dummyFilter;
c->lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
c->lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[dstFormat]);
c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[srcFormat]);
c->vRounder = 4 * 0x0001000100010001ULL;
usesVFilter = (srcFilter->lumV && srcFilter->lumV->length > 1) ||
(srcFilter->chrV && srcFilter->chrV->length > 1) ||
(dstFilter->lumV && dstFilter->lumV->length > 1) ||
(dstFilter->chrV && dstFilter->chrV->length > 1);
usesHFilter = (srcFilter->lumH && srcFilter->lumH->length > 1) ||
(srcFilter->chrH && srcFilter->chrH->length > 1) ||
(dstFilter->lumH && dstFilter->lumH->length > 1) ||
(dstFilter->chrH && dstFilter->chrH->length > 1);
getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
/* reuse chroma for 2 pixels RGB/BGR unless user wants full
* chroma interpolation */
if (flags & SWS_FULL_CHR_H_INT &&
isAnyRGB(dstFormat) &&
dstFormat != PIX_FMT_RGBA &&
dstFormat != PIX_FMT_ARGB &&
dstFormat != PIX_FMT_BGRA &&
dstFormat != PIX_FMT_ABGR &&
dstFormat != PIX_FMT_RGB24 &&
dstFormat != PIX_FMT_BGR24) {
av_log(c, AV_LOG_ERROR,
"full chroma interpolation for destination format '%s' not yet implemented\n",
sws_format_name(dstFormat));
flags &= ~SWS_FULL_CHR_H_INT;
c->flags = flags;
}
if (isAnyRGB(dstFormat) && !(flags & SWS_FULL_CHR_H_INT))
c->chrDstHSubSample = 1;
// drop some chroma lines if the user wants it
c->vChrDrop = (flags & SWS_SRC_V_CHR_DROP_MASK) >>
SWS_SRC_V_CHR_DROP_SHIFT;
c->chrSrcVSubSample += c->vChrDrop;
/* drop every other pixel for chroma calculation unless user
* wants full chroma */
if (isAnyRGB(srcFormat) && !(flags & SWS_FULL_CHR_H_INP) &&
srcFormat != PIX_FMT_RGB8 && srcFormat != PIX_FMT_BGR8 &&
srcFormat != PIX_FMT_RGB4 && srcFormat != PIX_FMT_BGR4 &&
srcFormat != PIX_FMT_RGB4_BYTE && srcFormat != PIX_FMT_BGR4_BYTE &&
((dstW >> c->chrDstHSubSample) <= (srcW >> 1) ||
(flags & SWS_FAST_BILINEAR)))
c->chrSrcHSubSample = 1;
// Note the -((-x)>>y) is so that we always round toward +inf.
c->chrSrcW = -((-srcW) >> c->chrSrcHSubSample);
c->chrSrcH = -((-srcH) >> c->chrSrcVSubSample);
c->chrDstW = -((-dstW) >> c->chrDstHSubSample);
c->chrDstH = -((-dstH) >> c->chrDstVSubSample);
/* unscaled special cases */
if (unscaled && !usesHFilter && !usesVFilter &&
(c->srcRange == c->dstRange || isAnyRGB(dstFormat))) {
ff_get_unscaled_swscale(c);
if (c->swScale) {
if (flags & SWS_PRINT_INFO)
av_log(c, AV_LOG_INFO,
"using unscaled %s -> %s special converter\n",
sws_format_name(srcFormat), sws_format_name(dstFormat));
Michael Niedermayer
committed
return 0;
}
}
c->srcBpc = 1 + av_pix_fmt_descriptors[srcFormat].comp[0].depth_minus1;
if (c->srcBpc < 8)
c->srcBpc = 8;
c->dstBpc = 1 + av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1;
if (c->dstBpc < 8)
c->dstBpc = 8;
if (c->dstBpc == 16)
FF_ALLOC_OR_GOTO(c, c->formatConvBuffer,
(FFALIGN(srcW, 16) * 2 * FFALIGN(c->srcBpc, 8) >> 3) + 16,
if (HAVE_MMXEXT && HAVE_INLINE_ASM && cpu_flags & AV_CPU_FLAG_MMXEXT &&
c->srcBpc == 8 && c->dstBpc <= 10) {
c->canMMX2BeUsed = (dstW >= srcW && (dstW & 31) == 0 &&
(srcW & 15) == 0) ? 1 : 0;
if (!c->canMMX2BeUsed && dstW >= srcW && (srcW & 15) == 0
&& (flags & SWS_FAST_BILINEAR)) {
if (flags & SWS_PRINT_INFO)
av_log(c, AV_LOG_INFO,
"output width is not a multiple of 32 -> no MMX2 scaler\n");
}
if (usesHFilter)
c->canMMX2BeUsed = 0;
} else
c->canMMX2BeUsed = 0;
c->chrXInc = (((int64_t)c->chrSrcW << 16) + (c->chrDstW >> 1)) / c->chrDstW;
c->chrYInc = (((int64_t)c->chrSrcH << 16) + (c->chrDstH >> 1)) / c->chrDstH;
/* Match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src
* to pixel n-2 of dst, but only for the FAST_BILINEAR mode otherwise do
* correct scaling.
* n-2 is the last chrominance sample available.
* This is not perfect, but no one should notice the difference, the more
* correct variant would be like the vertical one, but that would require
* some special code for the first and last pixel */