Newer
Older
vec_st(svB##src##i, i * stride + 16, src)
PACK_AND_STORE(src, 0);
PACK_AND_STORE(src, 1);
PACK_AND_STORE(src, 2);
PACK_AND_STORE(src, 3);
PACK_AND_STORE(src, 4);
PACK_AND_STORE(src, 5);
PACK_AND_STORE(src, 6);
PACK_AND_STORE(src, 7);
PACK_AND_STORE(tempBlurred, 0);
PACK_AND_STORE(tempBlurred, 1);
PACK_AND_STORE(tempBlurred, 2);
PACK_AND_STORE(tempBlurred, 3);
PACK_AND_STORE(tempBlurred, 4);
PACK_AND_STORE(tempBlurred, 5);
PACK_AND_STORE(tempBlurred, 6);
PACK_AND_STORE(tempBlurred, 7);
#undef PACK_AND_STORE
}
static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
const vector unsigned char zero = vec_splat_u8(0);
#define LOAD_DOUBLE_LINE(i, j) \
vector unsigned char perm1##i = vec_lvsl(i * stride, src); \
vector unsigned char perm2##i = vec_lvsl(j * stride, src); \
vector unsigned char srcA##i = vec_ld(i * stride, src); \
vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
vector unsigned char srcC##i = vec_ld(j * stride, src); \
vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \
vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
LOAD_DOUBLE_LINE(0, 1);
LOAD_DOUBLE_LINE(2, 3);
LOAD_DOUBLE_LINE(4, 5);
LOAD_DOUBLE_LINE(6, 7);
vector unsigned char tempA = vec_mergeh(src0, zero);
vector unsigned char tempB = vec_mergel(src0, zero);
vector unsigned char tempC = vec_mergeh(src1, zero);
vector unsigned char tempD = vec_mergel(src1, zero);
vector unsigned char tempE = vec_mergeh(src2, zero);
vector unsigned char tempF = vec_mergel(src2, zero);
vector unsigned char tempG = vec_mergeh(src3, zero);
vector unsigned char tempH = vec_mergel(src3, zero);
vector unsigned char tempI = vec_mergeh(src4, zero);
vector unsigned char tempJ = vec_mergel(src4, zero);
vector unsigned char tempK = vec_mergeh(src5, zero);
vector unsigned char tempL = vec_mergel(src5, zero);
vector unsigned char tempM = vec_mergeh(src6, zero);
vector unsigned char tempN = vec_mergel(src6, zero);
vector unsigned char tempO = vec_mergeh(src7, zero);
vector unsigned char tempP = vec_mergel(src7, zero);
vector unsigned char temp0 = vec_mergeh(tempA, tempI);
vector unsigned char temp1 = vec_mergel(tempA, tempI);
vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
vector unsigned char temp3 = vec_mergel(tempB, tempJ);
vector unsigned char temp4 = vec_mergeh(tempC, tempK);
vector unsigned char temp5 = vec_mergel(tempC, tempK);
vector unsigned char temp6 = vec_mergeh(tempD, tempL);
vector unsigned char temp7 = vec_mergel(tempD, tempL);
vector unsigned char temp8 = vec_mergeh(tempE, tempM);
vector unsigned char temp9 = vec_mergel(tempE, tempM);
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
vector unsigned char temp10 = vec_mergeh(tempF, tempN);
vector unsigned char temp11 = vec_mergel(tempF, tempN);
vector unsigned char temp12 = vec_mergeh(tempG, tempO);
vector unsigned char temp13 = vec_mergel(tempG, tempO);
vector unsigned char temp14 = vec_mergeh(tempH, tempP);
vector unsigned char temp15 = vec_mergel(tempH, tempP);
tempA = vec_mergeh(temp0, temp8);
tempB = vec_mergel(temp0, temp8);
tempC = vec_mergeh(temp1, temp9);
tempD = vec_mergel(temp1, temp9);
tempE = vec_mergeh(temp2, temp10);
tempF = vec_mergel(temp2, temp10);
tempG = vec_mergeh(temp3, temp11);
tempH = vec_mergel(temp3, temp11);
tempI = vec_mergeh(temp4, temp12);
tempJ = vec_mergel(temp4, temp12);
tempK = vec_mergeh(temp5, temp13);
tempL = vec_mergel(temp5, temp13);
tempM = vec_mergeh(temp6, temp14);
tempN = vec_mergel(temp6, temp14);
tempO = vec_mergeh(temp7, temp15);
tempP = vec_mergel(temp7, temp15);
temp0 = vec_mergeh(tempA, tempI);
temp1 = vec_mergel(tempA, tempI);
temp2 = vec_mergeh(tempB, tempJ);
temp3 = vec_mergel(tempB, tempJ);
temp4 = vec_mergeh(tempC, tempK);
temp5 = vec_mergel(tempC, tempK);
temp6 = vec_mergeh(tempD, tempL);
temp7 = vec_mergel(tempD, tempL);
temp8 = vec_mergeh(tempE, tempM);
temp9 = vec_mergel(tempE, tempM);
temp10 = vec_mergeh(tempF, tempN);
temp11 = vec_mergel(tempF, tempN);
temp12 = vec_mergeh(tempG, tempO);
temp13 = vec_mergel(tempG, tempO);
temp14 = vec_mergeh(tempH, tempP);
temp15 = vec_mergel(tempH, tempP);
vec_st(temp0, 0, dst);
vec_st(temp1, 16, dst);
vec_st(temp2, 32, dst);
vec_st(temp3, 48, dst);
vec_st(temp4, 64, dst);
vec_st(temp5, 80, dst);
vec_st(temp6, 96, dst);
vec_st(temp7, 112, dst);
vec_st(temp8, 128, dst);
vec_st(temp9, 144, dst);
vec_st(temp10, 160, dst);
vec_st(temp11, 176, dst);
vec_st(temp12, 192, dst);
vec_st(temp13, 208, dst);
vec_st(temp14, 224, dst);
vec_st(temp15, 240, dst);
}
static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
const vector unsigned char zero = vec_splat_u8(0);
#define LOAD_DOUBLE_LINE(i, j) \
vector unsigned char src##i = vec_ld(i * 16, src); \
vector unsigned char src##j = vec_ld(j * 16, src)
LOAD_DOUBLE_LINE(0, 1);
LOAD_DOUBLE_LINE(2, 3);
LOAD_DOUBLE_LINE(4, 5);
LOAD_DOUBLE_LINE(6, 7);
LOAD_DOUBLE_LINE(8, 9);
LOAD_DOUBLE_LINE(10, 11);
LOAD_DOUBLE_LINE(12, 13);
LOAD_DOUBLE_LINE(14, 15);
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
vector unsigned char tempA = vec_mergeh(src0, src8);
vector unsigned char tempB;
vector unsigned char tempC = vec_mergeh(src1, src9);
vector unsigned char tempD;
vector unsigned char tempE = vec_mergeh(src2, src10);
vector unsigned char tempG = vec_mergeh(src3, src11);
vector unsigned char tempI = vec_mergeh(src4, src12);
vector unsigned char tempJ;
vector unsigned char tempK = vec_mergeh(src5, src13);
vector unsigned char tempL;
vector unsigned char tempM = vec_mergeh(src6, src14);
vector unsigned char tempO = vec_mergeh(src7, src15);
vector unsigned char temp0 = vec_mergeh(tempA, tempI);
vector unsigned char temp1 = vec_mergel(tempA, tempI);
vector unsigned char temp2;
vector unsigned char temp3;
vector unsigned char temp4 = vec_mergeh(tempC, tempK);
vector unsigned char temp5 = vec_mergel(tempC, tempK);
vector unsigned char temp6;
vector unsigned char temp7;
vector unsigned char temp8 = vec_mergeh(tempE, tempM);
vector unsigned char temp9 = vec_mergel(tempE, tempM);
vector unsigned char temp12 = vec_mergeh(tempG, tempO);
vector unsigned char temp13 = vec_mergel(tempG, tempO);
tempA = vec_mergeh(temp0, temp8);
tempB = vec_mergel(temp0, temp8);
tempC = vec_mergeh(temp1, temp9);
tempD = vec_mergel(temp1, temp9);
tempI = vec_mergeh(temp4, temp12);
tempJ = vec_mergel(temp4, temp12);
tempK = vec_mergeh(temp5, temp13);
tempL = vec_mergel(temp5, temp13);
temp0 = vec_mergeh(tempA, tempI);
temp1 = vec_mergel(tempA, tempI);
temp2 = vec_mergeh(tempB, tempJ);
temp3 = vec_mergel(tempB, tempJ);
temp4 = vec_mergeh(tempC, tempK);
temp5 = vec_mergel(tempC, tempK);
temp6 = vec_mergeh(tempD, tempL);
temp7 = vec_mergel(tempD, tempL);
const vector signed char neg1 = vec_splat_s8(-1);
#define STORE_DOUBLE_LINE(i, j) \
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
vector unsigned char dstA##i = vec_ld(i * stride, dst); \
vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \
vector unsigned char dstA##j = vec_ld(j * stride, dst); \
vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst); \
vector unsigned char align##i = vec_lvsr(i * stride, dst); \
vector unsigned char align##j = vec_lvsr(j * stride, dst); \
vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \
vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \
vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);\
vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);\
vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \
vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \
vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \
vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \
vec_st(dstAF##i, i * stride, dst); \
vec_st(dstBF##i, i * stride + 16, dst); \
vec_st(dstAF##j, j * stride, dst); \
vec_st(dstBF##j, j * stride + 16, dst)
STORE_DOUBLE_LINE(0,1);
STORE_DOUBLE_LINE(2,3);
STORE_DOUBLE_LINE(4,5);
STORE_DOUBLE_LINE(6,7);