vpx/convolve/filtering.c

436 lines
12 KiB
C

#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <tmmintrin.h>
#define RAND_SEED (0xbeef)
unsigned int seed = RAND_SEED;
static inline unsigned int readtsc(void) {
unsigned int tsc;
__asm__ __volatile__("rdtsc\n\t":"=a"(tsc):);
return tsc;
}
int round_power_of_two(int x, int n) {
int ret = (x + (1 << (n - 1))) >> n;
return ret;
}
uint8_t inline clip_pixel(int x) {
uint8_t ret = x;
if (x < 0) {
ret = 0;
}
if (x > 255) {
ret = 255;
}
return ret;
}
static int filtering(const uint8_t *src, const int16_t *filter, int flen) {
int k;
int sum = 0;
int prod;
for (k = 0; k < flen; ++k) {
prod = src[k] * filter[k];
sum += prod;
}
return sum;
}
void convolve(const uint8_t *src, int w, const int16_t *filter, int flen,
int *buffer) {
int i;
int sum;
for (i = 0; i < w; ++i) {
sum = filtering(src, filter, flen);
buffer[i] = sum;
src += 1;
}
}
void init_state(int *buf, uint8_t *pixel, int w, int block) {
int i;
memset(buf, 0, sizeof(buf[0]) * block);
memset(pixel, 0, sizeof(pixel[0]) * block);
seed = RAND_SEED;
for (i = 0; i < w; ++i) {
pixel[i] = clip_pixel(rand_r(&seed) % RAND_SEED);
}
}
void check_buffer(const int *buf1, const int *buf2, int width) {
int i;
for (i = 0; i < width; ++i) {
if (buf1[i] != buf2[i]) {
printf("Not bit-exact on index %d\n", i);
printf("Expected: %d, Actual: %d\n", buf1[i], buf2[i]);
return;
}
}
}
static const int16_t filter12[12] = {
-1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1};
static const int16_t filter10[10] = {
1, -3, 7, -17, 119, 28, -11, 5, -2, 1};
// SSE4.1
const int16_t pfilter12[5][16] __attribute__ ((aligned(16))) = {
{-1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0, 0, 0},
{ 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0, 0},
{ 0, 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0},
{ 0, 0, 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0},
{ 0, 0, 0, 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1},
};
const int16_t pfilter12_2[3][16] __attribute__ ((aligned(16))) = {
{-1, 3, -1, 3, -1, 3, -1, 3, -4, 8, -4, 8, -4, 8, -4, 8},
{-18, 120, -18, 120, -18, 120, -18, 120, 28, -12, 28, -12, 28, -12, 28, -12},
{ 7, -4, 7, -4, 7, -4, 7, -4, 2, -1, 2, -1, 2, -1, 2, -1},
};
const int16_t pfilter10[7][16] __attribute__ ((aligned(16))) = {
{1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0, 0},
{0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0},
{0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
{0, 0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0},
{0, 0, 0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0},
{0, 0, 0, 0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1},
};
// Note:
// We can do this:
// typedef const int16_t filter_array[16];
//
// struct Filter {
// filter_array *coeffs;
// };
struct Filter {
const int16_t (*coeffs)[16];
int tapsNum;
int signalSpan;
};
const struct Filter pfilter_12tap = {
pfilter12, 12, 5
};
const struct Filter pfilter_12tap_2 = {
pfilter12_2, 12, 5
};
const struct Filter pfilter_10tap = {
pfilter10, 10, 7
};
static inline __m128i multiply_add(const __m128i ps0, const __m128i ps1,
const __m128i fl0, const __m128i fl1) {
__m128i sum;
__m128i sum0 = _mm_madd_epi16(ps0, fl0);
__m128i sum1 = _mm_madd_epi16(ps1, fl1);
sum = _mm_hadd_epi32(sum0, sum1);
sum = _mm_hadd_epi32(sum, sum);
return _mm_hadd_epi32(sum, sum);
}
// Note:
// Instead of, const int16_t (*filter)[16],
// we can also, const int16_t filter[][16]
// Note:
// This function memory access is 4 (12-tap filter) or 6 (10-tap filter) bytes
// more than C version
void inline convolve_w4_sse4_1(const uint8_t *src, const __m128i *f,
int *buffer) {
__m128i u0, u1, x, s[4];
__m128i pixel, pixello, pixelhi;
const __m128i zero = _mm_setzero_si128();
pixel = _mm_loadu_si128((__m128i const *)src);
pixello = _mm_unpacklo_epi8(pixel, zero);
pixelhi = _mm_unpackhi_epi8(pixel, zero);
s[0] = multiply_add(pixello, pixelhi, f[0], f[1]);
s[1] = multiply_add(pixello, pixelhi, f[2], f[3]);
s[2] = multiply_add(pixello, pixelhi, f[4], f[5]);
s[3] = multiply_add(pixello, pixelhi, f[6], f[7]);
u0 = _mm_unpacklo_epi32(s[0], s[1]);
u1 = _mm_unpacklo_epi32(s[2], s[3]);
x = _mm_unpacklo_epi64(u0, u1);
_mm_storeu_si128((__m128i *)buffer, x);
}
void convolve_w8_sse4_1(const uint8_t *src, const __m128i *f, int *buffer) {
convolve_w4_sse4_1(src, f, buffer);
src += 4;
buffer += 4;
convolve_w4_sse4_1(src, f, buffer);
}
void convolve_w16_sse4_1(const uint8_t *src, const __m128i *f, int *buffer) {
convolve_w8_sse4_1(src, f, buffer);
src += 8;
buffer += 8;
convolve_w8_sse4_1(src, f, buffer);
}
void convolve_w32_sse4_1(const uint8_t *src, const __m128i *f, int *buffer) {
convolve_w16_sse4_1(src, f, buffer);
src += 16;
buffer += 16;
convolve_w16_sse4_1(src, f, buffer);
}
void convolve_w64_sse4_1(const uint8_t *src, const __m128i *f, int *buffer) {
convolve_w32_sse4_1(src, f, buffer);
src += 32;
buffer += 32;
convolve_w32_sse4_1(src, f, buffer);
}
void (*convolveTab[5])(const uint8_t *, const __m128i *, int *) = {
convolve_w4_sse4_1,
convolve_w8_sse4_1,
convolve_w16_sse4_1,
convolve_w32_sse4_1,
convolve_w64_sse4_1,
};
void convolve_sse4_1(const uint8_t *src, const struct Filter fData,
int width, int *buffer) {
const int16_t *filter = (const int16_t *) fData.coeffs;
__m128i f[10];
f[0] = *((__m128i *)(filter + 0 * 8));
f[1] = *((__m128i *)(filter + 1 * 8));
f[2] = *((__m128i *)(filter + 2 * 8));
f[3] = *((__m128i *)(filter + 3 * 8));
f[4] = *((__m128i *)(filter + 4 * 8));
f[5] = *((__m128i *)(filter + 5 * 8));
f[6] = *((__m128i *)(filter + 6 * 8));
f[7] = *((__m128i *)(filter + 7 * 8));
f[8] = *((__m128i *)(filter + 8 * 8));
f[9] = *((__m128i *)(filter + 9 * 8));
switch (width) {
case 4:
convolveTab[0](src, f, buffer);
break;
case 8:
convolveTab[1](src, f, buffer);
break;
case 16:
convolveTab[2](src, f, buffer);
break;
case 32:
convolveTab[3](src, f, buffer);
break;
case 64:
convolveTab[4](src, f, buffer);
break;
default:
assert(0);
}
}
// Solution 2
void inline convolve2_w4_sse4_1(const uint8_t *src, const __m128i *f,
const __m128i *cm, int *buffer) {
__m128i pixel;
__m128i s0, s1, s2, s3, s4, s5;
const __m128i zero = _mm_setzero_si128();
pixel = _mm_loadu_si128((__m128i const *)src);
__m128i y0 = _mm_shuffle_epi8(pixel, cm[0]);
__m128i y1 = _mm_shuffle_epi8(pixel, cm[1]);
__m128i y2 = _mm_shuffle_epi8(pixel, cm[2]);
__m128i y3 = _mm_shuffle_epi8(pixel, cm[3]);
__m128i y4 = _mm_shuffle_epi8(pixel, cm[4]);
__m128i y5 = _mm_shuffle_epi8(pixel, cm[5]);
y0 = _mm_unpacklo_epi8(y0, zero);
y1 = _mm_unpacklo_epi8(y1, zero);
y2 = _mm_unpacklo_epi8(y2, zero);
y3 = _mm_unpacklo_epi8(y3, zero);
y4 = _mm_unpacklo_epi8(y4, zero);
y5 = _mm_unpacklo_epi8(y5, zero);
s0 = _mm_madd_epi16(y0, f[0]);
s1 = _mm_madd_epi16(y1, f[1]);
s2 = _mm_madd_epi16(y2, f[2]);
s3 = _mm_madd_epi16(y3, f[3]);
s4 = _mm_madd_epi16(y4, f[4]);
s5 = _mm_madd_epi16(y5, f[5]);
s0 = _mm_add_epi32(s0, s1);
s0 = _mm_add_epi32(s0, s2);
s0 = _mm_add_epi32(s0, s3);
s0 = _mm_add_epi32(s0, s4);
s0 = _mm_add_epi32(s0, s5);
_mm_storeu_si128((__m128i *)buffer, s0);
}
void convolve2_w8_sse4_1(const uint8_t *src, const __m128i *f,
const __m128i *cm, int *buffer) {
convolve2_w4_sse4_1(src, f, cm, buffer);
src += 4;
buffer += 4;
convolve2_w4_sse4_1(src, f, cm, buffer);
}
void convolve2_w16_sse4_1(const uint8_t *src, const __m128i *f,
const __m128i *cm, int *buffer) {
convolve2_w8_sse4_1(src, f, cm, buffer);
src += 8;
buffer += 8;
convolve2_w8_sse4_1(src, f, cm, buffer);
}
void convolve2_w32_sse4_1(const uint8_t *src, const __m128i *f,
const __m128i *cm, int *buffer) {
convolve2_w16_sse4_1(src, f, cm, buffer);
src += 16;
buffer += 16;
convolve2_w16_sse4_1(src, f, cm, buffer);
}
void convolve2_w64_sse4_1(const uint8_t *src, const __m128i *f,
const __m128i *cm, int *buffer) {
convolve2_w32_sse4_1(src, f, cm, buffer);
src += 32;
buffer += 32;
convolve2_w32_sse4_1(src, f, cm, buffer);
}
void (*convolveTab2[5])(const uint8_t *, const __m128i *, const __m128i *,
int *) = {
convolve2_w4_sse4_1,
convolve2_w8_sse4_1,
convolve2_w16_sse4_1,
convolve2_w32_sse4_1,
convolve2_w64_sse4_1,
};
void convolve2_sse4_1(const uint8_t *src, const struct Filter fData,
int width, int *buffer) {
__m128i f[6];
__m128i cm[6];
const int16_t *filter = (const int16_t *) fData.coeffs;
f[0] = *((__m128i *)(filter + 0 * 8));
f[1] = *((__m128i *)(filter + 1 * 8));
f[2] = *((__m128i *)(filter + 2 * 8));
f[3] = *((__m128i *)(filter + 3 * 8));
f[4] = *((__m128i *)(filter + 4 * 8));
f[5] = *((__m128i *)(filter + 5 * 8));
cm[0] = _mm_setr_epi16(0x100, 0x201, 0x302, 0x403, 0, 0, 0, 0);
cm[1] = _mm_setr_epi16(0x302, 0x403, 0x504, 0x605, 0, 0, 0, 0);
cm[2] = _mm_setr_epi16(0x504, 0x605, 0x706, 0x807, 0, 0, 0, 0);
cm[3] = _mm_setr_epi16(0x706, 0x807, 0x908, 0xa09, 0, 0, 0, 0);
cm[4] = _mm_setr_epi16(0x908, 0xa09, 0xb0a, 0xc0b, 0, 0, 0, 0);
cm[5] = _mm_setr_epi16(0xb0a, 0xc0b, 0xd0c, 0xe0d, 0, 0, 0, 0);
switch (width) {
case 4:
convolveTab2[0](src, f, cm, buffer);
break;
case 8:
convolveTab2[1](src, f, cm, buffer);
break;
case 16:
convolveTab2[2](src, f, cm, buffer);
break;
case 32:
convolveTab2[3](src, f, cm, buffer);
break;
case 64:
convolveTab2[4](src, f, cm, buffer);
break;
default:
assert(0);
}
}
#define TEST_NUM (32)
int main(int argc, char **argv)
{
const size_t block_size = 256;
if (argc != 2) {
printf("Usage: filtering <width>, where width = 4, 8, 16, 32, 64\n");
return -1;
}
const int width = atoi(argv[1]);
int *buffer = (int *) malloc(3 * sizeof(buffer[0]) * block_size);
uint8_t *pixel = (uint8_t *) malloc(3 * sizeof(pixel[0]) * block_size);
uint8_t *ppixel = pixel + block_size;
int *pbuffer = buffer + block_size;
uint8_t *ppixel2 = pixel + 2 * block_size;
int *pbuffer2 = buffer + 2 * block_size;
uint32_t start, end;
int count;
init_state(buffer, pixel, width, block_size);
init_state(pbuffer, ppixel, width, block_size);
init_state(pbuffer2, ppixel2, width, block_size);
count = 0;
start = readtsc();
do {
convolve(pixel, width, filter12, 12, buffer);
count++;
} while (count < TEST_NUM);
end = readtsc();
printf("C version cycles: %d\n", end - start);
// Solution 1
count = 0;
start = readtsc();
do {
convolve_sse4_1(ppixel, pfilter_12tap, width, pbuffer);
count++;
} while (count < TEST_NUM);
end = readtsc();
printf("SIMD version cycles: %d\n", end - start);
check_buffer(buffer, pbuffer, width);
// Solution 2
count = 0;
start = readtsc();
do {
convolve2_sse4_1(ppixel2, pfilter_12tap_2, width, pbuffer2);
count++;
} while (count < TEST_NUM);
end = readtsc();
printf("SIMD version 2 cycles: %d\n", end - start);
check_buffer(buffer, pbuffer2, width);
free(buffer);
free(pixel);
return 0;
}