Replace copy_memNxM functions with a generic copy/avg function.
Change-Id: I3ce849452ed4f08527de9565a9914d5ee36170aa
This commit is contained in:
parent
c13e0bcb52
commit
decead7336
@ -22,8 +22,8 @@ extern "C" {
|
||||
}
|
||||
|
||||
namespace {
|
||||
typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int filter_x_stride,
|
||||
const int16_t *filter_y, int filter_y_stride,
|
||||
int w, int h);
|
||||
|
@ -38,8 +38,8 @@
|
||||
*/
|
||||
#define ALIGN_FILTERS_256 1
|
||||
|
||||
static void convolve_horiz_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x0, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h, int taps) {
|
||||
@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x0, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h, int taps) {
|
||||
@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void convolve_vert_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y0, int y_step_q4,
|
||||
int w, int h, int taps) {
|
||||
@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y0, int y_step_q4,
|
||||
int w, int h, int taps) {
|
||||
@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void convolve_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h, int taps) {
|
||||
@ -237,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride,
|
||||
w, h, taps);
|
||||
}
|
||||
|
||||
static void convolve_avg_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h, int taps) {
|
||||
@ -267,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride,
|
||||
w, h, taps);
|
||||
}
|
||||
|
||||
void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -277,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
|
||||
w, h, 8);
|
||||
}
|
||||
|
||||
void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -287,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
|
||||
w, h, 8);
|
||||
}
|
||||
|
||||
void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -297,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
|
||||
w, h, 8);
|
||||
}
|
||||
|
||||
void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -307,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
|
||||
w, h, 8);
|
||||
}
|
||||
|
||||
void vp9_convolve8_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -317,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride,
|
||||
w, h, 8);
|
||||
}
|
||||
|
||||
void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -339,33 +339,25 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
|
||||
w, h);
|
||||
}
|
||||
|
||||
void vp9_convolve_copy(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
const int16_t *filter_x, int filter_x_stride,
|
||||
const int16_t *filter_y, int filter_y_stride,
|
||||
int w, int h) {
|
||||
if (w == 16 && h == 16) {
|
||||
vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
|
||||
} else if (w == 8 && h == 8) {
|
||||
vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
|
||||
} else if (w == 8 && h == 4) {
|
||||
vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
|
||||
} else {
|
||||
int r;
|
||||
void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int filter_x_stride,
|
||||
const int16_t *filter_y, int filter_y_stride,
|
||||
int w, int h) {
|
||||
int r;
|
||||
|
||||
for (r = h; r > 0; --r) {
|
||||
memcpy(dst, src, w);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
for (r = h; r > 0; --r) {
|
||||
memcpy(dst, src, w);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_convolve_avg(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
const int16_t *filter_x, int filter_x_stride,
|
||||
const int16_t *filter_y, int filter_y_stride,
|
||||
int w, int h) {
|
||||
void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int filter_x_stride,
|
||||
const int16_t *filter_y, int filter_y_stride,
|
||||
int w, int h) {
|
||||
int x, y;
|
||||
|
||||
for (y = 0; y < h; ++y) {
|
||||
|
@ -13,26 +13,12 @@
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h);
|
||||
|
||||
// Not a convolution, a block copy conforming to the convolution prototype
|
||||
void vp9_convolve_copy(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h);
|
||||
|
||||
// Not a convolution, a block average conforming to the convolution prototype
|
||||
void vp9_convolve_avg(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h);
|
||||
|
||||
struct subpix_fn_table {
|
||||
const int16_t (*filter_x)[8];
|
||||
const int16_t (*filter_y)[8];
|
||||
|
@ -194,93 +194,6 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
|
||||
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
|
||||
}
|
||||
|
||||
void vp9_copy_mem16x16_c(const uint8_t *src,
|
||||
int src_stride,
|
||||
uint8_t *dst,
|
||||
int dst_stride) {
|
||||
int r;
|
||||
|
||||
for (r = 0; r < 16; r++) {
|
||||
#if !(CONFIG_FAST_UNALIGNED)
|
||||
dst[0] = src[0];
|
||||
dst[1] = src[1];
|
||||
dst[2] = src[2];
|
||||
dst[3] = src[3];
|
||||
dst[4] = src[4];
|
||||
dst[5] = src[5];
|
||||
dst[6] = src[6];
|
||||
dst[7] = src[7];
|
||||
dst[8] = src[8];
|
||||
dst[9] = src[9];
|
||||
dst[10] = src[10];
|
||||
dst[11] = src[11];
|
||||
dst[12] = src[12];
|
||||
dst[13] = src[13];
|
||||
dst[14] = src[14];
|
||||
dst[15] = src[15];
|
||||
|
||||
#else
|
||||
((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
|
||||
((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
|
||||
((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
|
||||
((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
|
||||
|
||||
#endif
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_copy_mem8x8_c(const uint8_t *src,
|
||||
int src_stride,
|
||||
uint8_t *dst,
|
||||
int dst_stride) {
|
||||
int r;
|
||||
|
||||
for (r = 0; r < 8; r++) {
|
||||
#if !(CONFIG_FAST_UNALIGNED)
|
||||
dst[0] = src[0];
|
||||
dst[1] = src[1];
|
||||
dst[2] = src[2];
|
||||
dst[3] = src[3];
|
||||
dst[4] = src[4];
|
||||
dst[5] = src[5];
|
||||
dst[6] = src[6];
|
||||
dst[7] = src[7];
|
||||
#else
|
||||
((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
|
||||
((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
|
||||
#endif
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_copy_mem8x4_c(const uint8_t *src,
|
||||
int src_stride,
|
||||
uint8_t *dst,
|
||||
int dst_stride) {
|
||||
int r;
|
||||
|
||||
for (r = 0; r < 4; r++) {
|
||||
#if !(CONFIG_FAST_UNALIGNED)
|
||||
dst[0] = src[0];
|
||||
dst[1] = src[1];
|
||||
dst[2] = src[2];
|
||||
dst[3] = src[3];
|
||||
dst[4] = src[4];
|
||||
dst[5] = src[5];
|
||||
dst[6] = src[6];
|
||||
dst[7] = src[7];
|
||||
#else
|
||||
((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
|
||||
((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
|
||||
#endif
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
const int_mv *src_mv,
|
||||
|
@ -43,17 +43,6 @@ specialize vp9_idct_add_32x32
|
||||
#
|
||||
# RECON
|
||||
#
|
||||
prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
|
||||
specialize vp9_copy_mem16x16 mmx sse2 dspr2
|
||||
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
|
||||
|
||||
prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
|
||||
specialize vp9_copy_mem8x8 mmx dspr2
|
||||
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
|
||||
|
||||
prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
|
||||
specialize vp9_copy_mem8x4 mmx
|
||||
|
||||
prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
|
||||
specialize vp9_d27_predictor_4x4
|
||||
|
||||
@ -275,22 +264,28 @@ specialize vp9_blend_b
|
||||
#
|
||||
# Sub Pixel Filters
|
||||
#
|
||||
prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve_copy sse2
|
||||
|
||||
prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve_avg sse2
|
||||
|
||||
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve8 ssse3
|
||||
|
||||
prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve8_horiz ssse3
|
||||
|
||||
prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve8_vert ssse3
|
||||
|
||||
prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve8_avg ssse3
|
||||
|
||||
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve8_avg_horiz ssse3
|
||||
|
||||
prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
|
||||
specialize vp9_convolve8_avg_vert ssse3
|
||||
|
||||
#
|
||||
|
@ -121,8 +121,8 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
|
||||
unsigned int output_height,
|
||||
const short *filter);
|
||||
|
||||
void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -159,8 +159,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -197,8 +197,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -235,8 +235,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -273,8 +273,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
@ -294,8 +294,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride,
|
||||
void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
|
||||
uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *filter_x, int x_step_q4,
|
||||
const int16_t *filter_y, int y_step_q4,
|
||||
int w, int h) {
|
||||
|
152
vp9/common/x86/vp9_copy_sse2.asm
Normal file
152
vp9/common/x86/vp9_copy_sse2.asm
Normal file
@ -0,0 +1,152 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
%include "third_party/x86inc/x86inc.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro convolve_fn 1
|
||||
INIT_XMM sse2
|
||||
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
|
||||
fx, fxs, fy, fys, w, h
|
||||
mov r4d, dword wm
|
||||
cmp r4d, 4
|
||||
je .w4
|
||||
cmp r4d, 8
|
||||
je .w8
|
||||
cmp r4d, 16
|
||||
je .w16
|
||||
cmp r4d, 32
|
||||
je .w32
|
||||
|
||||
mov r4d, dword hm
|
||||
.loop64:
|
||||
movu m0, [srcq]
|
||||
movu m1, [srcq+16]
|
||||
movu m2, [srcq+32]
|
||||
movu m3, [srcq+48]
|
||||
add srcq, src_strideq
|
||||
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
pavgb m1, [dstq+16]
|
||||
pavgb m2, [dstq+32]
|
||||
pavgb m3, [dstq+48]
|
||||
%endif
|
||||
mova [dstq ], m0
|
||||
mova [dstq+16], m1
|
||||
mova [dstq+32], m2
|
||||
mova [dstq+48], m3
|
||||
add dstq, dst_strideq
|
||||
dec r4d
|
||||
jnz .loop64
|
||||
RET
|
||||
|
||||
.w32:
|
||||
mov r4d, dword hm
|
||||
.loop32:
|
||||
movu m0, [srcq]
|
||||
movu m1, [srcq+16]
|
||||
movu m2, [srcq+src_strideq]
|
||||
movu m3, [srcq+src_strideq+16]
|
||||
lea srcq, [srcq+src_strideq*2]
|
||||
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
pavgb m1, [dstq +16]
|
||||
pavgb m2, [dstq+dst_strideq]
|
||||
pavgb m3, [dstq+dst_strideq+16]
|
||||
%endif
|
||||
mova [dstq ], m0
|
||||
mova [dstq +16], m1
|
||||
mova [dstq+dst_strideq ], m2
|
||||
mova [dstq+dst_strideq+16], m3
|
||||
lea dstq, [dstq+dst_strideq*2]
|
||||
sub r4d, 2
|
||||
jnz .loop32
|
||||
RET
|
||||
|
||||
.w16:
|
||||
mov r4d, dword hm
|
||||
lea r5q, [src_strideq*3]
|
||||
lea r6q, [dst_strideq*3]
|
||||
.loop16:
|
||||
movu m0, [srcq]
|
||||
movu m1, [srcq+src_strideq]
|
||||
movu m2, [srcq+src_strideq*2]
|
||||
movu m3, [srcq+r5q]
|
||||
lea srcq, [srcq+src_strideq*4]
|
||||
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
pavgb m1, [dstq+dst_strideq]
|
||||
pavgb m2, [dstq+dst_strideq*2]
|
||||
pavgb m3, [dstq+r6q]
|
||||
%endif
|
||||
mova [dstq ], m0
|
||||
mova [dstq+dst_strideq ], m1
|
||||
mova [dstq+dst_strideq*2], m2
|
||||
mova [dstq+r6q ], m3
|
||||
lea dstq, [dstq+dst_strideq*4]
|
||||
sub r4d, 4
|
||||
jnz .loop16
|
||||
RET
|
||||
|
||||
INIT_MMX sse
|
||||
.w8:
|
||||
mov r4d, dword hm
|
||||
lea r5q, [src_strideq*3]
|
||||
lea r6q, [dst_strideq*3]
|
||||
.loop8:
|
||||
movu m0, [srcq]
|
||||
movu m1, [srcq+src_strideq]
|
||||
movu m2, [srcq+src_strideq*2]
|
||||
movu m3, [srcq+r5q]
|
||||
lea srcq, [srcq+src_strideq*4]
|
||||
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
pavgb m1, [dstq+dst_strideq]
|
||||
pavgb m2, [dstq+dst_strideq*2]
|
||||
pavgb m3, [dstq+r6q]
|
||||
%endif
|
||||
mova [dstq ], m0
|
||||
mova [dstq+dst_strideq ], m1
|
||||
mova [dstq+dst_strideq*2], m2
|
||||
mova [dstq+r6q ], m3
|
||||
lea dstq, [dstq+dst_strideq*4]
|
||||
sub r4d, 4
|
||||
jnz .loop8
|
||||
RET
|
||||
|
||||
.w4:
|
||||
mov r4d, dword hm
|
||||
lea r5q, [src_strideq*3]
|
||||
lea r6q, [dst_strideq*3]
|
||||
.loop4:
|
||||
movh m0, [srcq]
|
||||
movh m1, [srcq+src_strideq]
|
||||
movh m2, [srcq+src_strideq*2]
|
||||
movh m3, [srcq+r5q]
|
||||
lea srcq, [srcq+src_strideq*4]
|
||||
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
pavgb m1, [dstq+dst_strideq]
|
||||
pavgb m2, [dstq+dst_strideq*2]
|
||||
pavgb m3, [dstq+r6q]
|
||||
%endif
|
||||
movh [dstq ], m0
|
||||
movh [dstq+dst_strideq ], m1
|
||||
movh [dstq+dst_strideq*2], m2
|
||||
movh [dstq+r6q ], m3
|
||||
lea dstq, [dstq+dst_strideq*4]
|
||||
sub r4d, 4
|
||||
jnz .loop4
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
convolve_fn copy
|
||||
convolve_fn avg
|
@ -1,272 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
;void copy_mem8x8_mmx(
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; )
|
||||
global sym(vp9_copy_mem8x8_mmx) PRIVATE
|
||||
sym(vp9_copy_mem8x8_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;src;
|
||||
movq mm0, [rsi]
|
||||
|
||||
movsxd rax, dword ptr arg(1) ;src_stride;
|
||||
mov rdi, arg(2) ;dst;
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm2, [rsi+rax*2]
|
||||
|
||||
movsxd rcx, dword ptr arg(3) ;dst_stride
|
||||
lea rsi, [rsi+rax*2]
|
||||
|
||||
movq [rdi], mm0
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx*2], mm2
|
||||
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
movq mm3, [rsi]
|
||||
|
||||
add rdi, rcx
|
||||
movq mm4, [rsi+rax]
|
||||
|
||||
movq mm5, [rsi+rax*2]
|
||||
movq [rdi], mm3
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
movq [rdi+rcx], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm5
|
||||
lea rdi, [rdi+rcx*2]
|
||||
|
||||
movq mm0, [rsi+rax]
|
||||
movq mm1, [rsi+rax*2]
|
||||
|
||||
movq [rdi+rcx], mm0
|
||||
movq [rdi+rcx*2],mm1
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void copy_mem8x4_mmx(
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; )
|
||||
global sym(vp9_copy_mem8x4_mmx) PRIVATE
|
||||
sym(vp9_copy_mem8x4_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;src;
|
||||
movq mm0, [rsi]
|
||||
|
||||
movsxd rax, dword ptr arg(1) ;src_stride;
|
||||
mov rdi, arg(2) ;dst;
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm2, [rsi+rax*2]
|
||||
|
||||
movsxd rcx, dword ptr arg(3) ;dst_stride
|
||||
lea rsi, [rsi+rax*2]
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+rcx], mm1
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
lea rdi, [rdi+rcx*2]
|
||||
|
||||
movq mm3, [rsi+rax]
|
||||
movq [rdi+rcx], mm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void copy_mem16x16_mmx(
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; )
|
||||
global sym(vp9_copy_mem16x16_mmx) PRIVATE
|
||||
sym(vp9_copy_mem16x16_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;src;
|
||||
movsxd rax, dword ptr arg(1) ;src_stride;
|
||||
|
||||
mov rdi, arg(2) ;dst;
|
||||
movsxd rcx, dword ptr arg(3) ;dst_stride
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq mm1, [rsi+rax]
|
||||
movq mm4, [rsi+rax+8]
|
||||
|
||||
movq mm2, [rsi+rax*2]
|
||||
movq mm5, [rsi+rax*2+8]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
add rsi, rax
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
movq [rdi+rcx], mm1
|
||||
movq [rdi+rcx+8], mm4
|
||||
|
||||
movq [rdi+rcx*2], mm2
|
||||
movq [rdi+rcx*2+8], mm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
add rdi, rcx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm3, [rsi+8];
|
||||
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
@ -1,115 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
;void copy_mem16x16_sse2(
|
||||
; unsigned char *src,
|
||||
; int src_stride,
|
||||
; unsigned char *dst,
|
||||
; int dst_stride
|
||||
; )
|
||||
global sym(vp9_copy_mem16x16_sse2) PRIVATE
|
||||
sym(vp9_copy_mem16x16_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;src;
|
||||
movdqu xmm0, [rsi]
|
||||
|
||||
movsxd rax, dword ptr arg(1) ;src_stride;
|
||||
mov rdi, arg(2) ;dst;
|
||||
|
||||
movdqu xmm1, [rsi+rax]
|
||||
movdqu xmm2, [rsi+rax*2]
|
||||
|
||||
movsxd rcx, dword ptr arg(3) ;dst_stride
|
||||
lea rsi, [rsi+rax*2]
|
||||
|
||||
movdqa [rdi], xmm0
|
||||
add rsi, rax
|
||||
|
||||
movdqa [rdi+rcx], xmm1
|
||||
movdqa [rdi+rcx*2],xmm2
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
movdqu xmm3, [rsi]
|
||||
|
||||
add rdi, rcx
|
||||
movdqu xmm4, [rsi+rax]
|
||||
|
||||
movdqu xmm5, [rsi+rax*2]
|
||||
lea rsi, [rsi+rax*2]
|
||||
|
||||
movdqa [rdi], xmm3
|
||||
add rsi, rax
|
||||
|
||||
movdqa [rdi+rcx], xmm4
|
||||
movdqa [rdi+rcx*2],xmm5
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
movdqu xmm0, [rsi]
|
||||
|
||||
add rdi, rcx
|
||||
movdqu xmm1, [rsi+rax]
|
||||
|
||||
movdqu xmm2, [rsi+rax*2]
|
||||
lea rsi, [rsi+rax*2]
|
||||
|
||||
movdqa [rdi], xmm0
|
||||
add rsi, rax
|
||||
|
||||
movdqa [rdi+rcx], xmm1
|
||||
|
||||
movdqa [rdi+rcx*2], xmm2
|
||||
movdqu xmm3, [rsi]
|
||||
|
||||
movdqu xmm4, [rsi+rax]
|
||||
lea rdi, [rdi+rcx*2]
|
||||
|
||||
add rdi, rcx
|
||||
movdqu xmm5, [rsi+rax*2]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
movdqa [rdi], xmm3
|
||||
|
||||
add rsi, rax
|
||||
movdqa [rdi+rcx], xmm4
|
||||
|
||||
movdqa [rdi+rcx*2],xmm5
|
||||
movdqu xmm0, [rsi]
|
||||
|
||||
lea rdi, [rdi+rcx*2]
|
||||
movdqu xmm1, [rsi+rax]
|
||||
|
||||
add rdi, rcx
|
||||
movdqu xmm2, [rsi+rax*2]
|
||||
|
||||
lea rsi, [rsi+rax*2]
|
||||
movdqa [rdi], xmm0
|
||||
|
||||
movdqa [rdi+rcx], xmm1
|
||||
movdqa [rdi+rcx*2],xmm2
|
||||
|
||||
movdqu xmm3, [rsi+rax]
|
||||
lea rdi, [rdi+rcx*2]
|
||||
|
||||
movdqa [rdi+rcx], xmm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
@ -75,10 +75,9 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
|
||||
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
|
||||
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
|
||||
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
|
||||
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
|
||||
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
|
||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
|
||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
|
||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
|
||||
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
|
||||
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
|
||||
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
|
||||
|
Loading…
x
Reference in New Issue
Block a user