vpx: hadamard: use ptrdiff_t instead of int for stride

Eliminates the following sign-extension instruction from the x86 (64-bit)
intrinsic code:

movslq %esi,%rax

Change-Id: I8f5ebd40726f998708a668b0f52ea7a0576befae
This commit is contained in:
Scott LaVarnway 2017-10-26 09:45:06 -07:00
parent 037e596f04
commit 3bf02ad74a
8 changed files with 21 additions and 18 deletions

View File

@ -22,7 +22,8 @@ namespace {
using ::libvpx_test::ACMRandom; using ::libvpx_test::ACMRandom;
typedef void (*HadamardFunc)(const int16_t *a, int a_stride, tran_low_t *b); typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
tran_low_t *b);
void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
int16_t b[8]; int16_t b[8];

View File

@ -47,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
*a7 = vaddq_s16(c1, c5); *a7 = vaddq_s16(c1, c5);
} }
void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a0 = vld1q_s16(src_diff);
int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a1 = vld1q_s16(src_diff + src_stride);
@ -76,7 +76,7 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
store_s16q_to_tran_low(coeff + 56, a7); store_s16q_to_tran_low(coeff + 56, a7);
} }
void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int i; int i;

View File

@ -34,7 +34,7 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
// src_diff: first pass, 9 bit, dynamic range [-255, 255] // src_diff: first pass, 9 bit, dynamic range [-255, 255]
// second pass, 12 bit, dynamic range [-2040, 2040] // second pass, 12 bit, dynamic range [-2040, 2040]
static void hadamard_col8(const int16_t *src_diff, int src_stride, static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
int16_t *coeff) { int16_t *coeff) {
int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
@ -66,7 +66,7 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
// The order of the output coeff of the hadamard is not important. For // The order of the output coeff of the hadamard is not important. For
// optimization purposes the final transpose may be skipped. // optimization purposes the final transpose may be skipped.
void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int idx; int idx;
int16_t buffer[64]; int16_t buffer[64];
@ -92,7 +92,7 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
} }
// In place 16x16 2D Hadamard transform // In place 16x16 2D Hadamard transform
void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int idx; int idx;
for (idx = 0; idx < 4; ++idx) { for (idx = 0; idx < 4; ++idx) {

View File

@ -56,7 +56,8 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
return sum_out; return sum_out;
} }
void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) { void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
int16_t *dst) {
v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@ -80,7 +81,8 @@ void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) {
ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
} }
void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) { void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride,
int16_t *dst) {
v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

View File

@ -42,7 +42,7 @@ static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
v[7] = vec_add(c1, c5); v[7] = vec_add(c1, c5);
} }
void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride, void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int16x8_t v[8]; int16x8_t v[8];
@ -71,7 +71,7 @@ void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride,
store_tran_low(v[7], 0, coeff + 56); store_tran_low(v[7], 0, coeff + 56);
} }
void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride, void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int i; int i;
const uint16x8_t ones = vec_splat_u16(1); const uint16x8_t ones = vec_splat_u16(1);

View File

@ -765,19 +765,19 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_minmax_8x8 sse2 neon msa/; specialize qw/vpx_minmax_8x8 sse2 neon msa/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64"; specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd sse2 neon/; specialize qw/vpx_satd sse2 neon/;
} else { } else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";

View File

@ -91,7 +91,7 @@ static void hadamard_col8x2_avx2(__m256i *in, int iter) {
} }
} }
static void hadamard_8x8x2_avx2(int16_t const *src_diff, int src_stride, static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
int16_t *coeff) { int16_t *coeff) {
__m256i src[8]; __m256i src[8];
src[0] = _mm256_loadu_si256((const __m256i *)src_diff); src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
@ -131,7 +131,7 @@ static void hadamard_8x8x2_avx2(int16_t const *src_diff, int src_stride,
_mm256_permute2x128_si256(src[6], src[7], 0x31)); _mm256_permute2x128_si256(src[6], src[7], 0x31));
} }
void vpx_hadamard_16x16_avx2(int16_t const *src_diff, int src_stride, void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int idx; int idx;
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH

View File

@ -214,7 +214,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
} }
} }
void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
__m128i src[8]; __m128i src[8];
src[0] = _mm_load_si128((const __m128i *)src_diff); src[0] = _mm_load_si128((const __m128i *)src_diff);
@ -246,7 +246,7 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
store_tran_low(src[7], coeff); store_tran_low(src[7], coeff);
} }
void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) { tran_low_t *coeff) {
int idx; int idx;
for (idx = 0; idx < 4; ++idx) { for (idx = 0; idx < 4; ++idx) {