VPX: scaled convolve : fix windows build errors

Change-Id: Ic81d435ea928183197040cdf64b6afd7dbaf57e4
2015-08-20 13:09:27 -07:00 · 2015-08-20 13:09:27 -07:00 · acf24cc1b8
commit acf24cc1b8
parent 2030c49cf8
1 changed files with 78 additions and 89 deletions
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@ -616,7 +616,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
  out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
 }

-static void filter_horiz_w8_ssse3(const uint8_t *src_x, int src_pitch,
+static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
@ -740,18 +740,17 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,

 static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
-  const __m64 k_256 = _mm_set1_pi16(1 << 8);
-  const __m64 f_values_lo = *((const __m64 *)filter);
-  const __m64 f_values_hi = *((const __m64 *)(filter + 4));
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
-  const __m64 f1f0 = _mm_shuffle_pi8(f_values_lo, _mm_set1_pi16(0x0200u));
-  const __m64 f3f2 = _mm_shuffle_pi8(f_values_lo, _mm_set1_pi16(0x0604u));
-  const __m64 f5f4 = _mm_shuffle_pi8(f_values_hi, _mm_set1_pi16(0x0a08u));
-  const __m64 f7f6 = _mm_shuffle_pi8(f_values_hi, _mm_set1_pi16(0x0e0cu));
-  const __m64 A = *((const __m64 *)src_ptr);
-  const __m64 B = *((const __m64 *)(src_ptr + src_pitch));
-  const __m64 C = *((const __m64 *)(src_ptr + src_pitch * 2));
-  const __m64 D = *((const __m64 *)(src_ptr + src_pitch * 3));
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
+  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
@ -769,63 +768,57 @@ static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
  // 06 16 26 36
  // 07 17 27 37
  //
-  // 00 01 10 11 02 03 12 13
-  const __m64 tr0_0 = _mm_unpacklo_pi16(A, B);
-  // 20 21 30 31 22 23 32 33
-  const __m64 tr0_1 = _mm_unpacklo_pi16(C, D);
-  // 04 05 14 15 06 07 16 17
-  const __m64 tr0_2 = _mm_unpackhi_pi16(A, B);
-  // 24 25 34 35 26 27 36 37
-  const __m64 tr0_3 = _mm_unpackhi_pi16(C, D);
-  // 00 01 10 11 20 21 30 31
-  const __m64 s1s0  = _mm_unpacklo_pi32(tr0_0, tr0_1);
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  const __m128i s1s0  = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
-  const __m64 s3s2 = _mm_unpackhi_pi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35
-  const __m64 s5s4 = _mm_unpacklo_pi32(tr0_2, tr0_3);
+  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
-  const __m64 s7s6 = _mm_unpackhi_pi32(tr0_2, tr0_3);
+  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
-  const __m64 x0 = _mm_maddubs_pi16(s1s0, f1f0);
-  const __m64 x1 = _mm_maddubs_pi16(s3s2, f3f2);
-  const __m64 x2 = _mm_maddubs_pi16(s5s4, f5f4);
-  const __m64 x3 = _mm_maddubs_pi16(s7s6, f7f6);
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
-  const __m64 min_x2x1 = _mm_min_pi16(x2, x1);
-  const __m64 max_x2x1 = _mm_max_pi16(x2, x1);
-  __m64 temp = _mm_adds_pi16(x0, x3);
-  temp = _mm_adds_pi16(temp, min_x2x1);
-  temp = _mm_adds_pi16(temp, max_x2x1);
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_pi16(temp, k_256);
+  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
-  temp = _mm_packs_pu16(temp, temp);
+  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi64_si32(temp);
+  *(int *)dst = _mm_cvtsi128_si32(temp);
 }

 static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m64 A = _mm_cvtsi32_si64(*(const int *)src);
-  __m64 B = _mm_cvtsi32_si64(*(const int *)(src + src_stride));
-  __m64 C = _mm_cvtsi32_si64(*(const int *)(src + src_stride * 2));
-  __m64 D = _mm_cvtsi32_si64(*(const int *)(src + src_stride * 3));
+  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
+  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
+  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
+  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
-  const __m64 tr0_0 = _mm_unpacklo_pi8(A, B);
+  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
-  const __m64 tr0_1 = _mm_unpacklo_pi8(C, D);
-  // 00 10 20 30 01 11 21 31
-  A = _mm_unpacklo_pi16(tr0_0, tr0_1);
-  // 02 12 22 32 03 13 23 33
-  C = _mm_unpackhi_pi16(tr0_0, tr0_1);
+  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  B = _mm_srli_si128(A, 4);
+  C = _mm_srli_si128(A, 8);
+  D = _mm_srli_si128(A, 12);

-  B = _mm_unpackhi_pi32(A, A);
-  D = _mm_unpackhi_pi32(C, C);
-
-  *(int *)(dst) =  _mm_cvtsi64_si32(A);
-  *(int *)(dst + dst_stride) =  _mm_cvtsi64_si32(B);
-  *(int *)(dst + dst_stride * 2) =  _mm_cvtsi64_si32(C);
-  *(int *)(dst + dst_stride * 3) =  _mm_cvtsi64_si32(D);
+  *(int *)(dst) =  _mm_cvtsi128_si32(A);
+  *(int *)(dst + dst_stride) =  _mm_cvtsi128_si32(B);
+  *(int *)(dst + dst_stride * 2) =  _mm_cvtsi128_si32(C);
+  *(int *)(dst + dst_stride * 3) =  _mm_cvtsi128_si32(D);
 }

 static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
@ -865,46 +858,42 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,

 static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
-  const __m64 k_256 = _mm_set1_pi16(1 << 8);
-  const __m64 f_values_lo = *((const __m64 *)filter);
-  const __m64 f_values_hi = *((const __m64 *)(filter + 4));
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
-  const __m64 f1f0 = _mm_shuffle_pi8(f_values_lo, _mm_set1_pi16(0x0200u));
-  const __m64 f3f2 = _mm_shuffle_pi8(f_values_lo, _mm_set1_pi16(0x0604u));
-  const __m64 f5f4 = _mm_shuffle_pi8(f_values_hi, _mm_set1_pi16(0x0a08u));
-  const __m64 f7f6 = _mm_shuffle_pi8(f_values_hi, _mm_set1_pi16(0x0e0cu));
-  const __m64 A = _mm_cvtsi32_si64(*(const int *)src_ptr);
-  const __m64 B = _mm_cvtsi32_si64(*(const int *)(src_ptr + src_pitch));
-  const __m64 C = _mm_cvtsi32_si64(*(const int *)(src_ptr + src_pitch * 2));
-  const __m64 D = _mm_cvtsi32_si64(*(const int *)(src_ptr + src_pitch * 3));
-  const __m64 E = _mm_cvtsi32_si64(*(const int *)(src_ptr + src_pitch * 4));
-  const __m64 F = _mm_cvtsi32_si64(*(const int *)(src_ptr + src_pitch * 5));
-  const __m64 G = _mm_cvtsi32_si64(*(const int *)(src_ptr + src_pitch * 6));
-  const __m64 H = _mm_cvtsi32_si64(*(const int *)(src_ptr + src_pitch * 7));
-  const __m64 s1s0 = _mm_unpacklo_pi8(A, B);
-  // 02 03 12 13 22 23 32 33
-  const __m64 s3s2 = _mm_unpacklo_pi8(C, D);
-  // 04 05 14 15 24 25 34 35
-  const __m64 s5s4 = _mm_unpacklo_pi8(E, F);
-  // 06 07 16 17 26 27 36 37
-  const __m64 s7s6 = _mm_unpacklo_pi8(G, H);
+  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
+  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
+  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
+  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
+  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
+  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
+  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
+  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
+  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
+  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
+  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
+  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
-  const __m64 x0 = _mm_maddubs_pi16(s1s0, f1f0);
-  const __m64 x1 = _mm_maddubs_pi16(s3s2, f3f2);
-  const __m64 x2 = _mm_maddubs_pi16(s5s4, f5f4);
-  const __m64 x3 = _mm_maddubs_pi16(s7s6, f7f6);
+  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
+  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
+  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
+  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
-  const __m64 min_x2x1 = _mm_min_pi16(x2, x1);
-  const __m64 max_x2x1 = _mm_max_pi16(x2, x1);
-  __m64 temp = _mm_adds_pi16(x0, x3);
-  temp = _mm_adds_pi16(temp, min_x2x1);
-  temp = _mm_adds_pi16(temp, max_x2x1);
+  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
+  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, min_x2x1);
+  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_pi16(temp, k_256);
+  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
-  temp = _mm_packs_pu16(temp, temp);
+  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi64_si32(temp);
+  *(int *)dst = _mm_cvtsi128_si32(temp);
 }

 static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,