diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index cad08bbf3..7b3f0be24 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -184,7 +184,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
     if (mask & 1) {
       if (mask_16x16 & 1) {
         vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr, 1);
+                                   lfi->hev_thr);
         assert(!(mask_8x8 & 1));
         assert(!(mask_4x4 & 1));
         assert(!(mask_4x4_int & 1));
@@ -229,7 +229,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
       if (!only_4x4_1) {
         if (mask_16x16 & 1) {
           vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 1);
+                                       lfi->hev_thr);
           assert(!(mask_8x8 & 1));
           assert(!(mask_4x4 & 1));
           assert(!(mask_4x4_int & 1));
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 65f522bab..ce954c0c3 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -82,15 +82,4 @@ void vp9_loop_filter_partial_frame(struct VP9Common *cm,
 void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
                                       int sharpness_lvl);
 
-void vp9_mb_lpf_horizontal_edge_w(unsigned char *s, int p,
-                                  const unsigned char *blimit,
-                                  const unsigned char *limit,
-                                  const unsigned char *thresh,
-                                  int count);
-
-void vp9_mb_lpf_vertical_edge_w(unsigned char *s, int p,
-                                const unsigned char *blimit,
-                                const unsigned char *limit,
-                                const unsigned char *thresh,
-                                int count);
 #endif  // VP9_COMMON_VP9_LOOPFILTER_H_
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index bf03692a0..0efbcafe0 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -255,16 +255,15 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
   }
 }
 
-void vp9_mb_lpf_horizontal_edge_w(uint8_t *s, int p,
-                                  const uint8_t *blimit,
-                                  const uint8_t *limit,
-                                  const uint8_t *thresh,
-                                  int count) {
+void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p,
+                                    const uint8_t *blimit,
+                                    const uint8_t *limit,
+                                    const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
     const int8_t mask = filter_mask(*limit, *blimit,
@@ -285,14 +284,13 @@ void vp9_mb_lpf_horizontal_edge_w(uint8_t *s, int p,
   }
 }
 
-void vp9_mb_lpf_vertical_edge_w(uint8_t *s, int p,
-                                const uint8_t *blimit,
-                                const uint8_t *limit,
-                                const uint8_t *thresh,
-                                int count) {
+void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
+                                  const uint8_t *blimit,
+                                  const uint8_t *limit,
+                                  const uint8_t *thresh) {
   int i;
 
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask = filter_mask(*limit, *blimit,
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 74d3ab57f..ddfc85aa0 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -86,8 +86,8 @@ fi
 #
 # Loopfilter
 #
-prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_vertical_edge_w
+prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
+specialize vp9_mb_lpf_vertical_edge_w sse2
 
 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_vertical_edge
@@ -95,8 +95,8 @@ specialize vp9_mbloop_filter_vertical_edge
 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_loop_filter_vertical_edge mmx
 
-prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w
+prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
+specialize vp9_mb_lpf_horizontal_edge_w sse2
 
 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_horizontal_edge
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 7982ca6a2..9ab51208d 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -23,14 +23,14 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                        const unsigned char *_blimit,
                                        const unsigned char *_limit,
                                        const unsigned char *_thresh) {
-  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
-  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
+  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
+  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);
 
-  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]);
 
-  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
-  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
+  DECLARE_ALIGNED(16, unsigned char, ap[8][8]);
+  DECLARE_ALIGNED(16, unsigned char, aq[8][8]);
 
   __m128i mask, hev, flat, flat2;
@@ -50,27 +50,27 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
   const __m128i blimit =
       _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+  q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
 
-  _mm_store_si128((__m128i *)ap[4], p4);
-  _mm_store_si128((__m128i *)ap[3], p3);
-  _mm_store_si128((__m128i *)ap[2], p2);
-  _mm_store_si128((__m128i *)ap[1], p1);
-  _mm_store_si128((__m128i *)ap[0], p0);
-  _mm_store_si128((__m128i *)aq[4], q4);
-  _mm_store_si128((__m128i *)aq[3], q3);
-  _mm_store_si128((__m128i *)aq[2], q2);
-  _mm_store_si128((__m128i *)aq[1], q1);
-  _mm_store_si128((__m128i *)aq[0], q0);
+  _mm_storel_epi64((__m128i *)ap[4], p4);
+  _mm_storel_epi64((__m128i *)ap[3], p3);
+  _mm_storel_epi64((__m128i *)ap[2], p2);
+  _mm_storel_epi64((__m128i *)ap[1], p1);
+  _mm_storel_epi64((__m128i *)ap[0], p0);
+  _mm_storel_epi64((__m128i *)aq[4], q4);
+  _mm_storel_epi64((__m128i *)aq[3], q3);
+  _mm_storel_epi64((__m128i *)aq[2], q2);
+  _mm_storel_epi64((__m128i *)aq[1], q1);
+  _mm_storel_epi64((__m128i *)aq[0], q0);
 
   {
@@ -188,33 +188,33 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
 
-    p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-    q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+    p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+    q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
     flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
                                       _mm_subs_epu8(p0, p5)),
                          _mm_or_si128(_mm_subs_epu8(q5, q0),
                                       _mm_subs_epu8(q0, q5)));
-    _mm_store_si128((__m128i *)ap[5], p5);
-    _mm_store_si128((__m128i *)aq[5], q5);
+    _mm_storel_epi64((__m128i *)ap[5], p5);
+    _mm_storel_epi64((__m128i *)aq[5], q5);
     flat2 = _mm_max_epu8(work, flat2);
 
-    p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-    q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+    p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+    q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
                                      _mm_subs_epu8(p0, p6)),
                         _mm_or_si128(_mm_subs_epu8(q6, q0),
                                      _mm_subs_epu8(q0, q6)));
-    _mm_store_si128((__m128i *)ap[6], p6);
-    _mm_store_si128((__m128i *)aq[6], q6);
+    _mm_storel_epi64((__m128i *)ap[6], p6);
+    _mm_storel_epi64((__m128i *)aq[6], q6);
    flat2 = _mm_max_epu8(work, flat2);
 
-    p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-    q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+    p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+    q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
     work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
                                      _mm_subs_epu8(p0, p7)),
                         _mm_or_si128(_mm_subs_epu8(q7, q0),
                                      _mm_subs_epu8(q0, q7)));
-    _mm_store_si128((__m128i *)ap[7], p7);
-    _mm_store_si128((__m128i *)aq[7], q7);
+    _mm_storel_epi64((__m128i *)ap[7], p7);
+    _mm_storel_epi64((__m128i *)aq[7], q7);
     flat2 = _mm_max_epu8(work, flat2);
     flat2 = _mm_subs_epu8(flat2, one);
     flat2 = _mm_cmpeq_epi8(flat2, zero);
@@ -226,30 +226,26 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
   {
     const __m128i eight = _mm_set1_epi16(8);
     const __m128i four = _mm_set1_epi16(4);
-    __m128i temp_flat2 = flat2;
-    unsigned char *src = s;
-    int i = 0;
-    do {
+    {
      __m128i workp_shft;
      __m128i a, b, c;
 
-      unsigned int off = i * 8;
-      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
-      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
-      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
-      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
-      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
-      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero);
+      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero);
+      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero);
+      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero);
+      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero);
+      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero);
+      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero);
+      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero);
 
       c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
       c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
@@ -370,120 +366,117 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
       workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
       _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
                        _mm_packus_epi16(workp_shft, workp_shft));
-
-      temp_flat2 = _mm_srli_si128(temp_flat2, 8);
-      src += 8;
-    } while (++i < 2);
+    }
   }
   // wide flat
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-  work_a = _mm_load_si128((__m128i *)ap[2]);
-  p2 = _mm_load_si128((__m128i *)flat_op[2]);
+  work_a = _mm_loadl_epi64((__m128i *)ap[2]);
+  p2 = _mm_loadl_epi64((__m128i *)flat_op[2]);
   work_a = _mm_andnot_si128(flat, work_a);
   p2 = _mm_and_si128(flat, p2);
   p2 = _mm_or_si128(work_a, p2);
-  _mm_store_si128((__m128i *)flat_op[2], p2);
+  _mm_storel_epi64((__m128i *)flat_op[2], p2);
 
-  p1 = _mm_load_si128((__m128i *)flat_op[1]);
+  p1 = _mm_loadl_epi64((__m128i *)flat_op[1]);
   work_a = _mm_andnot_si128(flat, ps1);
   p1 = _mm_and_si128(flat, p1);
   p1 = _mm_or_si128(work_a, p1);
-  _mm_store_si128((__m128i *)flat_op[1], p1);
+  _mm_storel_epi64((__m128i *)flat_op[1], p1);
 
-  p0 = _mm_load_si128((__m128i *)flat_op[0]);
+  p0 = _mm_loadl_epi64((__m128i *)flat_op[0]);
   work_a = _mm_andnot_si128(flat, ps0);
   p0 = _mm_and_si128(flat, p0);
   p0 = _mm_or_si128(work_a, p0);
-  _mm_store_si128((__m128i *)flat_op[0], p0);
+  _mm_storel_epi64((__m128i *)flat_op[0], p0);
 
-  q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+  q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]);
   work_a = _mm_andnot_si128(flat, qs0);
   q0 = _mm_and_si128(flat, q0);
   q0 = _mm_or_si128(work_a, q0);
-  _mm_store_si128((__m128i *)flat_oq[0], q0);
+  _mm_storel_epi64((__m128i *)flat_oq[0], q0);
 
-  q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+  q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]);
   work_a = _mm_andnot_si128(flat, qs1);
   q1 = _mm_and_si128(flat, q1);
   q1 = _mm_or_si128(work_a, q1);
-  _mm_store_si128((__m128i *)flat_oq[1], q1);
+  _mm_storel_epi64((__m128i *)flat_oq[1], q1);
 
-  work_a = _mm_load_si128((__m128i *)aq[2]);
-  q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+  work_a = _mm_loadl_epi64((__m128i *)aq[2]);
+  q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]);
   work_a = _mm_andnot_si128(flat, work_a);
   q2 = _mm_and_si128(flat, q2);
   q2 = _mm_or_si128(work_a, q2);
-  _mm_store_si128((__m128i *)flat_oq[2], q2);
+  _mm_storel_epi64((__m128i *)flat_oq[2], q2);
 
   // write out op6 - op3
   {
     unsigned char *dst = (s - 7 * p);
     for (i = 6; i > 2; i--) {
       __m128i flat2_output;
-      work_a = _mm_load_si128((__m128i *)ap[i]);
-      flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
+      work_a = _mm_loadl_epi64((__m128i *)ap[i]);
+      flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]);
       work_a = _mm_andnot_si128(flat2, work_a);
       flat2_output = _mm_and_si128(flat2, flat2_output);
       work_a = _mm_or_si128(work_a, flat2_output);
-      _mm_storeu_si128((__m128i *)dst, work_a);
+      _mm_storel_epi64((__m128i *)dst, work_a);
       dst += p;
     }
   }
 
-  work_a = _mm_load_si128((__m128i *)flat_op[2]);
-  p2 = _mm_load_si128((__m128i *)flat2_op[2]);
+  work_a = _mm_loadl_epi64((__m128i *)flat_op[2]);
+  p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]);
   work_a = _mm_andnot_si128(flat2, work_a);
   p2 = _mm_and_si128(flat2, p2);
   p2 = _mm_or_si128(work_a, p2);
-  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
 
-  work_a = _mm_load_si128((__m128i *)flat_op[1]);
-  p1 = _mm_load_si128((__m128i *)flat2_op[1]);
+  work_a = _mm_loadl_epi64((__m128i *)flat_op[1]);
+  p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]);
   work_a = _mm_andnot_si128(flat2, work_a);
   p1 = _mm_and_si128(flat2, p1);
   p1 = _mm_or_si128(work_a, p1);
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
 
-  work_a = _mm_load_si128((__m128i *)flat_op[0]);
-  p0 = _mm_load_si128((__m128i *)flat2_op[0]);
+  work_a = _mm_loadl_epi64((__m128i *)flat_op[0]);
+  p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]);
   work_a = _mm_andnot_si128(flat2, work_a);
   p0 = _mm_and_si128(flat2, p0);
   p0 = _mm_or_si128(work_a, p0);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
 
-  work_a = _mm_load_si128((__m128i *)flat_oq[0]);
-  q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
+  work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]);
+  q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]);
   work_a = _mm_andnot_si128(flat2, work_a);
   q0 = _mm_and_si128(flat2, q0);
   q0 = _mm_or_si128(work_a, q0);
-  _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+  _mm_storel_epi64((__m128i *)(s - 0 * p), q0);
 
-  work_a = _mm_load_si128((__m128i *)flat_oq[1]);
-  q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
+  work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]);
+  q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]);
   work_a = _mm_andnot_si128(flat2, work_a);
   q1 = _mm_and_si128(flat2, q1);
   q1 = _mm_or_si128(work_a, q1);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
 
-  work_a = _mm_load_si128((__m128i *)flat_oq[2]);
-  q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
+  work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]);
+  q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]);
   work_a = _mm_andnot_si128(flat2, work_a);
   q2 = _mm_and_si128(flat2, q2);
   q2 = _mm_or_si128(work_a, q2);
-  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 
   // write out oq3 - oq7
   {
     unsigned char *dst = (s + 3 * p);
     for (i = 3; i < 7; i++) {
       __m128i flat2_output;
-      work_a = _mm_load_si128((__m128i *)aq[i]);
-      flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
+      work_a = _mm_loadl_epi64((__m128i *)aq[i]);
+      flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]);
       work_a = _mm_andnot_si128(flat2, work_a);
       flat2_output = _mm_and_si128(flat2, flat2_output);
       work_a = _mm_or_si128(work_a, flat2_output);
-      _mm_storeu_si128((__m128i *)dst, work_a);
+      _mm_storel_epi64((__m128i *)dst, work_a);
       dst += p;
     }
   }
@@ -967,9 +960,14 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
   unsigned char *src[4];
   unsigned char *dst[4];
 
+  dst[0] = t_dst;
+  dst[1] = t_dst + 8 * 16;
+
+  src[0] = s - 8;
+  src[1] = s - 8 + 8;
+
   /* Transpose 16x16 */
-  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
-  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
+  transpose(src, p, dst, 16, 2);
 
   /* Loop filtering */
   vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
@@ -977,16 +975,11 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
   src[0] = t_dst;
   src[1] = t_dst + 8 * 16;
-  src[2] = t_dst + 8;
-  src[3] = t_dst + 8 * 16 + 8;
 
   dst[0] = s - 8;
   dst[1] = s - 8 + 8;
-  dst[2] = s - 8 + p * 8;
-  dst[3] = s - 8 + p * 8 + 8;
-
   /* Transpose 16x16 */
-  transpose(src, 16, dst, p, 4);
+  transpose(src, 16, dst, p, 2);
 }
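
Note (illustrative sketch, not part of the patch): the core of this change is narrowing the wide loop filter from a 16-pixel pass to a single 8-pixel pass, which is why every 16-byte access (_mm_loadu_si128 / _mm_store_si128 / _mm_storeu_si128) becomes an 8-byte low-half access (_mm_loadl_epi64 / _mm_storel_epi64), the scratch arrays shrink from [16] to [8] columns, and the do-while over two 8-pixel halves collapses to one block. A minimal SSE2 sketch of that access idiom follows; copy_row8 is a hypothetical name for illustration, not a function from the tree.

#include <emmintrin.h>  /* SSE2 intrinsics */

/* Hypothetical helper: move one 8-pixel row through an XMM register,
 * mirroring the load/store idiom the patched filter switches to.
 * _mm_loadl_epi64 reads only 8 bytes into the low half of the register
 * (upper half zeroed); _mm_storel_epi64 writes only the low 8 bytes back. */
static void copy_row8(unsigned char *dst, const unsigned char *src) {
  const __m128i row = _mm_loadl_epi64((const __m128i *)src);  /* 8 bytes in */
  _mm_storel_epi64((__m128i *)dst, row);                      /* 8 bytes out */
}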