Fix high bit-depth loop-filter SSE2 compile issue - part 4
Change-Id: I39f56f60425836f2e1ec07da71edd4810a4c78bb
This commit is contained in:
parent
a28a8cb726
commit
5b87f1bb5a
@ -706,15 +706,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
|
||||
const uint8_t *_thresh,
|
||||
int count, int bd) {
|
||||
const __m128i zero = _mm_set1_epi16(0);
|
||||
const __m128i blimit = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(
|
||||
_mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
|
||||
const __m128i limit = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(
|
||||
_mm_load_si128((const __m128i *)_limit), zero), bd - 8);
|
||||
const __m128i thresh = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(
|
||||
_mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
|
||||
__m128i blimit, limit, thresh;
|
||||
__m128i mask, hev, flat;
|
||||
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
|
||||
__m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
|
||||
@ -737,30 +729,63 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
|
||||
__m128i work;
|
||||
const __m128i t4 = _mm_set1_epi16(4);
|
||||
const __m128i t3 = _mm_set1_epi16(3);
|
||||
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
|
||||
const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
|
||||
const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
|
||||
const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
|
||||
__m128i t80;
|
||||
__m128i tff80;
|
||||
__m128i tffe0;
|
||||
__m128i t1f;
|
||||
// equivalent to shifting 0x1f left by bitdepth - 8
|
||||
// and setting the vacated low-order bits to 1
|
||||
const __m128i t1 = _mm_set1_epi16(0x1);
|
||||
const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
|
||||
__m128i t7f;
|
||||
// equivalent to shifting 0x7f left by bitdepth - 8
|
||||
// and setting the vacated low-order bits to 1
|
||||
const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
|
||||
t80);
|
||||
const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
|
||||
t80);
|
||||
const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
|
||||
t80);
|
||||
const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
|
||||
t80);
|
||||
__m128i ps1, ps0, qs0, qs1;
|
||||
__m128i filt;
|
||||
__m128i work_a;
|
||||
__m128i filter1, filter2;
|
||||
|
||||
(void)count;
|
||||
|
||||
if (bd == 8) {
|
||||
blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
|
||||
limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
|
||||
thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
|
||||
t80 = _mm_set1_epi16(0x80);
|
||||
tff80 = _mm_set1_epi16(0xff80);
|
||||
tffe0 = _mm_set1_epi16(0xffe0);
|
||||
t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
|
||||
t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
|
||||
} else if (bd == 10) {
|
||||
blimit = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
|
||||
limit = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
|
||||
thresh = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
|
||||
t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
|
||||
tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
|
||||
tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
|
||||
t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
|
||||
t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
|
||||
} else { // bd == 12
|
||||
blimit = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
|
||||
limit = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
|
||||
thresh = _mm_slli_epi16(
|
||||
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
|
||||
t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
|
||||
tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
|
||||
tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
|
||||
t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
|
||||
t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
|
||||
}
|
||||
|
||||
ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
|
||||
ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
|
||||
qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
|
||||
qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
|
||||
|
||||
// filter_mask and hev_mask
|
||||
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
|
||||
hev = _mm_subs_epu16(flat, thresh);
|
||||
@ -796,6 +821,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
|
||||
filt = _mm_adds_epi16(filt, work_a);
|
||||
filt = _mm_adds_epi16(filt, work_a);
|
||||
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
|
||||
|
||||
// (vp9_filter + 3 * (qs0 - ps0)) & mask
|
||||
filt = _mm_and_si128(filt, mask);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user