From ac3996a6d1bf1f3696cffbf1102528643391ec19 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 3 Feb 2017 15:57:28 -0800 Subject: [PATCH 1/3] quantize_fp highbd sse2: use tran_low_t for coeff Change-Id: Id96a8df33354a7987ce890a3d6798c7375ffa4aa --- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/x86/vp9_quantize_sse2.c | 53 +++++++++++++++-------------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 720e171b5..ddc85ded5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -136,7 +136,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error_fp sse2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon/; + specialize qw/vp9_quantize_fp neon sse2/; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 3f8ee5f24..954f8cb82 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -13,14 +13,16 @@ #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; __m128i thr; int16_t nzflag; @@ -53,8 +55,8 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -77,15 +79,15 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -120,8 +122,8 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i qcoeff0, qcoeff1; __m128i qtmp0, qtmp1; - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -146,20 +148,20 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } } @@ -199,10 +201,11 @@ void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); + + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; From 4682130b60dd27ec5a03d13a20e8249156fcd250 Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 15 Feb 2017 19:01:38 -0800 Subject: [PATCH 2/3] quantize_fp highbd ssse3: use tran_low_t for coeff Change-Id: Iebade0efc0efbb0a80a0f3adbef4962e3a2f25e8 --- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 49 ++++++++++--------- vpx_dsp/x86/bitdepth_conversion_sse2.asm | 38 +++++++++++--- 3 files changed, 57 insertions(+), 32 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index ddc85ded5..5ba98a278 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -136,7 +136,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error_fp sse2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/; + specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index ec61c0c3a..c9f0cbce4 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION_RODATA pw_1: times 8 dw 1 @@ -48,15 +49,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pxor m5, m5 ; m5 = dedicated zero - lea coeffq, [ coeffq+ncoeffq*2] - lea r5q, [ r5q+ncoeffq*2] - lea r3q, [ r3q+ncoeffq*2] - lea r4q, [r4q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq + lea r5q, [r5q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq + INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq neg ncoeffq ; get DC and first 15 AC coeffs - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpeqw m7, m7 @@ -69,8 +70,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m8 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m8, m8 pabsw m13, m13 @@ -87,8 +88,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %else psrlw m0, m3, 1 %endif - mova [r4q+ncoeffq*2+ 0], m8 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -102,8 +103,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jz .accumulate_eob .ac_only_loop: - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -123,8 +124,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m14 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m14, m14 pabsw m13, m13 @@ -137,8 +138,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m14, m9 psignw m13, m10 %endif - mova [r4q+ncoeffq*2+ 0], m14 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -154,10 +155,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jmp .accumulate_eob .skip_iter: - mova [r3q+ncoeffq*2+ 0], m5 - mova [r3q+ncoeffq*2+16], m5 - mova [r4q+ncoeffq*2+ 0], m5 - mova [r4q+ncoeffq*2+16], m5 + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8 + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8 add ncoeffq, mmsize jl .ac_only_loop @@ -186,10 +187,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ neg ncoeffq pxor m7, m7 .blank_loop: - mova [r0q+ncoeffq*2+ 0], m7 - mova [r0q+ncoeffq*2+16], m7 - mova [r2q+ncoeffq*2+ 0], m7 - mova [r2q+ncoeffq*2+16], m7 + STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8 + STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8 add ncoeffq, mmsize jl .blank_loop mov word [r3q], 0 diff --git a/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/vpx_dsp/x86/bitdepth_conversion_sse2.asm index 2bcbc0ac1..aacf71f7a 100644 --- a/vpx_dsp/x86/bitdepth_conversion_sse2.asm +++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm @@ -38,29 +38,53 @@ ; the values down to 16 bits. %macro LOAD_TRAN_LOW 3 %if CONFIG_VP9_HIGHBITDEPTH - mova m%1, [%2 + %3 * 4] - packssdw m%1, [%2 + %3 * 4 + 16] + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] %else - mova m%1, [%2 + %3 * 2] + mova m%1, [%2 + (%3) * 2] %endif %endmacro ; Store m%1 to %2 + %3. ; %3 is the offset in elements, not bytes. +; If 5 arguments are provided then m%1 is corrupted. +; If 6 arguments are provided then m%1 is preserved. ; If tran_low_t is 16 bits (low bit depth configuration) then store the value ; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign ; extend the values first. ; Uses m%4-m%6 as scratch registers for high bit depth. -%macro STORE_TRAN_LOW 5 +%macro STORE_TRAN_LOW 5-6 %if CONFIG_VP9_HIGHBITDEPTH pxor m%4, m%4 mova m%5, m%1 + %if %0 == 6 + mova m%6, m%1 + %endif pcmpgtw m%4, m%1 punpcklwd m%5, m%4 + %if %0 == 5 punpckhwd m%1, m%4 - mova [%2 + %3 * 4 + 0], m%5 - mova [%2 + %3 * 4 + 16], m%1 + %else + punpckhwd m%6, m%4 + %endif + mova [%2 + (%3) * 4 + 0], m%5 + %if %0 == 5 + mova [%2 + (%3) * 4 + 16], m%1 + %else + mova [%2 + (%3) * 4 + 16], m%6 + %endif %else - mova [%2 + %3 * 2], m%1 + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro + +; Store zeros (in m%1) to %2 + %3. +; %3 is the offset in elements, not bytes. +%macro STORE_ZERO_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova [%2 + (%3) * 4 + 0], m%1 + mova [%2 + (%3) * 4 + 16], m%1 +%else + mova [%2 + (%3) * 2], m%1 %endif %endmacro From ff37a911ce063f62d2ddae18e917628b6d82a0e9 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 16 Feb 2017 07:29:32 -0800 Subject: [PATCH 3/3] quantize_fp_32x32 highbd ssse3: enable existing function This was created as part of the quantize_fp_ssse3 change. Both functions use the same source file with different macro parameters. Change-Id: I267050a559426a85955d215aa0aaca270439c5ab --- vp9/common/vp9_rtcd_defs.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 5ba98a278..4aada2605 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -139,6 +139,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant neon ssse3/;