diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 0d19aa05b..028f8ff94 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -21,7 +21,7 @@ extern "C" { #include "vp9/common/vp9_entropy.h" #include "./vp9_rtcd.h" -void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch); +void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *output, int pitch); } #include "vpx/vpx_integer.h" @@ -496,7 +496,7 @@ using std::tr1::make_tuple; INSTANTIATE_TEST_CASE_P( C, Trans16x16DCT, ::testing::Values( - make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0))); + make_tuple(&vp9_short_fdct16x16_c, &vp9_idct16x16_256_add_c, 0))); INSTANTIATE_TEST_CASE_P( C, Trans16x16HT, ::testing::Values( @@ -510,7 +510,7 @@ INSTANTIATE_TEST_CASE_P( SSE2, Trans16x16DCT, ::testing::Values( make_tuple(&vp9_short_fdct16x16_sse2, - &vp9_short_idct16x16_add_sse2, 0))); + &vp9_idct16x16_256_add_sse2, 0))); INSTANTIATE_TEST_CASE_P( SSE2, Trans16x16HT, ::testing::Values( diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c index fb7b5cdc4..33aa4e001 100644 --- a/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -11,19 +11,19 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input, +extern void vp9_idct16x16_256_add_neon_pass1(int16_t *input, int16_t *output, int output_stride); -extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, +extern void vp9_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *output, int16_t *pass1Output, int16_t skip_adding, uint8_t *dest, int dest_stride); -extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input, +extern void vp9_idct16x16_10_add_neon_pass1(int16_t *input, int16_t *output, int output_stride); -extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, +extern void vp9_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *output, int16_t *pass1Output, int16_t skip_adding, @@ -34,7 +34,7 @@ extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, extern void vp9_push_neon(int64_t *store); extern void vp9_pop_neon(int64_t *store); -void vp9_short_idct16x16_add_neon(int16_t *input, +void vp9_idct16x16_256_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; @@ -46,12 +46,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct16x16_add_neon_pass2(input+1, + vp9_idct16x16_256_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -61,12 +61,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct16x16_add_neon_pass2(input+8*16+1, + vp9_idct16x16_256_add_neon_pass2(input+8*16+1, row_idct_output+8, pass1_output, 0, @@ -76,12 +76,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, @@ -91,12 +91,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, @@ -109,7 +109,7 @@ void vp9_short_idct16x16_add_neon(int16_t *input, return; } -void vp9_short_idct16x16_10_add_neon(int16_t *input, +void vp9_idct16x16_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; @@ -121,12 +121,12 @@ void vp9_short_idct16x16_10_add_neon(int16_t *input, /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8); + vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct16x16_10_add_neon_pass2(input+1, + vp9_idct16x16_10_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -138,12 +138,12 @@ void vp9_short_idct16x16_10_add_neon(int16_t *input, /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, @@ -153,12 +153,12 @@ void vp9_short_idct16x16_10_add_neon(int16_t *input, /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm index cf5c8f7d0..b1fd21bb6 100644 --- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct16x16_1_add_neon| + EXPORT |vp9_idct16x16_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct16x16_1_add_neon| PROC +|vp9_idct16x16_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -193,6 +193,6 @@ vst1.64 {d31}, [r12], r2 bx lr - ENDP ; |vp9_short_idct16x16_1_add_neon| + ENDP ; |vp9_idct16x16_1_add_neon| END diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm index df2a0526c..a13c0d04b 100644 --- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -8,10 +8,10 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct16x16_add_neon_pass1| - EXPORT |vp9_short_idct16x16_add_neon_pass2| - EXPORT |vp9_short_idct16x16_10_add_neon_pass1| - EXPORT |vp9_short_idct16x16_10_add_neon_pass2| + EXPORT |vp9_idct16x16_256_add_neon_pass1| + EXPORT |vp9_idct16x16_256_add_neon_pass2| + EXPORT |vp9_idct16x16_10_add_neon_pass1| + EXPORT |vp9_idct16x16_10_add_neon_pass2| ARM REQUIRE8 PRESERVE8 @@ -36,7 +36,7 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input, +;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -46,7 +46,7 @@ ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_add_neon_pass1| PROC +|vp9_idct16x16_256_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -273,9 +273,9 @@ vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct16x16_add_neon_pass1| + ENDP ; |vp9_idct16x16_256_add_neon_pass1| -;void vp9_short_idct16x16_add_neon_pass2(int16_t *src, +;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -292,7 +292,7 @@ ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_add_neon_pass2| PROC +|vp9_idct16x16_256_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -784,9 +784,9 @@ skip_adding_dest end_idct16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct16x16_add_neon_pass2| + ENDP ; |vp9_idct16x16_256_add_neon_pass2| -;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input, +;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -796,7 +796,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_10_add_neon_pass1| PROC +|vp9_idct16x16_10_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -905,9 +905,9 @@ end_idct16x16_pass2 vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct16x16_10_add_neon_pass1| + ENDP ; |vp9_idct16x16_10_add_neon_pass1| -;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src, +;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -924,7 +924,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_10_add_neon_pass2| PROC +|vp9_idct16x16_10_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -1175,5 +1175,5 @@ end_idct16x16_pass2 end_idct10_16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct16x16_10_add_neon_pass2| + ENDP ; |vp9_idct16x16_10_add_neon_pass2| END diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index dea923724..2df4c206e 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -611,7 +611,7 @@ static void idct16_1d(int16_t *input, int16_t *output) { output[15] = step2[0] - step2[15]; } -void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[16 * 16]; int16_t *outptr = out; int i, j; @@ -838,7 +838,7 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, + dest[j * dest_stride + i]); } } -void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest, +void vp9_idct16x16_10_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; @@ -864,7 +864,7 @@ void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest, } } -void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, +void vp9_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { int i, j; int a1; @@ -1320,17 +1320,17 @@ void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob) { } } -void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob) { +void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob) { /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ if (eob) { if (eob == 1) /* DC only DCT coefficient. */ - vp9_short_idct16x16_1_add(input, dest, stride); + vp9_idct16x16_1_add(input, dest, stride); else if (eob <= 10) - vp9_short_idct16x16_10_add(input, dest, stride); + vp9_idct16x16_10_add(input, dest, stride); else - vp9_short_idct16x16_add(input, dest, stride); + vp9_idct16x16_256_add(input, dest, stride); } } @@ -1366,7 +1366,7 @@ void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest, void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, int eob) { if (tx_type == DCT_DCT) { - vp9_idct_add_16x16(input, dest, stride, eob); + vp9_idct16x16_add(input, dest, stride, eob); } else { if (eob > 0) { vp9_short_iht16x16_add(input, dest, stride, tx_type); diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 41519ce44..e01c6638b 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -91,7 +91,7 @@ typedef struct { void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob); -void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 244064f60..9aaf84867 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -282,14 +282,14 @@ specialize vp9_idct8x8_64_add sse2 neon prototype void vp9_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_idct8x8_10_add sse2 neon -prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_1_add sse2 neon +prototype void vp9_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct16x16_1_add sse2 neon -prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_add sse2 neon +prototype void vp9_idct16x16_256_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct16x16_256_add sse2 neon -prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_10_add sse2 neon +prototype void vp9_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct16x16_10_add sse2 neon prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2 neon diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 84338051b..c9d0d092a 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -1263,7 +1263,7 @@ void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { stp2_10, stp2_13, stp2_11, stp2_12) \ } -void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i zero = _mm_setzero_si128(); @@ -1470,7 +1470,7 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } -void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a, i; @@ -2456,7 +2456,7 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, write_buffer_8x16(dest, in1, stride); } -void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, +void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 994d0c810..9c6605cdd 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -457,7 +457,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; case TX_16X16: - vp9_idct_add_16x16(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); + vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); break; case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);