Change eob threshold for partial inverse 8x8 2D-DCT to 12
In the scanning order, the first 12 coefficients of the 8x8 2D-DCT all lie within the top-left 4x4 block. Hence the partial inverse 8x8 2D-DCT can handle every case with eob <= 12, rather than only eob <= 10. The overall runtime of the inverse 8x8 2D-DCT unit is reduced from 166 cycles (using SSE2) to 150 cycles (using SSSE3). Change-Id: I4514f9748042809ac84df4c14382c00f313f1cd2
This commit is contained in:
parent
9e7b09bc5d
commit
41a350a83d
@ -132,8 +132,8 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
&vp9_idct16x16_1_add_c,
|
&vp9_idct16x16_1_add_c,
|
||||||
TX_16X16, 1),
|
TX_16X16, 1),
|
||||||
make_tuple(&vp9_idct8x8_64_add_c,
|
make_tuple(&vp9_idct8x8_64_add_c,
|
||||||
&vp9_idct8x8_10_add_c,
|
&vp9_idct8x8_12_add_c,
|
||||||
TX_8X8, 10),
|
TX_8X8, 12),
|
||||||
make_tuple(&vp9_idct8x8_64_add_c,
|
make_tuple(&vp9_idct8x8_64_add_c,
|
||||||
&vp9_idct8x8_1_add_c,
|
&vp9_idct8x8_1_add_c,
|
||||||
TX_8X8, 1),
|
TX_8X8, 1),
|
||||||
@ -154,8 +154,8 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
&vp9_idct16x16_1_add_neon,
|
&vp9_idct16x16_1_add_neon,
|
||||||
TX_16X16, 1),
|
TX_16X16, 1),
|
||||||
make_tuple(&vp9_idct8x8_64_add_c,
|
make_tuple(&vp9_idct8x8_64_add_c,
|
||||||
&vp9_idct8x8_10_add_neon,
|
&vp9_idct8x8_12_add_neon,
|
||||||
TX_8X8, 10),
|
TX_8X8, 12),
|
||||||
make_tuple(&vp9_idct8x8_64_add_c,
|
make_tuple(&vp9_idct8x8_64_add_c,
|
||||||
&vp9_idct8x8_1_add_neon,
|
&vp9_idct8x8_1_add_neon,
|
||||||
TX_8X8, 1),
|
TX_8X8, 1),
|
||||||
@ -181,8 +181,8 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
&vp9_idct16x16_1_add_sse2,
|
&vp9_idct16x16_1_add_sse2,
|
||||||
TX_16X16, 1),
|
TX_16X16, 1),
|
||||||
make_tuple(&vp9_idct8x8_64_add_c,
|
make_tuple(&vp9_idct8x8_64_add_c,
|
||||||
&vp9_idct8x8_10_add_sse2,
|
&vp9_idct8x8_12_add_sse2,
|
||||||
TX_8X8, 10),
|
TX_8X8, 12),
|
||||||
make_tuple(&vp9_idct8x8_64_add_c,
|
make_tuple(&vp9_idct8x8_64_add_c,
|
||||||
&vp9_idct8x8_1_add_sse2,
|
&vp9_idct8x8_1_add_sse2,
|
||||||
TX_8X8, 1),
|
TX_8X8, 1),
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
;
|
;
|
||||||
|
|
||||||
EXPORT |vp9_idct8x8_64_add_neon|
|
EXPORT |vp9_idct8x8_64_add_neon|
|
||||||
EXPORT |vp9_idct8x8_10_add_neon|
|
EXPORT |vp9_idct8x8_12_add_neon|
|
||||||
ARM
|
ARM
|
||||||
REQUIRE8
|
REQUIRE8
|
||||||
PRESERVE8
|
PRESERVE8
|
||||||
@ -310,13 +310,13 @@
|
|||||||
bx lr
|
bx lr
|
||||||
ENDP ; |vp9_idct8x8_64_add_neon|
|
ENDP ; |vp9_idct8x8_64_add_neon|
|
||||||
|
|
||||||
;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
|
;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
|
||||||
;
|
;
|
||||||
; r0 int16_t input
|
; r0 int16_t input
|
||||||
; r1 uint8_t *dest
|
; r1 uint8_t *dest
|
||||||
; r2 int dest_stride)
|
; r2 int dest_stride)
|
||||||
|
|
||||||
|vp9_idct8x8_10_add_neon| PROC
|
|vp9_idct8x8_12_add_neon| PROC
|
||||||
push {r4-r9}
|
push {r4-r9}
|
||||||
vpush {d8-d15}
|
vpush {d8-d15}
|
||||||
vld1.s16 {q8,q9}, [r0]!
|
vld1.s16 {q8,q9}, [r0]!
|
||||||
@ -514,6 +514,6 @@
|
|||||||
vpop {d8-d15}
|
vpop {d8-d15}
|
||||||
pop {r4-r9}
|
pop {r4-r9}
|
||||||
bx lr
|
bx lr
|
||||||
ENDP ; |vp9_idct8x8_10_add_neon|
|
ENDP ; |vp9_idct8x8_12_add_neon|
|
||||||
|
|
||||||
END
|
END
|
||||||
|
@ -617,7 +617,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
|
void vp9_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
|
||||||
int dest_stride) {
|
int dest_stride) {
|
||||||
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
|
DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
|
||||||
int16_t *outptr = out;
|
int16_t *outptr = out;
|
||||||
|
@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
|
void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
|
||||||
int16_t out[8 * 8] = { 0 };
|
int16_t out[8 * 8] = { 0 };
|
||||||
int16_t *outptr = out;
|
int16_t *outptr = out;
|
||||||
int i, j;
|
int i, j;
|
||||||
@ -1348,8 +1348,8 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
|
|||||||
if (eob == 1)
|
if (eob == 1)
|
||||||
// DC only DCT coefficient
|
// DC only DCT coefficient
|
||||||
vp9_idct8x8_1_add(input, dest, stride);
|
vp9_idct8x8_1_add(input, dest, stride);
|
||||||
else if (eob <= 10)
|
else if (eob <= 12)
|
||||||
vp9_idct8x8_10_add(input, dest, stride);
|
vp9_idct8x8_12_add(input, dest, stride);
|
||||||
else
|
else
|
||||||
vp9_idct8x8_64_add(input, dest, stride);
|
vp9_idct8x8_64_add(input, dest, stride);
|
||||||
}
|
}
|
||||||
|
@ -312,8 +312,8 @@ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
|
|||||||
add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
||||||
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
|
specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
|
||||||
|
|
||||||
add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
||||||
specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/, "$ssse3_x86_64";
|
specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
|
||||||
|
|
||||||
add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
|
||||||
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
|
specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
|
||||||
|
@ -995,7 +995,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
|
|||||||
RECON_AND_STORE(dest, in[7]);
|
RECON_AND_STORE(dest, in[7]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
|
void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
|
||||||
const __m128i zero = _mm_setzero_si128();
|
const __m128i zero = _mm_setzero_si128();
|
||||||
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||||
const __m128i final_rounding = _mm_set1_epi16(1<<4);
|
const __m128i final_rounding = _mm_set1_epi16(1<<4);
|
||||||
|
@ -185,7 +185,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
|
|||||||
RET
|
RET
|
||||||
|
|
||||||
; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
|
; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
|
||||||
cglobal idct8x8_10_add, 3, 5, 13, input, output, stride
|
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
|
||||||
mova m8, [pd_8192]
|
mova m8, [pd_8192]
|
||||||
mova m11, [pw_16]
|
mova m11, [pw_16]
|
||||||
mova m12, [pw_11585x2]
|
mova m12, [pw_11585x2]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user