bitdepth conversion: really use num elements
The previous implementation confused bits, bytes, and elements. It used '32' as the multiplier, but that value was mistakenly adopted because a 32x32 transform embedded the stride. Change-Id: Ieeb867a332416b9a40580b5e7c9b20088e9e691a
This commit is contained in:
parent
60a10116d1
commit
44600442dc
@ -64,6 +64,6 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
|
|||||||
psllw m1, 2
|
psllw m1, 2
|
||||||
|
|
||||||
STORE_TRAN_LOW 0, outputq, 0, 2, 3
|
STORE_TRAN_LOW 0, outputq, 0, 2, 3
|
||||||
STORE_TRAN_LOW 1, outputq, 1, 2, 3
|
STORE_TRAN_LOW 1, outputq, 8, 2, 3
|
||||||
|
|
||||||
RET
|
RET
|
||||||
|
@ -91,8 +91,8 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
|
|||||||
.loop:
|
.loop:
|
||||||
LOAD_TRAN_LOW 2, uqcq, 0
|
LOAD_TRAN_LOW 2, uqcq, 0
|
||||||
LOAD_TRAN_LOW 0, dqcq, 0
|
LOAD_TRAN_LOW 0, dqcq, 0
|
||||||
LOAD_TRAN_LOW 3, uqcq, 1
|
LOAD_TRAN_LOW 3, uqcq, 8
|
||||||
LOAD_TRAN_LOW 1, dqcq, 1
|
LOAD_TRAN_LOW 1, dqcq, 8
|
||||||
INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
|
INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
|
||||||
INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
|
INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
|
||||||
sub sizeq, 16
|
sub sizeq, 16
|
||||||
|
@ -117,14 +117,14 @@ cglobal hadamard_8x8, 3, 5, 11, input, stride, output
|
|||||||
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
|
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
|
||||||
HMD8_1D
|
HMD8_1D
|
||||||
|
|
||||||
STORE_TRAN_LOW 0, outputq, 0, 8, 9
|
STORE_TRAN_LOW 0, outputq, 0, 8, 9
|
||||||
STORE_TRAN_LOW 1, outputq, 1, 8, 9
|
STORE_TRAN_LOW 1, outputq, 8, 8, 9
|
||||||
STORE_TRAN_LOW 2, outputq, 2, 8, 9
|
STORE_TRAN_LOW 2, outputq, 16, 8, 9
|
||||||
STORE_TRAN_LOW 3, outputq, 3, 8, 9
|
STORE_TRAN_LOW 3, outputq, 24, 8, 9
|
||||||
STORE_TRAN_LOW 4, outputq, 4, 8, 9
|
STORE_TRAN_LOW 4, outputq, 32, 8, 9
|
||||||
STORE_TRAN_LOW 5, outputq, 5, 8, 9
|
STORE_TRAN_LOW 5, outputq, 40, 8, 9
|
||||||
STORE_TRAN_LOW 6, outputq, 6, 8, 9
|
STORE_TRAN_LOW 6, outputq, 48, 8, 9
|
||||||
STORE_TRAN_LOW 7, outputq, 7, 8, 9
|
STORE_TRAN_LOW 7, outputq, 56, 8, 9
|
||||||
|
|
||||||
RET
|
RET
|
||||||
%endif
|
%endif
|
||||||
|
@ -32,21 +32,21 @@
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; Load %2 + %3 into m%1.
|
; Load %2 + %3 into m%1.
|
||||||
; %3 is the offset in elements, not bits.
|
; %3 is the offset in elements, not bytes.
|
||||||
; If tran_low_t is 16 bits (low bit depth configuration) then load the value
|
; If tran_low_t is 16 bits (low bit depth configuration) then load the value
|
||||||
; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
|
; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
|
||||||
; the values down to 16 bits.
|
; the values down to 16 bits.
|
||||||
%macro LOAD_TRAN_LOW 3
|
%macro LOAD_TRAN_LOW 3
|
||||||
%if CONFIG_VP9_HIGHBITDEPTH
|
%if CONFIG_VP9_HIGHBITDEPTH
|
||||||
mova m%1, [%2 + %3 * 32]
|
mova m%1, [%2 + %3 * 4]
|
||||||
packssdw m%1, [%2 + %3 * 32 + 16]
|
packssdw m%1, [%2 + %3 * 4 + 16]
|
||||||
%else
|
%else
|
||||||
mova m%1, [%2 + %3 * 16]
|
mova m%1, [%2 + %3 * 2]
|
||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
; Store m%1 to %2 + %3.
|
; Store m%1 to %2 + %3.
|
||||||
; %3 is the offset in elements, not bits.
|
; %3 is the offset in elements, not bytes.
|
||||||
; If tran_low_t is 16 bits (low bit depth configuration) then store the value
|
; If tran_low_t is 16 bits (low bit depth configuration) then store the value
|
||||||
; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
|
; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
|
||||||
; extend the values first.
|
; extend the values first.
|
||||||
@ -58,9 +58,9 @@
|
|||||||
pcmpgtw m%4, m%1
|
pcmpgtw m%4, m%1
|
||||||
punpcklwd m%5, m%4
|
punpcklwd m%5, m%4
|
||||||
punpckhwd m%1, m%4
|
punpckhwd m%1, m%4
|
||||||
mova [%2 + %3 * 32 + 0], m%5
|
mova [%2 + %3 * 4 + 0], m%5
|
||||||
mova [%2 + %3 * 32 + 16], m%1
|
mova [%2 + %3 * 4 + 16], m%1
|
||||||
%else
|
%else
|
||||||
mova [%2 + %3 * 16], m%1
|
mova [%2 + %3 * 2], m%1
|
||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
@ -984,14 +984,14 @@ idct32x32_135:
|
|||||||
mov r7, 2
|
mov r7, 2
|
||||||
|
|
||||||
idct32x32_135_transpose:
|
idct32x32_135_transpose:
|
||||||
LOAD_TRAN_LOW 0, r3, 0
|
LOAD_TRAN_LOW 0, r3, 0
|
||||||
LOAD_TRAN_LOW 1, r3, 4
|
LOAD_TRAN_LOW 1, r3, 32
|
||||||
LOAD_TRAN_LOW 2, r3, 8
|
LOAD_TRAN_LOW 2, r3, 64
|
||||||
LOAD_TRAN_LOW 3, r3, 12
|
LOAD_TRAN_LOW 3, r3, 96
|
||||||
LOAD_TRAN_LOW 4, r3, 16
|
LOAD_TRAN_LOW 4, r3, 128
|
||||||
LOAD_TRAN_LOW 5, r3, 20
|
LOAD_TRAN_LOW 5, r3, 160
|
||||||
LOAD_TRAN_LOW 6, r3, 24
|
LOAD_TRAN_LOW 6, r3, 192
|
||||||
LOAD_TRAN_LOW 7, r3, 28
|
LOAD_TRAN_LOW 7, r3, 224
|
||||||
|
|
||||||
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
|
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
|
||||||
|
|
||||||
@ -1422,14 +1422,14 @@ idct32x32_1024:
|
|||||||
mov r7, 4
|
mov r7, 4
|
||||||
|
|
||||||
idct32x32_1024_transpose:
|
idct32x32_1024_transpose:
|
||||||
LOAD_TRAN_LOW 0, r3, 0
|
LOAD_TRAN_LOW 0, r3, 0
|
||||||
LOAD_TRAN_LOW 1, r3, 4
|
LOAD_TRAN_LOW 1, r3, 32
|
||||||
LOAD_TRAN_LOW 2, r3, 8
|
LOAD_TRAN_LOW 2, r3, 64
|
||||||
LOAD_TRAN_LOW 3, r3, 12
|
LOAD_TRAN_LOW 3, r3, 96
|
||||||
LOAD_TRAN_LOW 4, r3, 16
|
LOAD_TRAN_LOW 4, r3, 128
|
||||||
LOAD_TRAN_LOW 5, r3, 20
|
LOAD_TRAN_LOW 5, r3, 160
|
||||||
LOAD_TRAN_LOW 6, r3, 24
|
LOAD_TRAN_LOW 6, r3, 192
|
||||||
LOAD_TRAN_LOW 7, r3, 28
|
LOAD_TRAN_LOW 7, r3, 224
|
||||||
|
|
||||||
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
|
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
|
||||||
|
|
||||||
|
@ -84,7 +84,7 @@ SECTION .text
|
|||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
|
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
|
||||||
LOAD_TRAN_LOW 0, inputq, 0
|
LOAD_TRAN_LOW 0, inputq, 0
|
||||||
LOAD_TRAN_LOW 1, inputq, 1
|
LOAD_TRAN_LOW 1, inputq, 8
|
||||||
psraw m0, 2
|
psraw m0, 2
|
||||||
psraw m1, 2
|
psraw m1, 2
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user