Add ARM64 Code and UnitTest for reconstruction
This commit is contained in:
parent
5a60d0fef4
commit
d88b83df44
@ -47,6 +47,7 @@
|
||||
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; };
|
||||
9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; };
|
||||
9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; };
|
||||
F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXCopyFilesBuildPhase section */
|
||||
@ -155,6 +156,7 @@
|
||||
9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };
|
||||
9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; };
|
||||
9AED66671946A2C4009A3567 /* utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = utils.h; path = ../../../common/inc/utils.h; sourceTree = "<group>"; };
|
||||
F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = reconstruct_aarch64_neon.S; path = arm64/reconstruct_aarch64_neon.S; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
/* Begin PBXFrameworksBuildPhase section */
|
||||
@ -184,6 +186,7 @@
|
||||
4CB8F2B219235FAC005D6386 /* arm64 */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */,
|
||||
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,
|
||||
4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
|
||||
4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
|
||||
@ -430,6 +433,7 @@
|
||||
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
|
||||
4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
|
||||
4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,
|
||||
F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */,
|
||||
4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */,
|
||||
4CE4471818BC605C0017DF25 /* md.cpp in Sources */,
|
||||
4CE4471B18BC605C0017DF25 /* nal_encap.cpp in Sources */,
|
||||
|
947
codec/encoder/core/arm64/reconstruct_aarch64_neon.S
Normal file
947
codec/encoder/core/arm64/reconstruct_aarch64_neon.S
Normal file
@ -0,0 +1,947 @@
|
||||
/*!
|
||||
* \copy
|
||||
* Copyright (c) 2013, Cisco Systems
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
.text
|
||||
#include "arm_arch64_common_macro.S"
|
||||
|
||||
#ifdef __APPLE__
|
||||
.macro ZERO_COUNT_IN_2_QUARWORD
|
||||
// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
|
||||
cmeq $0.8h, $0.8h, #0
|
||||
cmeq $1.8h, $1.8h, #0
|
||||
uzp1 $0.16b, $0.16b, $1.16b
|
||||
ushr $0.16b, $0.16b, 7
|
||||
addv $2, $0.16b
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
|
||||
// { // input: coef, ff (dst), mf
|
||||
eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
|
||||
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
|
||||
smull $4.4s, $1.4h, $2.4h
|
||||
smull2 $5.4s, $1.8h, $2.8h
|
||||
shrn $1.4h, $4.4s, #16
|
||||
shrn2 $1.8h, $5.4s, #16
|
||||
|
||||
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
|
||||
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
|
||||
shl $3.8h, $3.8h, #1
|
||||
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
|
||||
// { // input: coef, ff (dst), mf
|
||||
eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
|
||||
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
|
||||
smull $4.4s, $1.4h, $2.4h
|
||||
smull2 $5.4s, $1.8h, $2.8h
|
||||
shrn $1.4h, $4.4s, #16
|
||||
shrn2 $1.8h, $5.4s, #16
|
||||
|
||||
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
|
||||
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
|
||||
shl $3.8h, $3.8h, #1
|
||||
mov.8h $6, $1
|
||||
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
|
||||
// { // input: coef, ff (dst), mf
|
||||
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
|
||||
smull $4.4s, $1.4h, $2.4h
|
||||
shrn $1.4h, $4.4s, #16
|
||||
|
||||
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
|
||||
bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
|
||||
shl $3.8h, $3.8h, #1
|
||||
sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro SELECT_MAX_IN_ABS_COEF
|
||||
// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two)
|
||||
umax $0.8h, $0.8h, $1.8h
|
||||
umaxv $4, $0.8h
|
||||
umax $2.8h, $2.8h, $3.8h
|
||||
umaxv $5, $2.8h
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro HDM_QUANT_2x2_TOTAL_16BITS
|
||||
// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
|
||||
sshr $1.2d, $0.2d, #32
|
||||
add $2.4h, $0.4h, $1.4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
|
||||
sub $1.4h, $0.4h, $1.4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
|
||||
zip1 $1.4h, $2.4h, $1.4h
|
||||
// }
|
||||
.endm
|
||||
|
||||
|
||||
.macro DC_ZERO_COUNT_IN_DUALWORD
|
||||
// { // input: coef, dst_d, working_d (all 0x01)
|
||||
cmeq $0.4h, $0.4h, #0
|
||||
and $0.8b, $0.8b, $2.8b
|
||||
addv $1, $0.4h
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro IHDM_4x4_TOTAL_16BITS
|
||||
// { // input: each src_d[0]~[3](dst), working_q0, working_q1
|
||||
uzp2 $1.4s, $0.4s, $0.4s
|
||||
uzp1 $0.4s, $0.4s, $0.4s
|
||||
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
|
||||
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
|
||||
zip1 $2.8h, $2.8h, $1.8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
|
||||
|
||||
uzp2 $1.4s, $2.4s, $2.4s
|
||||
uzp1 $0.4s, $2.4s, $2.4s
|
||||
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
|
||||
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
|
||||
rev32 $1.4h, $1.4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
|
||||
zip1 $0.4s, $2.4s, $1.4s
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
|
||||
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
|
||||
uzp1 $2.4s, $0.4s, $1.4s //[0 1 4 5]+[8 9 12 13]
|
||||
uzp2 $3.4s, $0.4s, $1.4s //[2 3 6 7]+[10 11 14 15]
|
||||
|
||||
uzp1 $0.8h, $2.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
|
||||
uzp2 $2.8h, $2.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
|
||||
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
|
||||
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
|
||||
// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
|
||||
trn1 $4.8h, v0.8h, v1.8h
|
||||
trn2 $5.8h, v0.8h, v1.8h
|
||||
trn1 $6.8h, v2.8h, v3.8h
|
||||
trn2 $7.8h, v2.8h, v3.8h
|
||||
|
||||
trn1 $0.4s, v4.4s, v6.4s
|
||||
trn2 $2.4s, v4.4s, v6.4s
|
||||
trn1 $1.4s, v5.4s, v7.4s
|
||||
trn2 $3.4s, v5.4s, v7.4s
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
|
||||
// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
|
||||
mov $0.d[1], $1.d[0] //[0 1 2 3]+[4 5 6 7]
|
||||
mov $2.d[1], $3.d[0] //[8 9 10 11]+[12 13 14 15]
|
||||
uzp1 $1.4s, $0.4s, $2.4s //[0 1 4 5]+[8 9 12 13]
|
||||
uzp2 $3.4s, $0.4s, $2.4s //[2 3 6 7]+[10 11 14 15]
|
||||
|
||||
uzp1 $0.8h, $1.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
|
||||
uzp2 $2.8h, $1.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
|
||||
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
|
||||
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro LOAD_4x4_DATA_FOR_DCT
|
||||
ld1 {$0.s}[0], [$2], $3
|
||||
ld1 {$0.s}[1], [$2], $3
|
||||
ld1 {$0.s}[2], [$2], $3
|
||||
ld1 {$0.s}[3], [$2]
|
||||
|
||||
ld1 {$1.s}[0], [$4], $5
|
||||
ld1 {$1.s}[1], [$4], $5
|
||||
ld1 {$1.s}[2], [$4], $5
|
||||
ld1 {$1.s}[3], [$4]
|
||||
.endm
|
||||
|
||||
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
|
||||
// { // input: src_d[0]~[3], working: [4]~[7]
|
||||
add $4.8h, $0.8h, $3.8h //int16 s[0] = data[i] + data[i3];
|
||||
sub $7.8h, $0.8h, $3.8h //int16 s[3] = data[i] - data[i3];
|
||||
add $5.8h, $1.8h, $2.8h //int16 s[1] = data[i1] + data[i2];
|
||||
sub $6.8h, $1.8h, $2.8h //int16 s[2] = data[i1] - data[i2];
|
||||
|
||||
add $0.8h, $4.8h, $5.8h //int16 dct[i ] = s[0] + s[1];
|
||||
sub $2.8h, $4.8h, $5.8h //int16 dct[i2] = s[0] - s[1];
|
||||
shl $1.8h, $7.8h, #1
|
||||
shl $3.8h, $6.8h, #1
|
||||
add $1.8h, $1.8h, $6.8h //int16 dct[i1] = (s[3] << 1) + s[2];
|
||||
sub $3.8h, $7.8h, $3.8h //int16 dct[i3] = s[3] - (s[2] << 1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro LOAD_8x4_DATA_FOR_DCT
|
||||
// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
|
||||
ld1 {$0.d}[0], [$8], x2
|
||||
ld1 {$1.d}[0], [$8], x2
|
||||
ld1 {$2.d}[0], [$8], x2
|
||||
ld1 {$3.d}[0], [$8], x2
|
||||
|
||||
ld1 {$4.d}[0], [$9], x4
|
||||
ld1 {$5.d}[0], [$9], x4
|
||||
ld1 {$6.d}[0], [$9], x4
|
||||
ld1 {$7.d}[0], [$9], x4
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
|
||||
// { // input: src_d[0]~[3], output: e_d[0]~[3];
|
||||
add $4.8h, $0.8h, $2.8h //int16 e[i][0] = src[0] + src[2];
|
||||
sub $5.8h, $0.8h, $2.8h //int16 e[i][1] = src[0] - src[2];
|
||||
sshr $6.8h, $1.8h, #1
|
||||
sshr $7.8h, $3.8h, #1
|
||||
sub $6.8h, $6.8h, $3.8h //int16 e[i][2] = (src[1]>>1)-src[3];
|
||||
add $7.8h, $1.8h, $7.8h //int16 e[i][3] = src[1] + (src[3]>>1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
|
||||
// { // output: f_q[0]~[3], input: e_q[0]~[3];
|
||||
add $0.8h, $4.8h, $7.8h //int16 f[i][0] = e[i][0] + e[i][3];
|
||||
add $1.8h, $5.8h, $6.8h //int16 f[i][1] = e[i][1] + e[i][2];
|
||||
sub $2.8h, $5.8h, $6.8h //int16 f[i][2] = e[i][1] - e[i][2];
|
||||
sub $3.8h, $4.8h, $7.8h //int16 f[i][3] = e[i][0] - e[i][3];
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro ROW_TRANSFORM_0_STEP
|
||||
// { // input: src_d[0]~[3], output: e_q[0]~[3];
|
||||
saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
|
||||
ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
|
||||
ssubl $6.4s, $1.4h, $3.4h //int32 e[i][2] = src[1] - src[3];
|
||||
saddl $7.4s, $1.4h, $3.4h //int32 e[i][3] = src[1] + src[3];
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro COL_TRANSFORM_0_STEP
|
||||
// { // input: src_q[0]~[3], output: e_q[0]~[3];
|
||||
add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
|
||||
sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
|
||||
sub $6.4s, $1.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
|
||||
add $7.4s, $1.4s, $3.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro TRANSFORM_4BYTES // both row & col transform used
|
||||
// { // output: f_q[0]~[3], input: e_q[0]~[3];
|
||||
add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
|
||||
add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
|
||||
sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
|
||||
sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
|
||||
// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
|
||||
uxtl $3.8h, $0.8b
|
||||
uxtl2 $4.8h, $0.16b
|
||||
add $3.8h, $3.8h, $1.8h
|
||||
add $4.8h, $4.8h, $2.8h
|
||||
sqxtun $0.8b, $3.8h
|
||||
sqxtun2 $0.16b,$4.8h
|
||||
// }
|
||||
.endm
|
||||
#else
|
||||
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
|
||||
// { // input: coef_0 (identy to \arg3\() \arg4\()), coef_1(identy to \arg5\() \arg6\()), mask_q
|
||||
cmeq \arg0\().8h, \arg0\().8h, #0
|
||||
cmeq \arg1\().8h, \arg1\().8h, #0
|
||||
uzp1 \arg0\().16b, \arg0\().16b, \arg1\().16b
|
||||
ushr \arg0\().16b, \arg0\().16b, 7
|
||||
addv \arg2\(), \arg0\().16b
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5
|
||||
// if coef <= 0, - coef; else , coef;
|
||||
// { // input: coef, ff (dst), mf
|
||||
eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init 0 , and keep 0;
|
||||
saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
|
||||
smull \arg4\().4s, \arg1\().4h, \arg2\().4h
|
||||
smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h
|
||||
shrn \arg1\().4h, \arg4\().4s, #16
|
||||
shrn2 \arg1\().8h, \arg5\().4s, #16
|
||||
|
||||
cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
|
||||
bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
|
||||
shl \arg3\().8h, \arg3\().8h, #1
|
||||
sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
|
||||
// }
|
||||
.endm
|
||||
|
||||
// Same as NEWQUANT_COEF_EACH_16BITS, but additionally copies the unsigned
// magnitudes (pre-sign result) into \arg6 so the caller can track max levels.
.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6
// if coef <= 0, - coef; else , coef;
// { // input: coef, ff (dst), mf; extra output: magnitudes in \arg6
    eor     \arg3\().16b, \arg3\().16b, \arg3\().16b    // init 0, and keep 0;
    saba    \arg1\().8h, \arg0\().8h, \arg3\().8h       // ff + abs(coef - 0)
    smull   \arg4\().4s, \arg1\().4h, \arg2\().4h
    smull2  \arg5\().4s, \arg1\().8h, \arg2\().8h
    shrn    \arg1\().4h, \arg4\().4s, #16
    shrn2   \arg1\().8h, \arg5\().4s, #16

    cmgt    \arg4\().8h, \arg0\().8h, #0                // if true, lane of coef == 0xFFFF
    bif     \arg3\().16b, \arg1\().16b, \arg4\().16b    // if (x<0) keep magnitude; else keep 0
    shl     \arg3\().8h, \arg3\().8h, #1
    // BUG FIX: this branch is assembled by GNU as, which does not accept the
    // Apple-only "mov.8h Vd, Vn" arrangement-on-mnemonic form used here
    // originally.  The standard MOV (vector) alias below copies the same full
    // 128-bit register.
    mov     \arg6\().16b, \arg1\().16b                  // save magnitudes for max search
    sub     \arg1\().8h, \arg1\().8h, \arg3\().8h       // if x > 0, -= 0; else x -= 2x
// }
.endm
|
||||
|
||||
.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
|
||||
// if coef <= 0, - coef; else , coef;
|
||||
// { // input: coef, ff (dst), mf
|
||||
saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
|
||||
smull \arg4\().4s, \arg1\().4h, \arg2\().4h
|
||||
shrn \arg1\().4h, \arg4\().4s, #16
|
||||
|
||||
cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
|
||||
bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x<0) reserved part; else keep 0 untouched
|
||||
shl \arg3\().8h, \arg3\().8h, #1
|
||||
sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, -= 0; else x-= 2x
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4, arg5
|
||||
// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two)
|
||||
umax \arg0\().8h, \arg0\().8h, \arg1\().8h
|
||||
umaxv \arg4\(), \arg0\().8h
|
||||
umax \arg2\().8h, \arg2\().8h, \arg3\().8h
|
||||
umaxv \arg5\(), \arg2\().8h
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
|
||||
// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
|
||||
sshr \arg1\().2d, \arg0\().2d, #32
|
||||
add \arg2\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
|
||||
sub \arg1\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
|
||||
zip1 \arg1\().4h, \arg2\().4h, \arg1\().4h
|
||||
// }
|
||||
.endm
|
||||
|
||||
|
||||
.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
|
||||
// { // input: coef, dst_d, working_d (all 0x01)
|
||||
cmeq \arg0\().4h, \arg0\().4h, #0
|
||||
and \arg0\().8b, \arg0\().8b, \arg2\().8b
|
||||
addv \arg1\(), \arg0\().4h
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
|
||||
// { // input: each src_d[0]~[3](dst), working_q0, working_q1
|
||||
uzp2 \arg1\().4s, \arg0\().4s, \arg0\().4s
|
||||
uzp1 \arg0\().4s, \arg0\().4s, \arg0\().4s
|
||||
add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
|
||||
sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
|
||||
zip1 \arg2\().8h, \arg2\().8h, \arg1\().8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
|
||||
|
||||
uzp2 \arg1\().4s, \arg2\().4s, \arg2\().4s
|
||||
uzp1 \arg0\().4s, \arg2\().4s, \arg2\().4s
|
||||
add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
|
||||
sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
|
||||
rev32 \arg1\().4h, \arg1\().4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
|
||||
zip1 \arg0\().4s, \arg2\().4s, \arg1\().4s
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 arg0, arg1, arg2, arg3
|
||||
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
|
||||
uzp1 \arg2\().4s, \arg0\().4s, \arg1\().4s //[0 1 4 5]+[8 9 12 13]
|
||||
uzp2 \arg3\().4s, \arg0\().4s, \arg1\().4s //[2 3 6 7]+[10 11 14 15]
|
||||
|
||||
uzp1 \arg0\().8h, \arg2\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
|
||||
uzp2 \arg2\().8h, \arg2\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
|
||||
zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
|
||||
zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
|
||||
trn1 \arg4\().8h, v0.8h, v1.8h
|
||||
trn2 \arg5\().8h, v0.8h, v1.8h
|
||||
trn1 \arg6\().8h, v2.8h, v3.8h
|
||||
trn2 \arg7\().8h, v2.8h, v3.8h
|
||||
|
||||
trn1 \arg0\().4s, v4.4s, v6.4s
|
||||
trn2 \arg2\().4s, v4.4s, v6.4s
|
||||
trn1 \arg1\().4s, v5.4s, v7.4s
|
||||
trn2 \arg3\().4s, v5.4s, v7.4s
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2 arg0, arg1, arg2, arg3
|
||||
// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
|
||||
mov \arg0\().d[1], \arg1\().d[0] //[0 1 2 3]+[4 5 6 7]
|
||||
mov \arg2\().d[1], \arg3\().d[0] //[8 9 10 11]+[12 13 14 15]
|
||||
uzp1 \arg1\().4s, \arg0\().4s, \arg2\().4s //[0 1 4 5]+[8 9 12 13]
|
||||
uzp2 \arg3\().4s, \arg0\().4s, \arg2\().4s //[2 3 6 7]+[10 11 14 15]
|
||||
|
||||
uzp1 \arg0\().8h, \arg1\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
|
||||
uzp2 \arg2\().8h, \arg1\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
|
||||
zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
|
||||
zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5
|
||||
ld1 {\arg0\().s}[0], [\arg2\()], \arg3\()
|
||||
ld1 {\arg0\().s}[1], [\arg2\()], \arg3\()
|
||||
ld1 {\arg0\().s}[2], [\arg2\()], \arg3\()
|
||||
ld1 {\arg0\().s}[3], [\arg2\()]
|
||||
|
||||
ld1 {\arg1\().s}[0], [\arg4\()], \arg5\()
|
||||
ld1 {\arg1\().s}[1], [\arg4\()], \arg5\()
|
||||
ld1 {\arg1\().s}[2], [\arg4\()], \arg5\()
|
||||
ld1 {\arg1\().s}[3], [\arg4\()]
|
||||
.endm
|
||||
|
||||
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// { // input: src_d[0]~[3], working: [4]~[7]
|
||||
add \arg4\().8h, \arg0\().8h, \arg3\().8h //int16 s[0] = data[i] + data[i3];
|
||||
sub \arg7\().8h, \arg0\().8h, \arg3\().8h //int16 s[3] = data[i] - data[i3];
|
||||
add \arg5\().8h, \arg1\().8h, \arg2\().8h //int16 s[1] = data[i1] + data[i2];
|
||||
sub \arg6\().8h, \arg1\().8h, \arg2\().8h //int16 s[2] = data[i1] - data[i2];
|
||||
|
||||
add \arg0\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i ] = s[0] + s[1];
|
||||
sub \arg2\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i2] = s[0] - s[1];
|
||||
shl \arg1\().8h, \arg7\().8h, #1
|
||||
shl \arg3\().8h, \arg6\().8h, #1
|
||||
add \arg1\().8h, \arg1\().8h, \arg6\().8h //int16 dct[i1] = (s[3] << 1) + s[2];
|
||||
sub \arg3\().8h, \arg7\().8h, \arg3\().8h //int16 dct[i3] = s[3] - (s[2] << 1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro LOAD_8x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
|
||||
// { // input: \arg0\()~\arg3\(), src1*, src2*; untouched r2:src1_stride &r4:src2_stride
|
||||
ld1 {\arg0\().d}[0], [\arg8\()], x2
|
||||
ld1 {\arg1\().d}[0], [\arg8\()], x2
|
||||
ld1 {\arg2\().d}[0], [\arg8\()], x2
|
||||
ld1 {\arg3\().d}[0], [\arg8\()], x2
|
||||
|
||||
ld1 {\arg4\().d}[0], [\arg9\()], x4
|
||||
ld1 {\arg5\().d}[0], [\arg9\()], x4
|
||||
ld1 {\arg6\().d}[0], [\arg9\()], x4
|
||||
ld1 {\arg7\().d}[0], [\arg9\()], x4
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// { // input: src_d[0]~[3], output: e_d[0]~[3];
|
||||
add \arg4\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][0] = src[0] + src[2];
|
||||
sub \arg5\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][1] = src[0] - src[2];
|
||||
sshr \arg6\().8h, \arg1\().8h, #1
|
||||
sshr \arg7\().8h, \arg3\().8h, #1
|
||||
sub \arg6\().8h, \arg6\().8h, \arg3\().8h //int16 e[i][2] = (src[1]>>1)-src[3];
|
||||
add \arg7\().8h, \arg1\().8h, \arg7\().8h //int16 e[i][3] = src[1] + (src[3]>>1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// both row & col transform used
|
||||
// { // output: f_q[0]~[3], input: e_q[0]~[3];
|
||||
add \arg0\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][0] = e[i][0] + e[i][3];
|
||||
add \arg1\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][1] = e[i][1] + e[i][2];
|
||||
sub \arg2\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][2] = e[i][1] - e[i][2];
|
||||
sub \arg3\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][3] = e[i][0] - e[i][3];
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// { // input: src_d[0]~[3], output: e_q[0]~[3];
|
||||
saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2];
|
||||
ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2];
|
||||
ssubl \arg6\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][2] = src[1] - src[3];
|
||||
saddl \arg7\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][3] = src[1] + src[3];
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// { // input: src_q[0]~[3], output: e_q[0]~[3];
|
||||
add \arg4\().4s, \arg0\().4s, \arg2\().4s //int32 e[0][j] = f[0][j] + f[2][j];
|
||||
sub \arg5\().4s, \arg0\().4s, \arg2\().4s //int32 e[1][j] = f[0][j] - f[2][j];
|
||||
sub \arg6\().4s, \arg1\().4s, \arg3\().4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
|
||||
add \arg7\().4s, \arg1\().4s, \arg3\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
|
||||
// both row & col transform used
|
||||
// { // output: f_q[0]~[3], input: e_q[0]~[3];
|
||||
add \arg0\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][0] = e[i][0] + e[i][3];
|
||||
add \arg1\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][1] = e[i][1] + e[i][2];
|
||||
sub \arg2\().4s, \arg5\().4s, \arg6\().4s //int16 f[i][2] = e[i][1] - e[i][2];
|
||||
sub \arg3\().4s, \arg4\().4s, \arg7\().4s //int16 f[i][3] = e[i][0] - e[i][3];
|
||||
// }
|
||||
.endm
|
||||
|
||||
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4
|
||||
// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
|
||||
uxtl \arg3\().8h, \arg0\().8b
|
||||
uxtl2 \arg4\().8h, \arg0\().16b
|
||||
add \arg3\().8h, \arg3\().8h, \arg1\().8h
|
||||
add \arg4\().8h, \arg4\().8h, \arg2\().8h
|
||||
sqxtun \arg0\().8b, \arg3\().8h
|
||||
sqxtun2 \arg0\().16b,\arg4\().8h
|
||||
// }
|
||||
.endm
|
||||
#endif
|
||||
|
||||
// Count the non-zero values among 16 int16 coefficients at [x0].
// Returns the count in x0 (16 - number of zero lanes).
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    ZERO_COUNT_IN_2_QUARWORD v0, v1, b0 // b0 = zero-lane count (addv zero-extends into v0)
    mov     x0, v0.d[0]
    mov     x1, #16
    subs    x0, x1, x0                  // non-zero count = 16 - zero count
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// Quantize one 4x4 block of int16 coefficients in place.
// x0: coeff block (16 x int16, updated), x1: ff (rounding offsets), x2: mf.
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v0.8h, v1.8h}, [x0]
    ld1     {v3.8h}, [x2]
    // Keep a second copy of ff: NEWQUANT_COEF_EACH_16BITS consumes its ff input.
    // PORTABILITY FIX: was `mov.8h v4, v2`, an Apple-only arrangement-on-
    // mnemonic spelling that GNU as rejects; this function is shared by both
    // assembler branches.  The standard MOV (vector) alias copies the same
    // 128 bits.
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
    st1     {v2.8h}, [x0], #16
    NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
||||
// Quantize one 4x4 block with a single scalar ff/mf broadcast to all lanes.
// x0: coeff block (updated), w1: ff scalar, w2: mf scalar.
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4Dc_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    dup     v2.8h, w1                   // even ff range [0, 768]
    dup     v3.8h, w2
    // PORTABILITY FIX: was Apple-only `mov.8h v4, v2`; standard alias below
    // performs the same full-register copy and assembles with GNU as too.
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
    st1     {v2.8h}, [x0], #16
    NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// Quantize four consecutive 4x4 blocks (64 coefficients) in place.
// x0: coeff base (read via x0, written back via x1), x1: ff, x2: mf.
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v3.8h}, [x2]
    mov     x1, x0                      // separate write pointer

.rept 4
    ld1     {v0.8h, v1.8h}, [x0], #32
    // PORTABILITY FIX (both copies below): was Apple-only `mov.8h v4, v2`;
    // the standard MOV (vector) alias copies the same 128 bits and is
    // accepted by GNU as, which assembles this shared function too.
    mov     v4.16b, v2.16b              // refresh ff (consumed by the macro)
    NEWQUANT_COEF_EACH_16BITS v0, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x1], #16
.endr
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
||||
// Quantize four consecutive 4x4 blocks in place and record, per 4x4 block,
// the maximum absolute quantized level.
// x0: coeff base, x1: ff, x2: mf, x3: int16[4] output for the four maxima.
// PORTABILITY FIX throughout: every `mov.8h v4, v2` (Apple-only syntax in a
// function assembled by GNU as as well) is replaced by the standard
// `mov v4.16b, v2.16b` full-register copy.
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4Max_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v3.8h}, [x2]
    mov     x1, x0                      // separate write pointer

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
    st1     {v4.8h}, [x1], #16          // then 1st 16 elem in v16 & v17

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
    st1     {v4.8h}, [x1], #16          // then 2nd 16 elem in v18 & v19

    SELECT_MAX_IN_ABS_COEF v16, v17, v18, v19, h20, h21

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
    st1     {v4.8h}, [x1], #16          // then 3rd 16 elem in v16 & v17

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
    st1     {v4.8h}, [x1], #16          // then 4th 16 elem in v18 & v19

    SELECT_MAX_IN_ABS_COEF v16, v17, v18, v19, h22, h23

    st4     {v20.h, v21.h, v22.h, v23.h}[0], [x3]   // four per-block maxima
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
||||
// Dequantize one 4x4 block in place: coeff[i] *= mf[i].
// x0: coeff block (16 x int16, updated), x1: mf table.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequant4x4_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    ld1     {v2.8h}, [x1]
    mul     v3.8h, v0.8h, v2.8h
    mul     v4.8h, v1.8h, v2.8h
    st1     {v3.8h, v4.8h}, [x0]
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// Dequantize four consecutive 4x4 blocks in place with one shared mf table.
// x0: coeff base (read via x0, written back via x1), x1: mf table.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantFour4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    mov     x1, x0                      // separate write pointer
.rept 4
    ld1     {v0.8h, v1.8h}, [x0], #32
    mul     v3.8h, v0.8h, v2.8h
    mul     v4.8h, v1.8h, v2.8h
    st1     {v3.8h, v4.8h}, [x1], #32
.endr
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// 2x2 Hadamard over the four chroma DC terms (strided 32 bytes apart at x0),
// returning non-zero in w0 iff any abs(hadamard result) exceeds the
// threshold in w1.
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_AArch64_neon
    dup     v4.8h, w1                   // broadcast threshold
    mov     x1, #32
    ld1     {v0.h}[0], [x0], x1         //rs[0]
    ld1     {v0.h}[1], [x0], x1         //rs[16]
    ld1     {v0.h}[2], [x0], x1         //rs[32]
    ld1     {v0.h}[3], [x0], x1         //rs[48]

    HDM_QUANT_2x2_TOTAL_16BITS v0, v1, v2   // output v1
    HDM_QUANT_2x2_TOTAL_16BITS v1, v0, v2   // output v0

    abs     v1.4h, v0.4h
    cmhi    v1.4h, v1.4h, v4.4h         // per-lane mask: abs(dct[i]) > threshold
    // BUG FIX: the original read v0.s[0]/v0.s[1] (the raw Hadamard values),
    // discarding the cmhi mask just computed — the return then meant
    // "any dct != 0" instead of "any abs(dct) > threshold".  Read the
    // comparison mask in v1.
    mov     w0, v1.s[0]
    mov     w1, v1.s[1]
    orr     w0, w0, w1                  // non-zero iff any lane exceeded threshold
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
||||
// 2x2 Hadamard + quantization of the four chroma DC terms.
// x0: DC terms strided 32 bytes apart (zeroed in place), w1: ff, w2: mf,
// x3: dct output (4 x int16), x4: block output (4 x int16).
// Returns a count derived from the number of zero quantized levels in x0.
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2_AArch64_neon

    dup     v1.8h, w1                   //ff
    dup     v2.8h, w2                   //mf
    eor     v3.16b, v3.16b, v3.16b      // zero vector, also quant sign accumulator

    mov     x1, #32
    mov     x2, x0
    ld1     {v0.h}[0], [x0], x1         //rs[0]
    st1     {v3.h}[0], [x2], x1         //rs[00]=0
    ld1     {v0.h}[1], [x0], x1         //rs[16]
    st1     {v3.h}[1], [x2], x1         //rs[16]=0
    ld1     {v0.h}[2], [x0], x1         //rs[32]
    st1     {v3.h}[2], [x2], x1         //rs[32]=0
    ld1     {v0.h}[3], [x0], x1         //rs[48]
    st1     {v3.h}[3], [x2], x1         //rs[48]=0


    HDM_QUANT_2x2_TOTAL_16BITS v0, v4, v5   // output v4

    HDM_QUANT_2x2_TOTAL_16BITS v4, v0, v5   // output v0

    QUANT_DUALWORD_COEF_EACH_16BITS v0, v1, v2, v3, v4

    st1     {v1.d}[0], [x3]             // store to dct
    st1     {v1.d}[0], [x4]             // store to block

    movi    v3.8h, #1, lsl #0           // per-lane 0x0001 mask for zero counting

    // NOTE(review): this movi appears dead — ADDV inside the macro below
    // writes scalar h0 and zero-extends the rest of v0, overwriting it.
    movi    v0.16b, #255

    DC_ZERO_COUNT_IN_DUALWORD v1, h0, v3    // h0 = zero count among 4 lanes

    mov     x0, v0.d[0]
    mov     x1, #16
    // NOTE(review): returns 16 - zero_count although only 4 lanes were
    // counted (range [12,16]) — verify intended semantics against the C
    // reference implementation before relying on the exact value.
    subs    x0, x1, x0
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
||||
|
||||
// Inverse 4x4 Hadamard of the luma DC block followed by dequantization.
// x0: 16 x int16 in/out, w1: scalar mf multiplier.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantIHadamard4x4_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    dup     v4.8h, w1                   // broadcast mf

    // horizontal pass
    IHDM_4x4_TOTAL_16BITS v0, v2, v3
    IHDM_4x4_TOTAL_16BITS v1, v2, v3

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 v0, v1, v2, v3

    // vertical pass, fused with the mf scaling
    IHDM_4x4_TOTAL_16BITS v0, v2, v3
    mul     v0.8h, v0.8h, v4.8h

    IHDM_4x4_TOTAL_16BITS v1, v2, v3
    mul     v1.8h, v1.8h, v4.8h

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 v0, v1, v2, v3
    st1     {v0.16b, v1.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// Forward 4x4 DCT of the residual between two 4x4 pixel blocks.
// x0: int16 dct output, x1/x2: pixel1 ptr & stride, x3/x4: pixel2 ptr & stride.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctT4_AArch64_neon
    LOAD_4x4_DATA_FOR_DCT v0, v1, x1, x2, x3, x4
    usubl   v2.8h, v0.8b, v1.8b         // residual = pix1 - pix2 (widened)
    usubl2  v4.8h, v0.16b, v1.16b
    uzp1    v3.8h, v2.8h, v4.8h
    uzp2    v5.8h, v2.8h, v4.8h
    uzp2    v2.8h, v3.8h, v5.8h         // s[2, 6, 10, 14] [3, 7, 11, 15]
    uzp1    v0.8h, v3.8h, v5.8h         // s[0, 4, 8, 12] [1, 5, 9, 13]
    mov     v3.d[0], v2.d[1]            // s[3, 7, 11, 15]
    mov     v1.d[0], v0.d[1]            // s[1, 5, 9, 13]

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
    // transpose
    MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7
    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

    st4     {v0.d, v1.d, v2.d, v3.d}[0], [x0]
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
|
||||
// void WelsDctFourT4_AArch64_neon (int16_t* pDct /*x0*/, uint8_t* pPixel1 /*x1*/, int32_t iStride1 /*x2*/,
//                                  uint8_t* pPixel2 /*x3*/, int32_t iStride2 /*x4*/);
// Four 4x4 forward transforms over an 8x8 region; each .rept iteration processes an 8x4 strip
// (two 4x4 blocks side by side) and emits 64 bytes of coefficients.
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctFourT4_AArch64_neon
.rept 2
LOAD_8x4_DATA_FOR_DCT v0, v1, v2, v3, v4, v5, v6, v7, x1, x3   // macro: load 8x4 pixels from both sources
usubl v0.8h, v0.8b, v4.8b      // residual = pixel1 - pixel2, widened to 16 bits
usubl v1.8h, v1.8b, v5.8b
usubl v2.8h, v2.8b, v6.8b
usubl v3.8h, v3.8b, v7.8b

MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7

// horizontal transform
DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

// transform element
MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7

// vertical transform
DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

uzp1 v4.2d, v0.2d, v1.2d       // regroup 64-bit row halves into the two 4x4 output blocks
uzp2 v6.2d, v0.2d, v1.2d
uzp1 v5.2d, v2.2d, v3.2d
uzp2 v7.2d, v2.2d, v3.2d
st1 {v4.16b, v5.16b}, [x0], #32
st1 {v6.16b, v7.16b}, [x0], #32
.endr
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec /*x0*/, int32_t iStride /*x1*/,
//                                  uint8_t* pPrediction /*x2*/, int32_t iPredStride /*x3*/,
//                                  int16_t* pDct /*x4*/);
// 4x4 inverse transform of pDct, added to the prediction and clamped to [0,255],
// written to pRec row by row.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctT4Rec_AArch64_neon
ld1 {v16.s}[0], [x2], x3       // gather the 4 prediction rows (4 bytes each) into v16
ld1 {v16.s}[1], [x2], x3
ld1 {v16.s}[2], [x2], x3
ld1 {v16.s}[3], [x2], x3 // Pred
ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x4] // dct coeff

ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7   // macros defined elsewhere

TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7   // transpose between the two passes

ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
ins v0.d[1], v1.d[0]           // pack rows 0-1 / 2-3 into full registers
ins v2.d[1], v3.d[0]
srshr v0.8h, v0.8h, #6         // rounding shift by 6 (divide by 64)
srshr v2.8h, v2.8h, #6
//after rounding 6, clip into [0, 255]
uxtl v1.8h, v16.8b             // widen prediction rows 0-1
add v0.8h, v0.8h, v1.8h        // residual + prediction
sqxtun v1.8b, v0.8h            // saturating unsigned narrow clamps to [0, 255]
st1 {v1.s}[0],[x0],x1
st1 {v1.s}[1],[x0],x1

uxtl2 v1.8h, v16.16b           // prediction rows 2-3
add v2.8h, v2.8h, v1.8h
sqxtun v1.8b, v2.8h
st1 {v1.s}[0],[x0],x1
st1 {v1.s}[1],[x0],x1
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec /*x0*/, int32_t iStride /*x1*/,
//                                      uint8_t* pPrediction /*x2*/, int32_t iPredStride /*x3*/,
//                                      int16_t* pDct /*x4*/);
// Four 4x4 inverse transforms over an 8x8 region; each .rept iteration reconstructs
// an 8x4 strip: inverse transform + prediction, clamped to [0, 255].
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctFourT4Rec_AArch64_neon
.rept 2
ld1 {v16.d}[0], [x2], x3       // four 8-byte prediction rows into v16/v17
ld1 {v16.d}[1], [x2], x3
ld1 {v17.d}[0], [x2], x3
ld1 {v17.d}[1], [x2], x3 // Pred
ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x4], #64 // dct coeff

ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7   // macros defined elsewhere

TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7   // transpose between passes

ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7

TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
srshr v0.8h, v0.8h, #6         // rounding shift by 6 (divide by 64)
srshr v1.8h, v1.8h, #6
srshr v2.8h, v2.8h, #6
srshr v3.8h, v3.8h, #6

//after rounding 6, clip into [0, 255]
uxtl v4.8h, v16.8b             // row 0: widen prediction, add residual, saturate-narrow
add v0.8h, v0.8h, v4.8h
sqxtun v0.8b, v0.8h
st1 {v0.d}[0],[x0],x1

uxtl2 v5.8h, v16.16b           // row 1
add v1.8h, v1.8h, v5.8h
sqxtun v1.8b, v1.8h
st1 {v1.d}[0],[x0],x1

uxtl v6.8h, v17.8b             // row 2
add v2.8h, v2.8h, v6.8h
sqxtun v2.8b, v2.8h
st1 {v2.d}[0],[x0],x1

uxtl2 v7.8h, v17.16b           // row 3
add v3.8h, v3.8h, v7.8h
sqxtun v3.8b, v3.8h
st1 {v3.d}[0],[x0],x1
.endr
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// void WelsHadamardT4Dc_AArch64_neon (int16_t* pLumaDc /*x0*/, int16_t* pDct /*x1*/);
// 4x4 Hadamard transform of the 16 luma DC coefficients. The DC terms sit 32 bytes
// apart in pDct (one per 4x4 sub-block), hence the strided single-lane loads.
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardT4Dc_AArch64_neon

mov x2, #32                    // stride between consecutive DC terms (16 int16 = 32 bytes)
ld1 {v0.h}[0], [x1], x2
ld1 {v1.h}[0], [x1], x2
ld1 {v0.h}[1], [x1], x2
ld1 {v1.h}[1], [x1], x2

ld1 {v2.h}[0], [x1], x2
ld1 {v3.h}[0], [x1], x2
ld1 {v2.h}[1], [x1], x2
ld1 {v3.h}[1], [x1], x2

ld1 {v0.h}[2], [x1], x2
ld1 {v1.h}[2], [x1], x2
ld1 {v0.h}[3], [x1], x2
ld1 {v1.h}[3], [x1], x2

ld1 {v2.h}[2], [x1], x2
ld1 {v3.h}[2], [x1], x2
ld1 {v2.h}[3], [x1], x2
ld1 {v3.h}[3], [x1], x2 // v0[0 4 08 12],v1[1 5 09 13],v2[2 6 10 14],v3[3 7 11 15]

// row pass in 32-bit precision (macros defined elsewhere)
ROW_TRANSFORM_0_STEP v0, v1, v3, v2, v4, v7, v6, v5
TRANSFORM_4BYTES v0, v1, v3, v2, v4, v7, v6, v5

// transform element 32bits
uzp1 v4.4s, v0.4s, v1.4s // 0 2 4 6
uzp2 v5.4s, v0.4s, v1.4s // 1 3 5 7
uzp1 v6.4s, v2.4s, v3.4s // 8 10 12 14
uzp2 v7.4s, v2.4s, v3.4s // 9 11 13 15

uzp1 v0.4s, v4.4s, v6.4s // 0 4 8 12
uzp2 v2.4s, v4.4s, v6.4s // 2 6 10 14
uzp1 v1.4s, v5.4s, v7.4s // 1 5 9 13
uzp2 v3.4s, v5.4s, v7.4s // 3 7 11 15

// column pass
COL_TRANSFORM_0_STEP v0, v1, v3, v2, v4, v7, v6, v5
TRANSFORM_4BYTES v0, v1, v3, v2, v4, v7, v6, v5
sqrshrn v4.4h, v0.4s, #1       // round, halve, and saturate back to 16 bits
sqrshrn2 v4.8h, v1.4s, #1
sqrshrn v5.4h, v2.4s, #1
sqrshrn2 v5.8h, v3.4s, #1
st1 {v4.16b, v5.16b}, [x0] //store
WELS_ASM_AARCH64_FUNC_END
|
||||
|
||||
// void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec /*x0*/, int32_t iStride /*x1*/,
//                                        uint8_t* pPrediction /*x2*/, int32_t iPredStride /*x3*/,
//                                        int16_t* pDctDc /*x4*/);
// DC-only reconstruction of a 16x16 luma macroblock: each of the 16 DC values is
// rounded (>>6), broadcast over its 4x4 sub-block, added to the prediction and
// clamped to [0, 255]. Processes four 16x4 strips, one per 4x4 DC row.
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctRecI16x16Dc_AArch64_neon
ld1 {v16.16b,v17.16b}, [x4]    // 16 DC coefficients (byte load, lanes reinterpreted as .8h below)
srshr v16.8h, v16.8h, #6       // rounding shift by 6
srshr v17.8h, v17.8h, #6

dup v0.8h, v16.h[0]            // broadcast DC 0/1 across their 4x4 columns
dup v1.8h, v16.h[1]
ins v0.d[1], v1.d[0]
dup v1.8h, v16.h[2]            // DC 2/3
dup v2.8h, v16.h[3]
ins v1.d[1], v2.d[0]

.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5   // macro: pred + DC, clipped to [0,255]
st1 {v3.16b}, [x0], x1
.endr

dup v0.8h, v16.h[4]            // second DC row (DCs 4-7)
dup v1.8h, v16.h[5]
ins v0.d[1], v1.d[0]
dup v1.8h, v16.h[6]
dup v2.8h, v16.h[7]
ins v1.d[1], v2.d[0]

.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
st1 {v3.16b}, [x0], x1
.endr

dup v0.8h, v17.h[0]            // third DC row (DCs 8-11)
dup v1.8h, v17.h[1]
ins v0.d[1], v1.d[0]
dup v1.8h, v17.h[2]
dup v2.8h, v17.h[3]
ins v1.d[1], v2.d[0]

.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
st1 {v3.16b}, [x0], x1
.endr

dup v0.8h, v17.h[4]            // fourth DC row (DCs 12-15)
dup v1.8h, v17.h[5]
ins v0.d[1], v1.d[0]
dup v1.8h, v17.h[6]
dup v2.8h, v17.h[7]
ins v1.d[1], v2.d[0]

.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
st1 {v3.16b}, [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END
|
||||
#endif
|
@ -81,6 +81,17 @@ void WelsIDctRecI16x16Dc_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPredict
|
||||
int16_t* pDctDc);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
void WelsDequantFour4x4_AArch64_neon (int16_t* pDct, const uint16_t* kpMF);
|
||||
void WelsDequant4x4_AArch64_neon (int16_t* pDct, const uint16_t* kpMF);
|
||||
void WelsDequantIHadamard4x4_AArch64_neon (int16_t* pRes, const uint16_t kuiMF);
|
||||
|
||||
void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
|
||||
void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
|
||||
void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
|
||||
int16_t* pDctDc);
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
@ -122,6 +122,22 @@ void WelsQuantFour4x4_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pM
|
||||
void WelsQuantFour4x4Max_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
void WelsHadamardT4Dc_AArch64_neon (int16_t* pLumaDc, int16_t* pDct);
|
||||
int32_t WelsHadamardQuant2x2_AArch64_neon (int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);
|
||||
int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF, int16_t iMF);
|
||||
int32_t WelsHadamardQuant2x2SkipKernel_AArch64_neon (int16_t* pRes, int16_t iThreshold); // avoid divide operator
|
||||
|
||||
void WelsDctT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
|
||||
void WelsDctFourT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
|
||||
|
||||
int32_t WelsGetNoneZeroCount_AArch64_neon (int16_t* pLevel);
|
||||
|
||||
void WelsQuant4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||
void WelsQuant4x4Dc_AArch64_neon (int16_t* pDct, int16_t iFF, int16_t iMF);
|
||||
void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
|
||||
void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
|
||||
#endif
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif//__cplusplus
|
||||
|
@ -282,5 +282,17 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl
|
||||
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->pfDequantization4x4 = WelsDequant4x4_AArch64_neon;
|
||||
pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_AArch64_neon;
|
||||
pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_AArch64_neon;
|
||||
|
||||
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_AArch64_neon;
|
||||
pFuncList->pfIDctT4 = WelsIDctT4Rec_AArch64_neon;
|
||||
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_AArch64_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -455,7 +455,12 @@ int32_t WelsHadamardQuant2x2Skip_neon (int16_t* pRes, int16_t iFF, int16_t iMF)
|
||||
return WelsHadamardQuant2x2SkipKernel_neon (pRes, iThreshold);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_NEON_AARCH64
|
||||
int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF, int16_t iMF) {
|
||||
int16_t iThreshold = ((1 << 16) - 1) / iMF - iFF;
|
||||
return WelsHadamardQuant2x2SkipKernel_AArch64_neon (pRes, iThreshold);
|
||||
}
|
||||
#endif
|
||||
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c;
|
||||
pFuncList->pfCopy16x16Aligned =
|
||||
@ -542,5 +547,28 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
|
||||
pFuncList->pfDctFourT4 = WelsDctFourT4_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_NEON_AARCH64)
|
||||
if (uiCpuFlag & WELS_CPU_NEON) {
|
||||
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_AArch64_neon;
|
||||
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_AArch64_neon;
|
||||
pFuncList->pfDctT4 = WelsDctT4_AArch64_neon;
|
||||
//pFuncList->pfCopy8x8Aligned = WelsCopy8x8_AArch64_neon; // will enable in next update
|
||||
//pFuncList->pfCopy8x16Aligned = WelsCopy8x16_AArch64_neon; // will enable in next update
|
||||
|
||||
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_AArch64_neon;
|
||||
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_AArch64_neon;
|
||||
|
||||
pFuncList->pfQuantization4x4 = WelsQuant4x4_AArch64_neon;
|
||||
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_AArch64_neon;
|
||||
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_AArch64_neon;
|
||||
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_AArch64_neon;
|
||||
|
||||
//pFuncList->pfCopy16x16Aligned = WelsCopy16x16_AArch64_neon; // will enable in next update
|
||||
//pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_AArch64_neon; // will enable in next update
|
||||
//pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_AArch64_neon; // will enable in next update
|
||||
pFuncList->pfDctFourT4 = WelsDctFourT4_AArch64_neon;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -62,6 +62,7 @@ ENCODER_ASM_ARM64_SRCS=\
|
||||
$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
|
||||
$(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S\
|
||||
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
|
||||
$(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\
|
||||
|
||||
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
|
||||
endif
|
||||
|
492
test/encoder/EncUT_Reconstruct.cpp
Normal file
492
test/encoder/EncUT_Reconstruct.cpp
Normal file
@ -0,0 +1,492 @@
|
||||
#include<gtest/gtest.h>
|
||||
#include<math.h>
|
||||
#include<stdlib.h>
|
||||
#include<time.h>
|
||||
|
||||
#include "cpu_core.h"
|
||||
#include "cpu.h"
|
||||
#include "macros.h"
|
||||
#include "encode_mb_aux.h"
|
||||
#include "decode_mb_aux.h"
|
||||
#include "wels_func_ptr_def.h"
|
||||
|
||||
using namespace WelsSVCEnc;
|
||||
#define RECONTEST_NUM 1000
|
||||
// Fill Len bytes at p with pseudo-random values in [0, 255].
static void FillWithRandomData (uint8_t* p, int32_t Len) {
  int32_t i = 0;
  while (i < Len) {
    p[i] = (uint8_t) (rand() & 0xFF);
    ++i;
  }
}
|
||||
|
||||
// Cross-check the CPU-dispatched pfIDctI16x16Dc against the C reference over random data.
TEST (ReconstructionFunTest, WelsIDctRecI16x16Dc) {
  ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 16, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 16, 16)
  ENFORCE_STACK_ALIGN_1D (int16_t, pDct, 16, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    FillWithRandomData (pPred, 32 * 16);
    FillWithRandomData ((uint8_t*)pDct, 16 * 2);
    // Keep DC coefficients in a valid dynamic range for the transform.
    for (int32_t i = 0 ; i < 16; i++) {
      pDct[i] = WELS_CLIP3 (pDct[i], -4080, 4080);
    }
    WelsIDctRecI16x16Dc_c (pRec[0], 16, pPred, 32, pDct);
    sFuncPtrList.pfIDctI16x16Dc (pRec[1], 16, pPred, 32, pDct);

    // Compare the full 16x16 reconstructed macroblock element by element.
    for (int32_t j = 0 ; j < 16; j++) {
      for (int32_t i = 0 ; i < 16; i++) {
        ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
      }
    }
  }

}
|
||||
|
||||
// Verify the dispatched non-zero-coefficient counter matches the C reference.
TEST (ReconstructionFunTest, WelsGetNoneZeroCount) {
  ENFORCE_STACK_ALIGN_1D (int16_t, pInput, 64, 16)
  int32_t iZeroCount[2];
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    FillWithRandomData ((uint8_t*)pInput, 128);   // 64 int16 coefficients = 128 bytes
    iZeroCount[0] = WelsGetNoneZeroCount_c (pInput);
    iZeroCount[1] = sFuncPtrList.pfGetNoneZeroCount (pInput);
    ASSERT_EQ (iZeroCount[0], iZeroCount[1]);
  }

}
|
||||
|
||||
// Verify the dispatched 4x4 Hadamard DC transform matches the C reference.
TEST (ReconstructionFunTest, WelsHadamardT4Dc) {
  ENFORCE_STACK_ALIGN_1D (int16_t, pDct, 16 * 16, 16)
  ENFORCE_STACK_ALIGN_2D (int16_t, pLumaDc, 2, 16, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    FillWithRandomData ((uint8_t*)pDct, 16 * 16 * 2);
    // Keep every DCT coefficient in a valid dynamic range.
    for (int32_t j = 0 ; j < 16; j++) {
      for (int32_t i = 0 ; i < 16; i++) {
        pDct[i + j * 16] = WELS_CLIP3 (pDct[i + j * 16], -4080, 4080);
      }
    }
    WelsHadamardT4Dc_c (pLumaDc[0], pDct);
    sFuncPtrList.pfTransformHadamard4x4Dc (pLumaDc[1], pDct);
    for (int32_t i = 0 ; i < 16; i++) {
      ASSERT_EQ (pLumaDc[0][i], pLumaDc[1][i]);
    }
  }

}
|
||||
|
||||
// Verify the dispatched 4x4 forward DCT matches the C reference on random pixel
// data, then on the two extreme residual cases (all +255 and all -255).
TEST (ReconstructionFunTest, WelsDctT4) {
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 4, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 4, 16)
  ENFORCE_STACK_ALIGN_2D (int16_t, pOut, 2, 16, 16)

  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    FillWithRandomData (pInput1, 16 * 4);
    FillWithRandomData (pInput2, 32 * 4);
    WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
    sFuncPtrList.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
    for (int32_t i = 0 ; i < 16; i++) {
      ASSERT_EQ (pOut[0][i], pOut[1][i]);
    }
  }

  // Extreme case: maximum positive residual on every pixel.
  memset (pInput1, 255, 16 * 4);
  memset (pInput2, 0, 32 * 4);
  WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
  sFuncPtrList.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
  for (int32_t i = 0 ; i < 16; i++) {
    ASSERT_EQ (pOut[0][i], pOut[1][i]);
  }

  // Extreme case: maximum negative residual on every pixel.
  memset (pInput1, 0, 16 * 4);
  memset (pInput2, 255, 32 * 4);
  WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
  sFuncPtrList.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
  for (int32_t i = 0 ; i < 16; i++) {
    ASSERT_EQ (pOut[0][i], pOut[1][i]);
  }
}
|
||||
|
||||
// Verify the dispatched four-block (8x8) forward DCT matches the C reference.
TEST (ReconstructionFunTest, WelsDctFourT4) {
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 8, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 8, 16)
  ENFORCE_STACK_ALIGN_2D (int16_t, pOut, 2, 64, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    FillWithRandomData (pInput1, 16 * 8);
    FillWithRandomData (pInput2, 32 * 8);
    WelsDctFourT4_c (pOut[0], pInput1, 16, pInput2, 32);
    sFuncPtrList.pfDctFourT4 (pOut[1], pInput1, 16, pInput2, 32);
    // All 64 coefficients (four 4x4 blocks) must agree exactly.
    for (int32_t i = 0 ; i < 64; i++) {
      ASSERT_EQ (pOut[0][i], pOut[1][i]);
    }
  }

}
|
||||
|
||||
// Round-trip check: forward DCT (C vs dispatched) followed by inverse transform +
// reconstruction, comparing the reconstructed 4x4 block; random data plus two
// extreme residual cases.
TEST (ReconstructionFunTest, WelsIDctT4Rec) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 16, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 4, 16)
  ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 4, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 4, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 4, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    FillWithRandomData (pPred, 32 * 4);
    FillWithRandomData (pInput1, 16 * 4);
    FillWithRandomData (pInput2, 32 * 4);
    WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
    sFuncPtrList.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
    WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
    sFuncPtrList.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);

    // Only the top-left 4x4 region of each 16-wide row buffer is written.
    for (int32_t j = 0 ; j < 4; j++) {
      for (int32_t i = 0 ; i < 4; i++) {
        ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
      }
    }
  }

  // Extreme case: maximum positive residual.
  memset (pPred, 255, 32 * 4);
  memset (pInput1, 255, 16 * 4);
  memset (pInput2, 0, 32 * 4);
  WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
  sFuncPtrList.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
  WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
  sFuncPtrList.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);

  for (int32_t j = 0 ; j < 4; j++) {
    for (int32_t i = 0 ; i < 4; i++) {
      ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
    }
  }

  // Extreme case: maximum negative residual.
  memset (pPred, 255, 32 * 4);
  memset (pInput1, 0, 16 * 4);
  memset (pInput2, 255, 32 * 4);
  WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
  sFuncPtrList.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
  WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
  sFuncPtrList.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);

  for (int32_t j = 0 ; j < 4; j++) {
    for (int32_t i = 0 ; i < 4; i++) {
      ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
    }
  }

}
|
||||
|
||||
|
||||
// Round-trip check for the four-block (8x8) path: forward DCT then inverse
// transform + reconstruction, C reference vs dispatched implementation.
TEST (ReconstructionFunTest, WelsIDctFourT4Rec) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 64, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 8, 16)
  ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 8, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 8, 16)
  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 8, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    FillWithRandomData (pInput1, 16 * 8);
    FillWithRandomData (pInput2, 32 * 8);
    FillWithRandomData (pPred, 32 * 8);
    WelsDctFourT4_c (pDct[0], pInput1, 16, pInput2, 32);
    sFuncPtrList.pfDctFourT4 (pDct[1], pInput1, 16, pInput2, 32);
    WelsIDctFourT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
    sFuncPtrList.pfIDctFourT4 (pRec[1], 16, pPred, 32, pDct[1]);
    // Only the top-left 8x8 region of each 16-wide row buffer is written.
    for (int32_t j = 0 ; j < 8; j++) {
      for (int32_t i = 0 ; i < 8; i++) {
        ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
      }
    }
  }

}
|
||||
|
||||
|
||||
// Quantize then dequantize a 4x4 block with both implementations and compare the
// results; quant tables are indexed by a random QP in [0, 51].
TEST (ReconstructionFunTest, WelsDequant4x4) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 32);
    for (int32_t i = 0 ; i < 16; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    WelsQuant4x4_c (pInput[0], pFF, pMF);
    sFuncPtrList.pfQuantization4x4 (pInput[1], pFF, pMF);

    WelsDequant4x4_c (pInput[0], g_kuiDequantCoeff[uiQp]);
    sFuncPtrList.pfDequantization4x4 (pInput[1], g_kuiDequantCoeff[uiQp]);
    for (int32_t i = 0 ; i < 16; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
    }
  }

}
|
||||
|
||||
// Quantize then run the inverse-Hadamard dequant with both implementations and
// compare; quant tables are indexed by a random QP in [0, 51].
TEST (ReconstructionFunTest, WelsDequantIHadamard4x4) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 32);
    for (int32_t i = 0 ; i < 16; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    WelsQuant4x4_c (pInput[0], pFF, pMF);
    sFuncPtrList.pfQuantization4x4 (pInput[1], pFF, pMF);

    // The Hadamard dequant takes a single scalar multiplier (first table entry).
    WelsDequantIHadamard4x4_c (pInput[0], g_kuiDequantCoeff[uiQp][0]);
    sFuncPtrList.pfDequantizationIHadamard4x4 (pInput[1], g_kuiDequantCoeff[uiQp][0]);
    for (int32_t i = 0 ; i < 16; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
    }
  }

}
|
||||
|
||||
// Verify the dispatched 4x4 quantizer matches the C reference for random data
// and a random QP in [0, 51].
TEST (ReconstructionFunTest, WelsQuant4x4) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 32);
    for (int32_t i = 0 ; i < 16; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    WelsQuant4x4_c (pInput[0], pFF, pMF);
    sFuncPtrList.pfQuantization4x4 (pInput[1], pFF, pMF);
    for (int32_t i = 0 ; i < 16; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
    }
  }

}
|
||||
|
||||
|
||||
// Verify the dispatched DC quantizer (single FF/MF scalar) matches the C reference.
TEST (ReconstructionFunTest, WelsQuant4x4Dc) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 32);
    for (int32_t i = 0 ; i < 16; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    WelsQuant4x4Dc_c (pInput[0], pFF[0], pMF[0]);
    sFuncPtrList.pfQuantizationDc4x4 (pInput[1], pFF[0], pMF[0]);
    for (int32_t i = 0 ; i < 16; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
    }
  }

}
|
||||
|
||||
// Verify the dispatched four-block quantizer (64 coefficients) matches the C reference.
TEST (ReconstructionFunTest, WelsQuantFour4x4) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 128);
    for (int32_t i = 0 ; i < 64; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    WelsQuantFour4x4_c (pInput[0], pFF, pMF);
    sFuncPtrList.pfQuantizationFour4x4 (pInput[1], pFF, pMF);
    for (int32_t i = 0 ; i < 64; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
    }
  }

}
|
||||
|
||||
// Verify the four-block quantizer variant that also reports the per-block maximum:
// both the 64 quantized coefficients and the 4 per-block maxima must agree.
TEST (ReconstructionFunTest, WelsQuantFour4x4Max) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  int16_t pMax[2][4];
  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 128);
    for (int32_t i = 0 ; i < 64; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    WelsQuantFour4x4Max_c (pInput[0], pFF, pMF, pMax[0]);
    sFuncPtrList.pfQuantizationFour4x4Max (pInput[1], pFF, pMF, pMax[1]);
    for (int32_t i = 0 ; i < 64; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
      // i >> 4 selects the 4x4 block the coefficient belongs to.
      ASSERT_EQ (pMax[0][i >> 4], pMax[1][i >> 4]);
    }
  }
}
|
||||
|
||||
// Quantize then dequantize all four 4x4 blocks with both implementations and compare.
TEST (ReconstructionFunTest, WelsDeQuantFour4x4) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
  WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 128);
    for (int32_t i = 0 ; i < 64; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    WelsQuantFour4x4_c (pInput[0], pFF, pMF);
    sFuncPtrList.pfQuantizationFour4x4 (pInput[1], pFF, pMF);

    WelsDequantFour4x4_c (pInput[0], g_kuiDequantCoeff[uiQp]);
    sFuncPtrList.pfDequantizationFour4x4 (pInput[1], g_kuiDequantCoeff[uiQp]);
    for (int32_t i = 0 ; i < 64; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
    }
  }
}
|
||||
|
||||
// Verify the 2x2 Hadamard skip decision agrees between the C reference and the
// dispatched implementation. Only the zero/non-zero outcome is part of the
// contract, so the assert compares booleans rather than raw return values.
TEST (ReconstructionFunTest, WelsHadamardQuant2x2Skip) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 128);
    for (int32_t i = 0 ; i < 64; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -4080, 4080);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    int32_t iSkip_c = WelsHadamardQuant2x2Skip_c (pInput[0], pFF[0], pMF[0]);
    int32_t iSkip_test = sFuncPtrList.pfQuantizationHadamard2x2Skip (pInput[1], pFF[0], pMF[0]);

    ASSERT_EQ ((iSkip_test != 0), (iSkip_c != 0));
  }

}
|
||||
|
||||
// Verify the full 2x2 Hadamard + quant path: skip flag (boolean outcome only),
// the zeroed residual buffer, and the 4 DC outputs in both pDct and pBlock.
TEST (ReconstructionFunTest, WelsHadamardQuant2x2) {
  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
  ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 4, 16)
  ENFORCE_STACK_ALIGN_2D (int16_t, pBlock, 2, 4, 16)
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
  WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);

  for (int32_t k = 0; k < RECONTEST_NUM; k++) {
    uint8_t uiQp = rand() % 52;
    FillWithRandomData ((uint8_t*)pInput[0], 128);
    for (int32_t i = 0 ; i < 64; i++) {
      pInput[0][i] = WELS_CLIP3 (pInput[0][i], -4080, 4080);
    }
    // Both implementations must start from identical coefficients.
    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);

    const int16_t* pMF = g_kiQuantMF[uiQp];
    const int16_t* pFF = g_iQuantIntraFF[uiQp];
    int32_t iSkip_c = WelsHadamardQuant2x2_c (pInput[0], pFF[0], pMF[0], pDct[0], pBlock[0]);
    int32_t iSkip_test = sFuncPtrList.pfQuantizationHadamard2x2 (pInput[1], pFF[0], pMF[0], pDct[1], pBlock[1]);

    ASSERT_EQ ((iSkip_test != 0), (iSkip_c != 0));
    for (int32_t i = 0 ; i < 64; i++) {
      ASSERT_EQ (pInput[0][i], pInput[1][i]);
    }
    for (int32_t i = 0 ; i < 4; i++) {
      ASSERT_EQ (pDct[0][i], pDct[1][i]);
      ASSERT_EQ (pBlock[0][i], pBlock[1][i]);
    }
  }

}
|
||||
|
@ -8,6 +8,7 @@ ENCODER_UNITTEST_CPP_SRCS=\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_GetIntraPredictor.cpp\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryAlloc.cpp\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\
|
||||
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\
|
||||
|
||||
ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))
|
||||
|
Loading…
Reference in New Issue
Block a user