Add ARM64 Code and UnitTest for reconstruction

dongzhang 2014-06-27 12:53:27 +08:00
parent 5a60d0fef4
commit d88b83df44
9 changed files with 1513 additions and 1 deletion

View File

@@ -47,6 +47,7 @@
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; };
9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; };
9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; };
F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@@ -155,6 +156,7 @@
9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };
9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; };
9AED66671946A2C4009A3567 /* utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = utils.h; path = ../../../common/inc/utils.h; sourceTree = "<group>"; };
F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = reconstruct_aarch64_neon.S; path = arm64/reconstruct_aarch64_neon.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -184,6 +186,7 @@
4CB8F2B219235FAC005D6386 /* arm64 */ = {
isa = PBXGroup;
children = (
F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */,
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,
4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,
4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,
@@ -430,6 +433,7 @@
4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,
4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,
4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,
F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */,
4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */,
4CE4471818BC605C0017DF25 /* md.cpp in Sources */,
4CE4471B18BC605C0017DF25 /* nal_encap.cpp in Sources */,

View File

@@ -0,0 +1,947 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
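// Apple's integrated assembler takes positional macro arguments ($0, $1, ...),
// while GNU as needs named parameters; the #else branch below repeats every
// macro with \argN\() syntax but identical instruction sequences.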
.macro ZERO_COUNT_IN_2_QUARWORD
// { // input: coef_0 ($0), coef_1 ($1); output: $2 = number of zero coefficients
cmeq $0.8h, $0.8h, #0
cmeq $1.8h, $1.8h, #0
uzp1 $0.16b, $0.16b, $1.16b
ushr $0.16b, $0.16b, 7
addv $2, $0.16b
// }
.endm
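// Scalar reference for ZERO_COUNT_IN_2_QUARWORD (a sketch for clarity, not assembled):
//   int count = 0;
//   for (int i = 0; i < 16; i++) count += (coef[i] == 0); // one 0x01 byte per zero after cmeq/uzp1/ushr
//   // addv leaves count in the scalar destination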
.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, negate coef; else keep coef
// { // input: coef, ff (dst), mf
eor $3.16b, $3.16b, $3.16b // init $3 to 0 (also the zero reference for saba)
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
smull2 $5.4s, $1.8h, $2.8h
shrn $1.4h, $4.4s, #16
shrn2 $1.8h, $5.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x <= 0) take the level; else keep 0 untouched
shl $3.8h, $3.8h, #1
sub $1.8h, $1.8h, $3.8h // if x > 0, subtract 0; else x -= 2x (negates x)
// }
.endm
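// Scalar reference for NEWQUANT_COEF_EACH_16BITS (a sketch for clarity, not assembled):
//   level = (int16_t) (((ff + abs (coef)) * mf) >> 16); // saba + smull/smull2 + shrn/shrn2
//   out   = (coef > 0) ? level : -level;                // cmgt/bif/shl/sub restore the sign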
.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, negate coef; else keep coef
// { // input: coef, ff (dst), mf
eor $3.16b, $3.16b, $3.16b // init $3 to 0 (also the zero reference for saba)
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
smull2 $5.4s, $1.8h, $2.8h
shrn $1.4h, $4.4s, #16
shrn2 $1.8h, $5.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x <= 0) take the level; else keep 0 untouched
shl $3.8h, $3.8h, #1
mov.8h $6, $1 // save the unsigned level for the caller's max search
sub $1.8h, $1.8h, $3.8h // if x > 0, subtract 0; else x -= 2x (negates x)
// }
.endm
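// Identical to NEWQUANT_COEF_EACH_16BITS except that the unsigned level is also
// copied into $6 (mov.8h) so the caller can search for the per-block maximum.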
.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, negate coef; else keep coef
// { // input: coef, ff (dst), mf
saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
smull $4.4s, $1.4h, $2.4h
shrn $1.4h, $4.4s, #16
cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
bif $3.16b, $1.16b, $4.16b // if (x <= 0) take the level; else keep 0 untouched
shl $3.8h, $3.8h, #1
sub $1.8h, $1.8h, $3.8h // if x > 0, subtract 0; else x -= 2x (negates x)
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF
// { // input: coef_0..coef_3 (unsigned levels); output: $4 = max of first pair, $5 = max of second pair
umax $0.8h, $0.8h, $1.8h
umaxv $4, $0.8h
umax $2.8h, $2.8h, $3.8h
umaxv $5, $2.8h
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS
// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
sshr $1.2d, $0.2d, #32
add $2.4h, $0.4h, $1.4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
sub $1.4h, $0.4h, $1.4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
zip1 $1.4h, $2.4h, $1.4h
// }
.endm
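// One butterfly of the 2x2 Hadamard over the four chroma DC terms (a sketch):
//   dst[0] = rs[0] + rs[32];  dst[1] = rs[0] - rs[32];
//   dst[2] = rs[16] + rs[48]; dst[3] = rs[16] - rs[48]; // zip1 interleaves sums and differences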
.macro DC_ZERO_COUNT_IN_DUALWORD
// { // input: coef, dst_d, working_d (all 0x01)
cmeq $0.4h, $0.4h, #0
and $0.8b, $0.8b, $2.8b
addv $1, $0.4h
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS
// { // input: each src_d[0]~[3](dst), working_q0, working_q1
uzp2 $1.4s, $0.4s, $0.4s
uzp1 $0.4s, $0.4s, $0.4s
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
zip1 $2.8h, $2.8h, $1.8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
uzp2 $1.4s, $2.4s, $2.4s
uzp1 $0.4s, $2.4s, $2.4s
add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
rev32 $1.4h, $1.4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
zip1 $0.4s, $2.4s, $1.4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
uzp1 $2.4s, $0.4s, $1.4s //[0 1 4 5]+[8 9 12 13]
uzp2 $3.4s, $0.4s, $1.4s //[2 3 6 7]+[10 11 14 15]
uzp1 $0.8h, $2.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
uzp2 $2.8h, $2.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
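// Net effect: a 4x4 transpose of the 16 int16 values held across the two input vectors.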
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
trn1 $4.8h, v0.8h, v1.8h
trn2 $5.8h, v0.8h, v1.8h
trn1 $6.8h, v2.8h, v3.8h
trn2 $7.8h, v2.8h, v3.8h
trn1 $0.4s, v4.4s, v6.4s
trn2 $2.4s, v4.4s, v6.4s
trn1 $1.4s, v5.4s, v7.4s
trn2 $3.4s, v5.4s, v7.4s
// }
.endm
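// Note: the trn1/trn2 sources are hard-coded v0-v7, so despite the arguments this
// macro must be invoked with the data already in v0-v3 (as every call site below does).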
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
mov $0.d[1], $1.d[0] //[0 1 2 3]+[4 5 6 7]
mov $2.d[1], $3.d[0] //[8 9 10 11]+[12 13 14 15]
uzp1 $1.4s, $0.4s, $2.4s //[0 1 4 5]+[8 9 12 13]
uzp2 $3.4s, $0.4s, $2.4s //[2 3 6 7]+[10 11 14 15]
uzp1 $0.8h, $1.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
uzp2 $2.8h, $1.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro LOAD_4x4_DATA_FOR_DCT
ld1 {$0.s}[0], [$2], $3
ld1 {$0.s}[1], [$2], $3
ld1 {$0.s}[2], [$2], $3
ld1 {$0.s}[3], [$2]
ld1 {$1.s}[0], [$4], $5
ld1 {$1.s}[1], [$4], $5
ld1 {$1.s}[2], [$4], $5
ld1 {$1.s}[3], [$4]
.endm
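// Loads one 4x4 block of bytes from each of two strided sources ($2/$3 and $4/$5),
// one 32-bit row per lane.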
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
// { // input: src_d[0]~[3], working: [4]~[7]
add $4.8h, $0.8h, $3.8h //int16 s[0] = data[i] + data[i3];
sub $7.8h, $0.8h, $3.8h //int16 s[3] = data[i] - data[i3];
add $5.8h, $1.8h, $2.8h //int16 s[1] = data[i1] + data[i2];
sub $6.8h, $1.8h, $2.8h //int16 s[2] = data[i1] - data[i2];
add $0.8h, $4.8h, $5.8h //int16 dct[i ] = s[0] + s[1];
sub $2.8h, $4.8h, $5.8h //int16 dct[i2] = s[0] - s[1];
shl $1.8h, $7.8h, #1
shl $3.8h, $6.8h, #1
add $1.8h, $1.8h, $6.8h //int16 dct[i1] = (s[3] << 1) + s[2];
sub $3.8h, $7.8h, $3.8h //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
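// Taken together these steps apply one pass of the H.264 forward core transform
// matrix (a sketch; the second pass runs after the transpose):
//   [1  1  1  1]
//   [2  1 -1 -2]
//   [1 -1 -1  1]
//   [1 -2  2 -1]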
.macro LOAD_8x4_DATA_FOR_DCT
// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
ld1 {$0.d}[0], [$8], x2
ld1 {$1.d}[0], [$8], x2
ld1 {$2.d}[0], [$8], x2
ld1 {$3.d}[0], [$8], x2
ld1 {$4.d}[0], [$9], x4
ld1 {$5.d}[0], [$9], x4
ld1 {$6.d}[0], [$9], x4
ld1 {$7.d}[0], [$9], x4
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
// { // input: src_d[0]~[3], output: e_d[0]~[3];
add $4.8h, $0.8h, $2.8h //int16 e[i][0] = src[0] + src[2];
sub $5.8h, $0.8h, $2.8h //int16 e[i][1] = src[0] - src[2];
sshr $6.8h, $1.8h, #1
sshr $7.8h, $3.8h, #1
sub $6.8h, $6.8h, $3.8h //int16 e[i][2] = (src[1]>>1)-src[3];
add $7.8h, $1.8h, $7.8h //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.8h, $4.8h, $7.8h //int16 f[i][0] = e[i][0] + e[i][3];
add $1.8h, $5.8h, $6.8h //int16 f[i][1] = e[i][1] + e[i][2];
sub $2.8h, $5.8h, $6.8h //int16 f[i][2] = e[i][1] - e[i][2];
sub $3.8h, $4.8h, $7.8h //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
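// ROW_TRANSFORM_1_STEP_TOTAL_16BITS followed by this macro applies one pass of the
// H.264 inverse core transform (a sketch):
//   [1   1    1   1/2]
//   [1   1/2 -1  -1  ]
//   [1  -1/2 -1   1  ]
//   [1  -1    1  -1/2]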
.macro ROW_TRANSFORM_0_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3];
saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
ssubl $6.4s, $1.4h, $3.4h //int32 e[i][2] = src[1] - src[3];
saddl $7.4s, $1.4h, $3.4h //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro COL_TRANSFORM_0_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
sub $6.4s, $1.4s, $3.4s //int32 e[2][j] = f[1][j] - f[3][j];
add $7.4s, $1.4s, $3.4s //int32 e[3][j] = f[1][j] + f[3][j];
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add $0.4s, $4.4s, $7.4s //int32 f[i][0] = e[i][0] + e[i][3];
add $1.4s, $5.4s, $6.4s //int32 f[i][1] = e[i][1] + e[i][2];
sub $2.4s, $5.4s, $6.4s //int32 f[i][2] = e[i][1] - e[i][2];
sub $3.4s, $4.4s, $7.4s //int32 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
uxtl $3.8h, $0.8b
uxtl2 $4.8h, $0.16b
add $3.8h, $3.8h, $1.8h
add $4.8h, $4.8h, $2.8h
sqxtun $0.8b, $3.8h
sqxtun2 $0.16b,$4.8h
// }
.endm
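// Scalar reference (a sketch): widen the 8-bit prediction, add the 16-bit residual,
// and saturate back to [0, 255]:
//   rec[i] = (uint8_t) clamp (pred[i] + res[i], 0, 255); // sqxtun/sqxtun2 do the saturating narrow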
#else
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
// { // input: coef_0 (\arg0\()), coef_1 (\arg1\()); output: \arg2\() = number of zero coefficients
cmeq \arg0\().8h, \arg0\().8h, #0
cmeq \arg1\().8h, \arg1\().8h, #0
uzp1 \arg0\().16b, \arg0\().16b, \arg1\().16b
ushr \arg0\().16b, \arg0\().16b, 7
addv \arg2\(), \arg0\().16b
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5
// if coef <= 0, negate coef; else keep coef
// { // input: coef, ff (dst), mf
eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init \arg3\() to 0 (also the zero reference for saba)
saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
smull \arg4\().4s, \arg1\().4h, \arg2\().4h
smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h
shrn \arg1\().4h, \arg4\().4s, #16
shrn2 \arg1\().8h, \arg5\().4s, #16
cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x <= 0) take the level; else keep 0 untouched
shl \arg3\().8h, \arg3\().8h, #1
sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, subtract 0; else x -= 2x (negates x)
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6
// if coef <= 0, negate coef; else keep coef
// { // input: coef, ff (dst), mf
eor \arg3\().16b, \arg3\().16b, \arg3\().16b // init \arg3\() to 0 (also the zero reference for saba)
saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
smull \arg4\().4s, \arg1\().4h, \arg2\().4h
smull2 \arg5\().4s, \arg1\().8h, \arg2\().8h
shrn \arg1\().4h, \arg4\().4s, #16
shrn2 \arg1\().8h, \arg5\().4s, #16
cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x <= 0) take the level; else keep 0 untouched
shl \arg3\().8h, \arg3\().8h, #1
mov.8h \arg6, \arg1 // save the unsigned level for the caller's max search
sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, subtract 0; else x -= 2x (negates x)
// }
.endm
.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
// if coef <= 0, negate coef; else keep coef
// { // input: coef, ff (dst), mf
saba \arg1\().8h, \arg0\().8h, \arg3\().8h // f + abs(coef - 0)
smull \arg4\().4s, \arg1\().4h, \arg2\().4h
shrn \arg1\().4h, \arg4\().4s, #16
cmgt \arg4\().8h, \arg0\().8h, #0 // if true, location of coef == 11111111
bif \arg3\().16b, \arg1\().16b, \arg4\().16b // if (x <= 0) take the level; else keep 0 untouched
shl \arg3\().8h, \arg3\().8h, #1
sub \arg1\().8h, \arg1\().8h, \arg3\().8h // if x > 0, subtract 0; else x -= 2x (negates x)
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4, arg5
// { // input: coef_0..coef_3 (unsigned levels); output: \arg4\() = max of first pair, \arg5\() = max of second pair
umax \arg0\().8h, \arg0\().8h, \arg1\().8h
umaxv \arg4\(), \arg0\().8h
umax \arg2\().8h, \arg2\().8h, \arg3\().8h
umaxv \arg5\(), \arg2\().8h
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
sshr \arg1\().2d, \arg0\().2d, #32
add \arg2\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
sub \arg1\().4h, \arg0\().4h, \arg1\().4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
zip1 \arg1\().4h, \arg2\().4h, \arg1\().4h
// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
// { // input: coef, dst_d, working_d (all 0x01)
cmeq \arg0\().4h, \arg0\().4h, #0
and \arg0\().8b, \arg0\().8b, \arg2\().8b
addv \arg1\(), \arg0\().4h
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
// { // input: each src_d[0]~[3](dst), working_q0, working_q1
uzp2 \arg1\().4s, \arg0\().4s, \arg0\().4s
uzp1 \arg0\().4s, \arg0\().4s, \arg0\().4s
add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
zip1 \arg2\().8h, \arg2\().8h, \arg1\().8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
uzp2 \arg1\().4s, \arg2\().4s, \arg2\().4s
uzp1 \arg0\().4s, \arg2\().4s, \arg2\().4s
add \arg2\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
sub \arg1\().8h, \arg0\().8h, \arg1\().8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
rev32 \arg1\().4h, \arg1\().4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
zip1 \arg0\().4s, \arg2\().4s, \arg1\().4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 arg0, arg1, arg2, arg3
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
uzp1 \arg2\().4s, \arg0\().4s, \arg1\().4s //[0 1 4 5]+[8 9 12 13]
uzp2 \arg3\().4s, \arg0\().4s, \arg1\().4s //[2 3 6 7]+[10 11 14 15]
uzp1 \arg0\().8h, \arg2\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
uzp2 \arg2\().8h, \arg2\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
trn1 \arg4\().8h, v0.8h, v1.8h
trn2 \arg5\().8h, v0.8h, v1.8h
trn1 \arg6\().8h, v2.8h, v3.8h
trn2 \arg7\().8h, v2.8h, v3.8h
trn1 \arg0\().4s, v4.4s, v6.4s
trn2 \arg2\().4s, v4.4s, v6.4s
trn1 \arg1\().4s, v5.4s, v7.4s
trn2 \arg3\().4s, v5.4s, v7.4s
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2 arg0, arg1, arg2, arg3
// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
mov \arg0\().d[1], \arg1\().d[0] //[0 1 2 3]+[4 5 6 7]
mov \arg2\().d[1], \arg3\().d[0] //[8 9 10 11]+[12 13 14 15]
uzp1 \arg1\().4s, \arg0\().4s, \arg2\().4s //[0 1 4 5]+[8 9 12 13]
uzp2 \arg3\().4s, \arg0\().4s, \arg2\().4s //[2 3 6 7]+[10 11 14 15]
uzp1 \arg0\().8h, \arg1\().8h, \arg3\().8h //[0 4 8 12]+[2 6 10 14]
uzp2 \arg2\().8h, \arg1\().8h, \arg3\().8h //[1 5 9 13]+[3 7 11 15]
zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d //[2 6 10 14]+[3 7 11 15]
zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d //[0 4 8 12]+[1 5 9 13]
// }
.endm
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5
ld1 {\arg0\().s}[0], [\arg2\()], \arg3\()
ld1 {\arg0\().s}[1], [\arg2\()], \arg3\()
ld1 {\arg0\().s}[2], [\arg2\()], \arg3\()
ld1 {\arg0\().s}[3], [\arg2\()]
ld1 {\arg1\().s}[0], [\arg4\()], \arg5\()
ld1 {\arg1\().s}[1], [\arg4\()], \arg5\()
ld1 {\arg1\().s}[2], [\arg4\()], \arg5\()
ld1 {\arg1\().s}[3], [\arg4\()]
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], working: [4]~[7]
add \arg4\().8h, \arg0\().8h, \arg3\().8h //int16 s[0] = data[i] + data[i3];
sub \arg7\().8h, \arg0\().8h, \arg3\().8h //int16 s[3] = data[i] - data[i3];
add \arg5\().8h, \arg1\().8h, \arg2\().8h //int16 s[1] = data[i1] + data[i2];
sub \arg6\().8h, \arg1\().8h, \arg2\().8h //int16 s[2] = data[i1] - data[i2];
add \arg0\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i ] = s[0] + s[1];
sub \arg2\().8h, \arg4\().8h, \arg5\().8h //int16 dct[i2] = s[0] - s[1];
shl \arg1\().8h, \arg7\().8h, #1
shl \arg3\().8h, \arg6\().8h, #1
add \arg1\().8h, \arg1\().8h, \arg6\().8h //int16 dct[i1] = (s[3] << 1) + s[2];
sub \arg3\().8h, \arg7\().8h, \arg3\().8h //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
.macro LOAD_8x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: \arg0\()~\arg3\(), src1*, src2*; untouched r2:src1_stride &r4:src2_stride
ld1 {\arg0\().d}[0], [\arg8\()], x2
ld1 {\arg1\().d}[0], [\arg8\()], x2
ld1 {\arg2\().d}[0], [\arg8\()], x2
ld1 {\arg3\().d}[0], [\arg8\()], x2
ld1 {\arg4\().d}[0], [\arg9\()], x4
ld1 {\arg5\().d}[0], [\arg9\()], x4
ld1 {\arg6\().d}[0], [\arg9\()], x4
ld1 {\arg7\().d}[0], [\arg9\()], x4
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], output: e_d[0]~[3];
add \arg4\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][0] = src[0] + src[2];
sub \arg5\().8h, \arg0\().8h, \arg2\().8h //int16 e[i][1] = src[0] - src[2];
sshr \arg6\().8h, \arg1\().8h, #1
sshr \arg7\().8h, \arg3\().8h, #1
sub \arg6\().8h, \arg6\().8h, \arg3\().8h //int16 e[i][2] = (src[1]>>1)-src[3];
add \arg7\().8h, \arg1\().8h, \arg7\().8h //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add \arg0\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][0] = e[i][0] + e[i][3];
add \arg1\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][1] = e[i][1] + e[i][2];
sub \arg2\().8h, \arg5\().8h, \arg6\().8h //int16 f[i][2] = e[i][1] - e[i][2];
sub \arg3\().8h, \arg4\().8h, \arg7\().8h //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], output: e_q[0]~[3];
saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2];
ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2];
ssubl \arg6\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][2] = src[1] - src[3];
saddl \arg7\().4s, \arg1\().4h, \arg3\().4h //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
add \arg4\().4s, \arg0\().4s, \arg2\().4s //int32 e[0][j] = f[0][j] + f[2][j];
sub \arg5\().4s, \arg0\().4s, \arg2\().4s //int32 e[1][j] = f[0][j] - f[2][j];
sub \arg6\().4s, \arg1\().4s, \arg3\().4s //int32 e[2][j] = f[1][j] - f[3][j];
add \arg7\().4s, \arg1\().4s, \arg3\().4s //int32 e[3][j] = f[1][j] + f[3][j];
// }
.endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
add \arg0\().4s, \arg4\().4s, \arg7\().4s //int32 f[i][0] = e[i][0] + e[i][3];
add \arg1\().4s, \arg5\().4s, \arg6\().4s //int32 f[i][1] = e[i][1] + e[i][2];
sub \arg2\().4s, \arg5\().4s, \arg6\().4s //int32 f[i][2] = e[i][1] - e[i][2];
sub \arg3\().4s, \arg4\().4s, \arg7\().4s //int32 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4
// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
uxtl \arg3\().8h, \arg0\().8b
uxtl2 \arg4\().8h, \arg0\().16b
add \arg3\().8h, \arg3\().8h, \arg1\().8h
add \arg4\().8h, \arg4\().8h, \arg2\().8h
sqxtun \arg0\().8b, \arg3\().8h
sqxtun2 \arg0\().16b,\arg4\().8h
// }
.endm
#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]
ZERO_COUNT_IN_2_QUARWORD v0, v1, b0
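// b0 now holds the number of zero coefficients; the function returns 16 - b0,
// i.e. the count of nonzero levels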
mov x0, v0.d[0]
mov x1, #16
subs x0, x1, x0
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4_AArch64_neon
ld1 {v2.8h}, [x1]
ld1 {v0.8h, v1.8h}, [x0]
ld1 {v3.8h}, [x2]
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
st1 {v2.8h}, [x0], #16
NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
st1 {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4Dc_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]
dup v2.8h, w1 // even ff range [0, 768]
dup v3.8h, w2
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS v0, v2, v3, v5, v6, v7
st1 {v2.8h}, [x0], #16
NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
st1 {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4_AArch64_neon
ld1 {v2.8h}, [x1]
ld1 {v3.8h}, [x2]
mov x1, x0
.rept 4
ld1 {v0.8h, v1.8h}, [x0], #32
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS v0, v4, v3, v5, v6, v7
st1 {v4.8h}, [x1], #16
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS v1, v4, v3, v5, v6, v7
st1 {v4.8h}, [x1], #16
.endr
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4Max_AArch64_neon
ld1 {v2.8h}, [x1]
ld1 {v3.8h}, [x2]
mov x1, x0
ld1 {v0.8h, v1.8h}, [x0], #32
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
st1 {v4.8h}, [x1], #16
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
st1 {v4.8h}, [x1], #16 // first 16 levels now in v16 & v17
ld1 {v0.8h, v1.8h}, [x0], #32
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
st1 {v4.8h}, [x1], #16
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
st1 {v4.8h}, [x1], #16 // second 16 levels now in v18 & v19
SELECT_MAX_IN_ABS_COEF v16, v17, v18, v19, h20, h21
ld1 {v0.8h, v1.8h}, [x0], #32
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v16
st1 {v4.8h}, [x1], #16
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v17
st1 {v4.8h}, [x1], #16 // first 16 levels now in v16 & v17
ld1 {v0.8h, v1.8h}, [x0], #32
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v0, v4, v3, v5, v6, v7, v18
st1 {v4.8h}, [x1], #16
mov.8h v4, v2
NEWQUANT_COEF_EACH_16BITS_MAX v1, v4, v3, v5, v6, v7, v19
st1 {v4.8h}, [x1], #16 // second 16 levels now in v18 & v19
SELECT_MAX_IN_ABS_COEF v16, v17, v18, v19, h22, h23
st4 {v20.h,v21.h,v22.h,v23.h}[0], [x3]
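// pMax[0..3] receive the four per-4x4 maximum unsigned levels gathered by the two
// SELECT_MAX_IN_ABS_COEF passes (h20/h21 for blocks 0-1, h22/h23 for blocks 2-3)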
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequant4x4_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]
ld1 {v2.8h}, [x1]
mul v3.8h, v0.8h, v2.8h
mul v4.8h, v1.8h, v2.8h
st1 {v3.8h, v4.8h}, [x0]
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantFour4x4_AArch64_neon
ld1 {v2.8h}, [x1]
mov x1, x0
.rept 4
ld1 {v0.8h,v1.8h}, [x0], #32
mul v3.8h, v0.8h, v2.8h
mul v4.8h, v1.8h, v2.8h
st1 {v3.8h,v4.8h}, [x1], #32
.endr
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_AArch64_neon
dup v4.8h, w1
mov x1, #32
ld1 {v0.h}[0], [x0], x1 //rs[0]
ld1 {v0.h}[1], [x0], x1 //rs[16]
ld1 {v0.h}[2], [x0], x1 //rs[32]
ld1 {v0.h}[3], [x0], x1 //rs[48]
HDM_QUANT_2x2_TOTAL_16BITS v0, v1, v2 // output v1
HDM_QUANT_2x2_TOTAL_16BITS v1, v0, v2 // output v0
abs v1.4h, v0.4h
cmhi v1.4h, v1.4h, v4.4h // abs(dct[i])>threshold;
mov w0, v1.s[0]
mov w1, v1.s[1]
orr w0, w0, w1
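// w0 is nonzero iff some abs(dct[i]) exceeded the threshold, matching the skip
// test performed by WelsHadamardQuant2x2Skip_c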
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2_AArch64_neon
dup v1.8h, w1 //ff
dup v2.8h, w2 //mf
eor v3.16b, v3.16b, v3.16b
mov x1, #32
mov x2, x0
ld1 {v0.h}[0], [x0], x1 //rs[0]
st1 {v3.h}[0], [x2], x1 //rs[00]=0
ld1 {v0.h}[1], [x0], x1 //rs[16]
st1 {v3.h}[1], [x2], x1 //rs[16]=0
ld1 {v0.h}[2], [x0], x1 //rs[32]
st1 {v3.h}[2], [x2], x1 //rs[32]=0
ld1 {v0.h}[3], [x0], x1 //rs[48]
st1 {v3.h}[3], [x2], x1 //rs[48]=0
HDM_QUANT_2x2_TOTAL_16BITS v0, v4, v5 // output v4
HDM_QUANT_2x2_TOTAL_16BITS v4, v0, v5 // output v0
QUANT_DUALWORD_COEF_EACH_16BITS v0, v1, v2, v3, v4
st1 {v1.d}[0], [x3] // store to dct
st1 {v1.d}[0], [x4] // store to block
movi v3.8h, #1, lsl #0
movi v0.16b, #255
DC_ZERO_COUNT_IN_DUALWORD v1, h0, v3
mov x0, v0.d[0]
mov x1, #16
subs x0, x1, x0
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantIHadamard4x4_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]
dup v4.8h, w1
IHDM_4x4_TOTAL_16BITS v0, v2, v3
IHDM_4x4_TOTAL_16BITS v1, v2, v3
MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 v0, v1, v2, v3
IHDM_4x4_TOTAL_16BITS v0, v2, v3
mul v0.8h, v0.8h, v4.8h
IHDM_4x4_TOTAL_16BITS v1, v2, v3
mul v1.8h, v1.8h, v4.8h
MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 v0, v1, v2, v3
st1 {v0.16b, v1.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctT4_AArch64_neon
LOAD_4x4_DATA_FOR_DCT v0, v1, x1, x2, x3, x4
usubl v2.8h, v0.8b, v1.8b
usubl2 v4.8h, v0.16b, v1.16b
uzp1 v3.8h, v2.8h, v4.8h
uzp2 v5.8h, v2.8h, v4.8h
uzp2 v2.8h, v3.8h, v5.8h // s[2, 6, 10, 14] [3, 7, 11, 15]
uzp1 v0.8h, v3.8h, v5.8h // s[0, 4, 8, 12] [1, 5, 9, 13]
mov v3.d[0], v2.d[1] // s[3, 7, 11, 15]
mov v1.d[0], v0.d[1] // s[1, 5, 9, 13]
// horizontal transform
DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
// transform element
MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7
// vertical transform
DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0]
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctFourT4_AArch64_neon
.rept 2
LOAD_8x4_DATA_FOR_DCT v0, v1, v2, v3, v4, v5, v6, v7, x1, x3
usubl v0.8h, v0.8b, v4.8b
usubl v1.8h, v1.8b, v5.8b
usubl v2.8h, v2.8b, v6.8b
usubl v3.8h, v3.8b, v7.8b
MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7
// horizontal transform
DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
// transform element
MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7
// vertical transform
DCT_ROW_TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
uzp1 v4.2d, v0.2d, v1.2d
uzp2 v6.2d, v0.2d, v1.2d
uzp1 v5.2d, v2.2d, v3.2d
uzp2 v7.2d, v2.2d, v3.2d
st1 {v4.16b, v5.16b}, [x0], #32
st1 {v6.16b, v7.16b}, [x0], #32
.endr
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctT4Rec_AArch64_neon
ld1 {v16.s}[0], [x2], x3
ld1 {v16.s}[1], [x2], x3
ld1 {v16.s}[2], [x2], x3
ld1 {v16.s}[3], [x2], x3 // Pred
ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x4] // dct coeff
ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7
ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
ins v0.d[1], v1.d[0]
ins v2.d[1], v3.d[0]
srshr v0.8h, v0.8h, #6
srshr v2.8h, v2.8h, #6
//after rounding 6, clip into [0, 255]
uxtl v1.8h, v16.8b
add v0.8h, v0.8h, v1.8h
sqxtun v1.8b, v0.8h
st1 {v1.s}[0],[x0],x1
st1 {v1.s}[1],[x0],x1
uxtl2 v1.8h, v16.16b
add v2.8h, v2.8h, v1.8h
sqxtun v1.8b, v2.8h
st1 {v1.s}[0],[x0],x1
st1 {v1.s}[1],[x0],x1
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctFourT4Rec_AArch64_neon
.rept 2
ld1 {v16.d}[0], [x2], x3
ld1 {v16.d}[1], [x2], x3
ld1 {v17.d}[0], [x2], x3
ld1 {v17.d}[1], [x2], x3 // Pred
ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x4], #64 // dct coeff
ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
MATRIX_TRANSFORM_EACH_16BITS_OUT4 v0, v1, v2, v3, v4, v5, v6, v7
ROW_TRANSFORM_1_STEP_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
TRANSFORM_TOTAL_16BITS v0, v1, v2, v3, v4, v5, v6, v7
srshr v0.8h, v0.8h, #6
srshr v1.8h, v1.8h, #6
srshr v2.8h, v2.8h, #6
srshr v3.8h, v3.8h, #6
//after rounding 6, clip into [0, 255]
uxtl v4.8h, v16.8b
add v0.8h, v0.8h, v4.8h
sqxtun v0.8b, v0.8h
st1 {v0.d}[0],[x0],x1
uxtl2 v5.8h, v16.16b
add v1.8h, v1.8h, v5.8h
sqxtun v1.8b, v1.8h
st1 {v1.d}[0],[x0],x1
uxtl v6.8h, v17.8b
add v2.8h, v2.8h, v6.8h
sqxtun v2.8b, v2.8h
st1 {v2.d}[0],[x0],x1
uxtl2 v7.8h, v17.16b
add v3.8h, v3.8h, v7.8h
sqxtun v3.8b, v3.8h
st1 {v3.d}[0],[x0],x1
.endr
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardT4Dc_AArch64_neon
mov x2, #32
ld1 {v0.h}[0], [x1], x2
ld1 {v1.h}[0], [x1], x2
ld1 {v0.h}[1], [x1], x2
ld1 {v1.h}[1], [x1], x2
ld1 {v2.h}[0], [x1], x2
ld1 {v3.h}[0], [x1], x2
ld1 {v2.h}[1], [x1], x2
ld1 {v3.h}[1], [x1], x2
ld1 {v0.h}[2], [x1], x2
ld1 {v1.h}[2], [x1], x2
ld1 {v0.h}[3], [x1], x2
ld1 {v1.h}[3], [x1], x2
ld1 {v2.h}[2], [x1], x2
ld1 {v3.h}[2], [x1], x2
ld1 {v2.h}[3], [x1], x2
ld1 {v3.h}[3], [x1], x2 // v0[0 4 08 12],v1[1 5 09 13],v2[2 6 10 14],v3[3 7 11 15]
ROW_TRANSFORM_0_STEP v0, v1, v3, v2, v4, v7, v6, v5
TRANSFORM_4BYTES v0, v1, v3, v2, v4, v7, v6, v5
// transform element 32bits
uzp1 v4.4s, v0.4s, v1.4s // 0 2 4 6
uzp2 v5.4s, v0.4s, v1.4s // 1 3 5 7
uzp1 v6.4s, v2.4s, v3.4s // 8 10 12 14
uzp2 v7.4s, v2.4s, v3.4s // 9 11 13 15
uzp1 v0.4s, v4.4s, v6.4s // 0 4 8 12
uzp2 v2.4s, v4.4s, v6.4s // 2 6 10 14
uzp1 v1.4s, v5.4s, v7.4s // 1 5 9 13
uzp2 v3.4s, v5.4s, v7.4s // 3 7 11 15
COL_TRANSFORM_0_STEP v0, v1, v3, v2, v4, v7, v6, v5
TRANSFORM_4BYTES v0, v1, v3, v2, v4, v7, v6, v5
sqrshrn v4.4h, v0.4s, #1
sqrshrn2 v4.8h, v1.4s, #1
sqrshrn v5.4h, v2.4s, #1
sqrshrn2 v5.8h, v3.4s, #1
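// sqrshrn #1 realizes the final (x + 1) >> 1 normalization of the 4x4 DC Hadamard,
// with saturation to int16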
st1 {v4.16b, v5.16b}, [x0] //store
WELS_ASM_AARCH64_FUNC_END
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctRecI16x16Dc_AArch64_neon
ld1 {v16.16b,v17.16b}, [x4]
srshr v16.8h, v16.8h, #6
srshr v17.8h, v17.8h, #6
dup v0.8h, v16.h[0]
dup v1.8h, v16.h[1]
ins v0.d[1], v1.d[0]
dup v1.8h, v16.h[2]
dup v2.8h, v16.h[3]
ins v1.d[1], v2.d[0]
.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
st1 {v3.16b}, [x0], x1
.endr
dup v0.8h, v16.h[4]
dup v1.8h, v16.h[5]
ins v0.d[1], v1.d[0]
dup v1.8h, v16.h[6]
dup v2.8h, v16.h[7]
ins v1.d[1], v2.d[0]
.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
st1 {v3.16b}, [x0], x1
.endr
dup v0.8h, v17.h[0]
dup v1.8h, v17.h[1]
ins v0.d[1], v1.d[0]
dup v1.8h, v17.h[2]
dup v2.8h, v17.h[3]
ins v1.d[1], v2.d[0]
.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
st1 {v3.16b}, [x0], x1
.endr
dup v0.8h, v17.h[4]
dup v1.8h, v17.h[5]
ins v0.d[1], v1.d[0]
dup v1.8h, v17.h[6]
dup v2.8h, v17.h[7]
ins v1.d[1], v2.d[0]
.rept 4
ld1 {v3.16b}, [x2], x3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP v3, v0, v1, v4, v5
st1 {v3.16b}, [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END
#endif

View File

@@ -81,6 +81,17 @@ void WelsIDctRecI16x16Dc_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPredict
int16_t* pDctDc);
#endif
#ifdef HAVE_NEON_AARCH64
void WelsDequantFour4x4_AArch64_neon (int16_t* pDct, const uint16_t* kpMF);
void WelsDequant4x4_AArch64_neon (int16_t* pDct, const uint16_t* kpMF);
void WelsDequantIHadamard4x4_AArch64_neon (int16_t* pRes, const uint16_t kuiMF);
void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,
int16_t* pDctDc);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus

View File

@@ -122,6 +122,22 @@ void WelsQuantFour4x4_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pM
void WelsQuantFour4x4Max_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
#endif
#ifdef HAVE_NEON_AARCH64
void WelsHadamardT4Dc_AArch64_neon (int16_t* pLumaDc, int16_t* pDct);
int32_t WelsHadamardQuant2x2_AArch64_neon (int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);
int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF, int16_t iMF);
int32_t WelsHadamardQuant2x2SkipKernel_AArch64_neon (int16_t* pRes, int16_t iThreshold); // avoid divide operator
void WelsDctT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
void WelsDctFourT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
int32_t WelsGetNoneZeroCount_AArch64_neon (int16_t* pLevel);
void WelsQuant4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuant4x4Dc_AArch64_neon (int16_t* pDct, int16_t iFF, int16_t iMF);
void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus

View File

@@ -282,5 +282,17 @@ void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFl
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_neon;
}
#endif
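// On AArch64 builds the CPU detector also reports WELS_CPU_NEON (AdvSIMD is part
// of the base ARMv8-A profile), so the same flag gates the 64-bit paths below.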
#if defined(HAVE_NEON_AARCH64)
if (uiCpuFlag & WELS_CPU_NEON) {
pFuncList->pfDequantization4x4 = WelsDequant4x4_AArch64_neon;
pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_AArch64_neon;
pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_AArch64_neon;
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_AArch64_neon;
pFuncList->pfIDctT4 = WelsIDctT4Rec_AArch64_neon;
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_AArch64_neon;
}
#endif
}
}

View File

@@ -455,7 +455,12 @@ int32_t WelsHadamardQuant2x2Skip_neon (int16_t* pRes, int16_t iFF, int16_t iMF)
return WelsHadamardQuant2x2SkipKernel_neon (pRes, iThreshold);
}
#endif
#ifdef HAVE_NEON_AARCH64
int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF, int16_t iMF) {
int16_t iThreshold = ((1 << 16) - 1) / iMF - iFF;
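// one divide here replaces a divide per coefficient in the kernel:
// ((iFF + abs (dct)) * iMF) >> 16 is nonzero exactly when abs (dct) > iThreshold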
return WelsHadamardQuant2x2SkipKernel_AArch64_neon (pRes, iThreshold);
}
#endif
void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->pfCopy8x8Aligned = WelsCopy8x8_c;
pFuncList->pfCopy16x16Aligned =
@@ -542,5 +547,28 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->pfDctFourT4 = WelsDctFourT4_neon;
}
#endif
#if defined(HAVE_NEON_AARCH64)
if (uiCpuFlag & WELS_CPU_NEON) {
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_AArch64_neon;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_AArch64_neon;
pFuncList->pfDctT4 = WelsDctT4_AArch64_neon;
//pFuncList->pfCopy8x8Aligned = WelsCopy8x8_AArch64_neon; // will enable in next update
//pFuncList->pfCopy8x16Aligned = WelsCopy8x16_AArch64_neon; // will enable in next update
pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_AArch64_neon;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_AArch64_neon;
pFuncList->pfQuantization4x4 = WelsQuant4x4_AArch64_neon;
pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_AArch64_neon;
pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_AArch64_neon;
pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_AArch64_neon;
//pFuncList->pfCopy16x16Aligned = WelsCopy16x16_AArch64_neon; // will enable in next update
//pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_AArch64_neon; // will enable in next update
//pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_AArch64_neon; // will enable in next update
pFuncList->pfDctFourT4 = WelsDctFourT4_AArch64_neon;
}
#endif
}
}
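The unit tests added below drive this dispatch table directly; a minimal usage sketch (QuantWithBestImpl is a hypothetical helper; the other names come from this diff, and pDct/pFF/pMF are caller-provided 16-byte-aligned buffers as in the tests):

#include "cpu.h"
#include "encode_mb_aux.h"
#include "wels_func_ptr_def.h"

static void QuantWithBestImpl (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
  int32_t iCpuCores = 0;
  SWelsFuncPtrList sFuncPtrList;
  WelsInitEncodingFuncs (&sFuncPtrList, WelsCPUFeatureDetect (&iCpuCores));
  // resolves to WelsQuant4x4_AArch64_neon on an AArch64 build with WELS_CPU_NEON set
  sFuncPtrList.pfQuantization4x4 (pDct, pFF, pMF);
}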

View File

@@ -62,6 +62,7 @@ ENCODER_ASM_ARM64_SRCS=\
$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
endif

View File

@@ -0,0 +1,492 @@
#include <gtest/gtest.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "cpu_core.h"
#include "cpu.h"
#include "macros.h"
#include "encode_mb_aux.h"
#include "decode_mb_aux.h"
#include "wels_func_ptr_def.h"
using namespace WelsSVCEnc;
#define RECONTEST_NUM 1000
static void FillWithRandomData (uint8_t* p, int32_t Len) {
for (int32_t i = 0; i < Len; i++) {
p[i] = rand() % 256;
}
}
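// Each test below runs the C reference and the pointer installed by the
// WelsInit*Funcs initializers (the AArch64 NEON path when available) on identical
// random inputs and checks that the outputs agree.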
TEST (ReconstructionFunTest, WelsIDctRecI16x16Dc) {
ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 16, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 16, 16)
ENFORCE_STACK_ALIGN_1D (int16_t, pDct, 16, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
FillWithRandomData (pPred, 32 * 16);
FillWithRandomData ((uint8_t*)pDct, 16 * 2);
for (int32_t i = 0 ; i < 16; i++) {
pDct[i] = WELS_CLIP3 (pDct[i], -4080, 4080);
}
WelsIDctRecI16x16Dc_c (pRec[0], 16, pPred, 32, pDct);
sFuncPtrList.pfIDctI16x16Dc (pRec[1], 16, pPred, 32, pDct);
for (int32_t j = 0 ; j < 16; j++) {
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
}
}
}
}
TEST (ReconstructionFunTest, WelsGetNoneZeroCount) {
ENFORCE_STACK_ALIGN_1D (int16_t, pInput, 64, 16)
int32_t iZeroCount[2];
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
FillWithRandomData ((uint8_t*)pInput, 128);
iZeroCount[0] = WelsGetNoneZeroCount_c (pInput);
iZeroCount[1] = sFuncPtrList.pfGetNoneZeroCount (pInput);
ASSERT_EQ (iZeroCount[0], iZeroCount[1]);
}
}
TEST (ReconstructionFunTest, WelsHadamardT4Dc) {
ENFORCE_STACK_ALIGN_1D (int16_t, pDct, 16 * 16, 16)
ENFORCE_STACK_ALIGN_2D (int16_t, pLumaDc, 2, 16, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
FillWithRandomData ((uint8_t*)pDct, 16 * 16 * 2);
for (int32_t j = 0 ; j < 16; j++) {
for (int32_t i = 0 ; i < 16; i++) {
pDct[i + j * 16] = WELS_CLIP3 (pDct[i + j * 16], -4080, 4080);
}
}
WelsHadamardT4Dc_c (pLumaDc[0], pDct);
sFuncPtrList.pfTransformHadamard4x4Dc (pLumaDc[1], pDct);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pLumaDc[0][i], pLumaDc[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsDctT4) {
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 4, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 4, 16)
ENFORCE_STACK_ALIGN_2D (int16_t, pOut, 2, 16, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
FillWithRandomData (pInput1, 16 * 4);
FillWithRandomData (pInput2, 32 * 4);
WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pOut[0][i], pOut[1][i]);
}
}
memset (pInput1, 255, 16 * 4);
memset (pInput2, 0, 32 * 4);
WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pOut[0][i], pOut[1][i]);
}
memset (pInput1, 0, 16 * 4);
memset (pInput2, 255, 32 * 4);
WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pOut[0][i], pOut[1][i]);
}
}
TEST (ReconstructionFunTest, WelsDctFourT4) {
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 8, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 8, 16)
ENFORCE_STACK_ALIGN_2D (int16_t, pOut, 2, 64, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
FillWithRandomData (pInput1, 16 * 8);
FillWithRandomData (pInput2, 32 * 8);
WelsDctFourT4_c (pOut[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctFourT4 (pOut[1], pInput1, 16, pInput2, 32);
for (int32_t i = 0 ; i < 64; i++) {
ASSERT_EQ (pOut[0][i], pOut[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsIDctT4Rec) {
ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 16, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 4, 16)
ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 4, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 4, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 4, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
FillWithRandomData (pPred, 32 * 4);
FillWithRandomData (pInput1, 16 * 4);
FillWithRandomData (pInput2, 32 * 4);
WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
sFuncPtrList.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);
for (int32_t j = 0 ; j < 4; j++) {
for (int32_t i = 0 ; i < 4; i++) {
ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
}
}
}
memset (pPred, 255, 32 * 4);
memset (pInput1, 255, 16 * 4);
memset (pInput2, 0, 32 * 4);
WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
sFuncPtrList.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);
for (int32_t j = 0 ; j < 4; j++) {
for (int32_t i = 0 ; i < 4; i++) {
ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
}
}
memset (pPred, 255, 32 * 4);
memset (pInput1, 0, 16 * 4);
memset (pInput2, 255, 32 * 4);
WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
sFuncPtrList.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);
for (int32_t j = 0 ; j < 4; j++) {
for (int32_t i = 0 ; i < 4; i++) {
ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
}
}
}
TEST (ReconstructionFunTest, WelsIDctFourT4Rec) {
ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 64, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 8, 16)
ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 8, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 8, 16)
ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 8, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
FillWithRandomData (pInput1, 16 * 8);
FillWithRandomData (pInput2, 32 * 8);
FillWithRandomData (pPred, 32 * 8);
WelsDctFourT4_c (pDct[0], pInput1, 16, pInput2, 32);
sFuncPtrList.pfDctFourT4 (pDct[1], pInput1, 16, pInput2, 32);
WelsIDctFourT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
sFuncPtrList.pfIDctFourT4 (pRec[1], 16, pPred, 32, pDct[1]);
for (int32_t j = 0 ; j < 8; j++) {
for (int32_t i = 0 ; i < 8; i++) {
ASSERT_EQ (pRec[0][i + j * 16], pRec[1][i + j * 16]);
}
}
}
}
TEST (ReconstructionFunTest, WelsDequant4x4) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 32);
for (int32_t i = 0 ; i < 16; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
WelsQuant4x4_c (pInput[0], pFF, pMF);
sFuncPtrList.pfQuantization4x4 (pInput[1], pFF, pMF);
WelsDequant4x4_c (pInput[0], g_kuiDequantCoeff[uiQp]);
sFuncPtrList.pfDequantization4x4 (pInput[1], g_kuiDequantCoeff[uiQp]);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsDequantIHadamard4x4) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 32);
for (int32_t i = 0 ; i < 16; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
WelsQuant4x4_c (pInput[0], pFF, pMF);
sFuncPtrList.pfQuantization4x4 (pInput[1], pFF, pMF);
WelsDequantIHadamard4x4_c (pInput[0], g_kuiDequantCoeff[uiQp][0]);
sFuncPtrList.pfDequantizationIHadamard4x4 (pInput[1], g_kuiDequantCoeff[uiQp][0]);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsQuant4x4) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 32);
for (int32_t i = 0 ; i < 16; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
WelsQuant4x4_c (pInput[0], pFF, pMF);
sFuncPtrList.pfQuantization4x4 (pInput[1], pFF, pMF);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsQuant4x4Dc) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 32);
for (int32_t i = 0 ; i < 16; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
WelsQuant4x4Dc_c (pInput[0], pFF[0], pMF[0]);
sFuncPtrList.pfQuantizationDc4x4 (pInput[1], pFF[0], pMF[0]);
for (int32_t i = 0 ; i < 16; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsQuantFour4x4) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 128);
for (int32_t i = 0 ; i < 64; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
WelsQuantFour4x4_c (pInput[0], pFF, pMF);
sFuncPtrList.pfQuantizationFour4x4 (pInput[1], pFF, pMF);
for (int32_t i = 0 ; i < 64; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsQuantFour4x4Max) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
int16_t pMax[2][4];
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 128);
for (int32_t i = 0 ; i < 64; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
WelsQuantFour4x4Max_c (pInput[0], pFF, pMF, pMax[0]);
sFuncPtrList.pfQuantizationFour4x4Max (pInput[1], pFF, pMF, pMax[1]);
for (int32_t i = 0 ; i < 64; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
ASSERT_EQ (pMax[0][i >> 4], pMax[1][i >> 4]);
}
}
}
TEST (ReconstructionFunTest, WelsDeQuantFour4x4) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
WelsInitReconstructionFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 128);
for (int32_t i = 0 ; i < 64; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -32640, 32640);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
WelsQuantFour4x4_c (pInput[0], pFF, pMF);
sFuncPtrList.pfQuantizationFour4x4 (pInput[1], pFF, pMF);
WelsDequantFour4x4_c (pInput[0], g_kuiDequantCoeff[uiQp]);
sFuncPtrList.pfDequantizationFour4x4 (pInput[1], g_kuiDequantCoeff[uiQp]);
for (int32_t i = 0 ; i < 64; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
}
}
}
TEST (ReconstructionFunTest, WelsHadamardQuant2x2Skip) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 128);
for (int32_t i = 0 ; i < 64; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -4080, 4080);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
int32_t iSkip_c = WelsHadamardQuant2x2Skip_c (pInput[0], pFF[0], pMF[0]);
int32_t iSkip_test = sFuncPtrList.pfQuantizationHadamard2x2Skip (pInput[1], pFF[0], pMF[0]);
ASSERT_EQ ((iSkip_test != 0), (iSkip_c != 0));
}
}
TEST (ReconstructionFunTest, WelsHadamardQuant2x2) {
ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 4, 16)
ENFORCE_STACK_ALIGN_2D (int16_t, pBlock, 2, 4, 16)
int32_t iCpuCores = 0;
SWelsFuncPtrList sFuncPtrList;
uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
WelsInitEncodingFuncs (&sFuncPtrList, m_uiCpuFeatureFlag);
for (int32_t k = 0; k < RECONTEST_NUM; k++) {
uint8_t uiQp = rand() % 52;
FillWithRandomData ((uint8_t*)pInput[0], 128);
for (int32_t i = 0 ; i < 64; i++) {
pInput[0][i] = WELS_CLIP3 (pInput[0][i], -4080, 4080);
}
memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
const int16_t* pMF = g_kiQuantMF[uiQp];
const int16_t* pFF = g_iQuantIntraFF[uiQp];
int32_t iSkip_c = WelsHadamardQuant2x2_c (pInput[0], pFF[0], pMF[0], pDct[0], pBlock[0]);
int32_t iSkip_test = sFuncPtrList.pfQuantizationHadamard2x2 (pInput[1], pFF[0], pMF[0], pDct[1], pBlock[1]);
ASSERT_EQ ((iSkip_test != 0), (iSkip_c != 0));
for (int32_t i = 0 ; i < 64; i++) {
ASSERT_EQ (pInput[0][i], pInput[1][i]);
}
for (int32_t i = 0 ; i < 4; i++) {
ASSERT_EQ (pDct[0][i], pDct[1][i]);
ASSERT_EQ (pBlock[0][i], pBlock[1][i]);
}
}
}

View File

@@ -8,6 +8,7 @@ ENCODER_UNITTEST_CPP_SRCS=\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_GetIntraPredictor.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryAlloc.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\
ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))