openh264/codec/encoder/core/arm/reconstruct_neon.S
Commit 811c647c0e by Martin Storsjö: Remap registers to avoid clobbering the NEON registers q4-q7
According to the calling convention, the registers q4-q7 should be
preserved by functions. The caller (generated by the compiler) could
be using those registers anywhere for any intermediate data.

Functions that use 12 or fewer of the qX registers can avoid
violating the calling convention by simply using other registers instead
of the callee-saved registers q4-q7.

This change only remaps the registers used within functions, so it
does not affect performance at all. For example, functions that previously
used registers q0-q7 now use q0-q3 and q8-q11 instead.
2014-03-10 22:07:25 +02:00
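
The alternative would be to preserve q4-q7 (aliased to d8-d15) across each
function with vpush/vpop. A minimal hedged sketch of that approach, for
illustration only (not taken from this file):

    vpush   {d8-d15}        // spill the callee-saved NEON registers (q4-q7)
    vadd.s16 q4, q0, q1     // the body is now free to use q4-q7
    vpop    {d8-d15}        // restore them before returning
    bx      lr

Remapping onto q8-q11 instead, as this change does, avoids the extra stack
traffic and instructions entirely.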


/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4,:128], $5
vld1.64 {$1}, [$4,:128], $5
vld1.64 {$2}, [$4,:128], $5
vld1.64 {$3}, [$4,:128], $5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4,:128], $5
vst1.64 {$1}, [$4,:128], $5
vst1.64 {$2}, [$4,:128], $5
vst1.64 {$3}, [$4,:128], $5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
vld1.64 {$0}, [$4], $5
vld1.64 {$1}, [$4], $5
vld1.64 {$2}, [$4], $5
vld1.64 {$3}, [$4], $5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
vst1.64 {$0}, [$4], $5
vst1.64 {$1}, [$4], $5
vst1.64 {$2}, [$4], $5
vst1.64 {$3}, [$4], $5
// }
.endm
.macro LOAD_4x4_DATA_FOR_DCT
// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
vld2.16 {$0[0],$1[0]}, [$4], $5
vld2.16 {$2[0],$3[0]}, [$6], $7
vld2.16 {$0[1],$1[1]}, [$4], $5
vld2.16 {$2[1],$3[1]}, [$6], $7
vld2.16 {$0[2],$1[2]}, [$4], $5
vld2.16 {$2[2],$3[2]}, [$6], $7
vld2.16 {$0[3],$1[3]}, [$4], $5
vld2.16 {$2[3],$3[3]}, [$6], $7
// }
.endm
.macro LOAD_8x8_DATA_FOR_DCT
// { // input: $0~$7, src1*, src2*; untouched: r2 (src1_stride) and r4 (src2_stride)
vld1.64 {$0}, [$8], r2
vld1.64 {$4}, [$9], r4
vld1.64 {$1}, [$8], r2
vld1.64 {$5}, [$9], r4
vld1.64 {$2}, [$8], r2
vld1.64 {$6}, [$9], r4
vld1.64 {$3}, [$8], r2
vld1.64 {$7}, [$9], r4
// }
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
// { // input: src_d[0]~[3], working: [4]~[7]
vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
vshl.s16 $1, $7, #1
vshl.s16 $3, $6, #1
vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS // negate the result when coef <= 0, keep it when coef > 0
// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, mf_d1
veor.s16 $6, $6 // init 0 , and keep 0;
vaba.s16 $1, $0, $6 // f + abs(coef - 0)
vmull.s16 $7, $2, $4
vmull.s16 $8, $3, $5
vshr.s32 $7, #16
vshr.s32 $8, #16
vmovn.s32 $2, $7
vmovn.s32 $3, $8
vcgt.s16 $7, $0, #0 // lanes where coef > 0 become an all-ones mask
vbif.s16 $6, $1, $7 // where coef <= 0, copy the value; else keep 0 untouched
vshl.s16 $6, #1
vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
// }
.endm
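// Scalar reading of NEWQUANT_COEF_EACH_16BITS above, per 16-bit lane (an
// interpretation of the instructions, not taken from the original comments):
//   tmp = ((ff + abs(coef)) * mf) >> 16
//   out = (coef > 0) ? tmp : -tmp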
.macro NEWQUANT_COEF_EACH_16BITS_MAX // negate the result when coef <= 0, keep it when coef > 0
// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, mf_d1; also outputs the lanewise max
veor.s16 $6, $6 // init 0 , and keep 0;
vaba.s16 $1, $0, $6 // f + abs(coef - 0)
vmull.s16 $7, $2, $4
vmull.s16 $8, $3, $5
vshr.s32 $7, #16
vshr.s32 $8, #16
vmovn.s32 $2, $7
vmovn.s32 $3, $8
vcgt.s16 $7, $0, #0 // lanes where coef > 0 become an all-ones mask
vbif.s16 $6, $1, $7 // where coef <= 0, copy the value; else keep 0 untouched
vshl.s16 $6, #1
vmax.s16 $9, $2, $3
vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro QUANT_DUALWORD_COEF_EACH_16BITS // negate the result when coef <= 0, keep it when coef > 0
// { // input: coef, ff (dst), mf , working_d (all 0), working_q
vaba.s16 $1, $0, $3 // f + abs(coef - 0)
vmull.s16 $4, $1, $2 // *= mf
vshr.s32 $4, #16
vmovn.s32 $1, $4 // >> 16
vcgt.s16 $2, $0, #0 // lanes where coef > 0 become an all-ones mask
vbif.s16 $3, $1, $2 // where coef <= 0, copy the value; else keep 0 untouched
vshl.s16 $3, #1
vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD
// { // input: coef, dst_d, working_d (all 0x01)
vceq.s16 $1, $0, #0
vand.s16 $1, $2
vpadd.s16 $1, $1, $1
vpadd.s16 $1, $1, $1
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF
// { // input: coef_0, coef_1, max_q (identical to the following two), output: max_d0, max_d1
vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
// }
.endm
.macro ZERO_COUNT_IN_2_QUARWORD
// { // input: coef_0 (identical to $3 $4), coef_1 (identical to $5 $6), mask_q
vceq.s16 $0, #0
vceq.s16 $1, #0
vand.s16 $0, $2
vand.s16 $1, $2
vpadd.s16 $3, $3, $5
vpadd.s16 $4, $4, $6
vpadd.s16 $3, $3, $4 // 8-->4
vpadd.s16 $3, $3, $3
vpadd.s16 $3, $3, $3
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS
// { // input: src_d[0]~[3], working_d, dst_d
vshr.s64 $1, $0, #32
vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
vtrn.s16 $2, $1
vtrn.s32 $2, $1
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS
// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
vshr.s64 $1, $0, #32
vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
vtrn.s16 $2, $1
vrev32.16 $1, $1
vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
vrev64.16 $1, $2
vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
vsub.s16 $1, $2, $1
vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
vmovl.u8 $4,$0
vmovl.u8 $5,$1
vadd.s16 $4,$2
vadd.s16 $5,$3
vqmovun.s16 $0,$4
vqmovun.s16 $1,$5
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
// { // input: src_d[0]~[3], output: e_d[0]~[3];
vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
vshr.s16 $6, $1, #1
vshr.s16 $7, $3, #1
vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro ROW_TRANSFORM_0_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3];
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1
vshr.s16 $9, $3, #1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_0_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1
vshr.s32 $7, $3, #1
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4,:128], \arg5
vld1.64 {\arg1}, [\arg4,:128], \arg5
vld1.64 {\arg2}, [\arg4,:128], \arg5
vld1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm
.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
vst1.64 {\arg0}, [\arg4,:128], \arg5
vst1.64 {\arg1}, [\arg4,:128], \arg5
vst1.64 {\arg2}, [\arg4,:128], \arg5
vst1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm
.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4], \arg5
vld1.64 {\arg1}, [\arg4], \arg5
vld1.64 {\arg2}, [\arg4], \arg5
vld1.64 {\arg3}, [\arg4], \arg5
// }
.endm
.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
vst1.64 {\arg0}, [\arg4], \arg5
vst1.64 {\arg1}, [\arg4], \arg5
vst1.64 {\arg2}, [\arg4], \arg5
vst1.64 {\arg3}, [\arg4], \arg5
// }
.endm
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
// }
.endm
.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: \arg0~\arg7, src1*, src2*; untouched: r2 (src1_stride) and r4 (src2_stride)
vld1.64 {\arg0}, [\arg8], r2
vld1.64 {\arg4}, [\arg9], r4
vld1.64 {\arg1}, [\arg8], r2
vld1.64 {\arg5}, [\arg9], r4
vld1.64 {\arg2}, [\arg8], r2
vld1.64 {\arg6}, [\arg9], r4
vld1.64 {\arg3}, [\arg8], r2
vld1.64 {\arg7}, [\arg9], r4
// }
.endm
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], working: [4]~[7]
vadd.s16 \arg4, \arg0, \arg3 //int16 s[0] = data[i] + data[i3];
vsub.s16 \arg7, \arg0, \arg3 //int16 s[3] = data[i] - data[i3];
vadd.s16 \arg5, \arg1, \arg2 //int16 s[1] = data[i1] + data[i2];
vsub.s16 \arg6, \arg1, \arg2 //int16 s[2] = data[i1] - data[i2];
vadd.s16 \arg0, \arg4, \arg5 //int16 dct[i ] = s[0] + s[1];
vsub.s16 \arg2, \arg4, \arg5 //int16 dct[i2] = s[0] - s[1];
vshl.s16 \arg1, \arg7, #1
vshl.s16 \arg3, \arg6, #1
vadd.s16 \arg1, \arg1, \arg6 //int16 dct[i1] = (s[3] << 1) + s[2];
vsub.s16 \arg3, \arg7, \arg3 //int16 dct[i3] = s[3] - (s[2] << 1);
// }
.endm
.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
vtrn.s16 \arg0, \arg1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s16 \arg2, \arg3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vtrn.32 \arg0, \arg2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vtrn.32 \arg1, \arg3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, mf_d1
veor.s16 \arg6, \arg6 // init 0 , and keep 0;
vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
vmull.s16 \arg7, \arg2, \arg4
vmull.s16 \arg8, \arg3, \arg5
vshr.s32 \arg7, #16
vshr.s32 \arg8, #16
vmovn.s32 \arg2, \arg7
vmovn.s32 \arg3, \arg8
vcgt.s16 \arg7, \arg0, #0 // lanes where coef > 0 become an all-ones mask
vbif.s16 \arg6, \arg1, \arg7 // where coef <= 0, copy the value; else keep 0 untouched
vshl.s16 \arg6, #1
vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, mf_d1; also outputs the lanewise max
veor.s16 \arg6, \arg6 // init 0 , and keep 0;
vaba.s16 \arg1, \arg0, \arg6 // f + abs(coef - 0)
vmull.s16 \arg7, \arg2, \arg4
vmull.s16 \arg8, \arg3, \arg5
vshr.s32 \arg7, #16
vshr.s32 \arg8, #16
vmovn.s32 \arg2, \arg7
vmovn.s32 \arg3, \arg8
vcgt.s16 \arg7, \arg0, #0 // lanes where coef > 0 become an all-ones mask
vbif.s16 \arg6, \arg1, \arg7 // where coef <= 0, copy the value; else keep 0 untouched
vshl.s16 \arg6, #1
vmax.s16 \arg9, \arg2, \arg3
vsub.s16 \arg1, \arg1, \arg6 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
// { // input: coef, ff (dst), mf , working_d (all 0), working_q
vaba.s16 \arg1, \arg0, \arg3 // f + abs(coef - 0)
vmull.s16 \arg4, \arg1, \arg2 // *= mf
vshr.s32 \arg4, #16
vmovn.s32 \arg1, \arg4 // >> 16
vcgt.s16 \arg2, \arg0, #0 // lanes where coef > 0 become an all-ones mask
vbif.s16 \arg3, \arg1, \arg2 // where coef <= 0, copy the value; else keep 0 untouched
vshl.s16 \arg3, #1
vsub.s16 \arg1, \arg1, \arg3 // if x > 0, -= 0; else x-= 2x
// }
.endm
.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
// { // input: coef, dst_d, working_d (all 0x01)
vceq.s16 \arg1, \arg0, #0
vand.s16 \arg1, \arg2
vpadd.s16 \arg1, \arg1, \arg1
vpadd.s16 \arg1, \arg1, \arg1
// }
.endm
.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
// { // input: coef_0, coef_1, max_q (identical to the following two), output: max_d0, max_d1
vmax.s16 \arg2, \arg0, \arg1 // max 1st in \arg3 & max 2nd in \arg4
vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
vpmax.s16 \arg3, \arg3, \arg4 // max 1st in \arg3[0][1]
// }
.endm
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
// { // input: coef_0 (identical to \arg3 \arg4), coef_1 (identical to \arg5 \arg6), mask_q
vceq.s16 \arg0, #0
vceq.s16 \arg1, #0
vand.s16 \arg0, \arg2
vand.s16 \arg1, \arg2
vpadd.s16 \arg3, \arg3, \arg5
vpadd.s16 \arg4, \arg4, \arg6
vpadd.s16 \arg3, \arg3, \arg4 // 8-->4
vpadd.s16 \arg3, \arg3, \arg3
vpadd.s16 \arg3, \arg3, \arg3
// }
.endm
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
// { // input: src_d[0]~[3], working_d, dst_d
vshr.s64 \arg1, \arg0, #32
vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
vtrn.s16 \arg2, \arg1
vtrn.s32 \arg2, \arg1
// }
.endm
.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
vshr.s64 \arg1, \arg0, #32
vadd.s16 \arg2, \arg0, \arg1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
vsub.s16 \arg1, \arg0, \arg1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
vtrn.s16 \arg2, \arg1
vrev32.16 \arg1, \arg1
vtrn.s32 \arg2, \arg1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
vrev64.16 \arg1, \arg2
vadd.s16 \arg0, \arg2, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
vsub.s16 \arg1, \arg2, \arg1
vrev32.16 \arg1, \arg1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
vtrn.s32 \arg0, \arg1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
// }
.endm
.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
vmovl.u8 \arg4,\arg0
vmovl.u8 \arg5,\arg1
vadd.s16 \arg4,\arg2
vadd.s16 \arg5,\arg3
vqmovun.s16 \arg0,\arg4
vqmovun.s16 \arg1,\arg5
// }
.endm
.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], output: e_d[0]~[3];
vadd.s16 \arg4, \arg0, \arg2 //int16 e[i][0] = src[0] + src[2];
vsub.s16 \arg5, \arg0, \arg2 //int16 e[i][1] = src[0] - src[2];
vshr.s16 \arg6, \arg1, #1
vshr.s16 \arg7, \arg3, #1
vsub.s16 \arg6, \arg6, \arg3 //int16 e[i][2] = (src[1]>>1)-src[3];
vadd.s16 \arg7, \arg1, \arg7 //int16 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s16 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s16 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s16 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s16 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_d[0]~[3], output: e_q[0]~[3];
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vsubl.s16 \arg6, \arg1, \arg3 //int32 e[i][2] = src[1] - src[3];
vaddl.s16 \arg7, \arg1, \arg3 //int32 e[i][3] = src[1] + src[3];
// }
.endm
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1
vshr.s16 \arg9, \arg3, #1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vsub.s32 \arg6, \arg1, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1
vshr.s32 \arg7, \arg3, #1
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#endif
WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1
LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDctT4_neon
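// r0 = 4x4 dct output, r1/r2 = first source block and its stride,
// r3 and [sp] = second source block and its stride; computes the 4x4
// forward transform of the difference (first block minus second block).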
push {r4}
ldr r4, [sp, #4]
LOAD_4x4_DATA_FOR_DCT d4, d5, d6, d7, r1, r2, r3, r4
vsubl.u8 q0, d4, d6
vsubl.u8 q1, d5, d7
vtrn.s32 q0, q1
vswp d1, d2
// horizontal transform
DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
// transform element
MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
// vertical transform
DCT_ROW_TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
// transform element
MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
vst1.s16 {q0, q1}, [r0]!
pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
push {r4}
ldr r4, [sp, #4]
LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
vsubl.u8 q0, d16, d20
vsubl.u8 q1, d17, d21
vsubl.u8 q2, d18, d22
vsubl.u8 q3, d19, d23
MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
// horizontal transform
DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
// transform element
MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
// vertical transform
DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
vswp d1, d2
vswp d5, d6
vswp q1, q2
vst1.s16 {q0, q1}, [r0]!
vst1.s16 {q2, q3}, [r0]!
////////////////
LOAD_8x8_DATA_FOR_DCT d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
vsubl.u8 q0, d16, d20
vsubl.u8 q1, d17, d21
vsubl.u8 q2, d18, d22
vsubl.u8 q3, d19, d23
MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
// horizontal transform
DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
// transform element
MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
// vertical transform
DCT_ROW_TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
vswp d1, d2
vswp d5, d6
vswp q1, q2
vst1.s16 {q0, q1}, [r0]!
vst1.s16 {q2, q3}, [r0]!
pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
vld1.s16 {q2}, [r1]
vld1.s16 {q0, q1}, [r0]
vld1.s16 {q3}, [r2]
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
vst1.s16 {q2}, [r0]!
NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r0]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
vld1.s16 {q0, q1}, [r0]
vdup.s16 q2, r1 // even ff range [0, 768]
vdup.s16 q3, r2
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q0, q2, d4, d5, d6, d7, q9, q10, q11
vst1.s16 {q2}, [r0]!
NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r0]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
vld1.s16 {q2}, [r1]
vld1.s16 {q3}, [r2]
mov r1, r0
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q0, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS q1, q8, d16, d17, d6, d7, q9, q10, q11
vst1.s16 {q8}, [r1]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
vld1.s16 {q2}, [r1]
vld1.s16 {q3}, [r2]
mov r1, r0
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
vst1.s16 {q8}, [r1]!
vmov q12, q2
NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
vst1.s16 {q12}, [r1]! // then 1st 16 elem in d26 & d28
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
vst1.s16 {q8}, [r1]!
vmov q12, q2
NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
vst1.s16 {q12}, [r1]! // then 2nd 16 elem in d27 & d29
SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
vst1.s32 {d0[0]}, [r3]!
///////////
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
vst1.s16 {q8}, [r1]!
vmov q12, q2
NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
vst1.s16 {q12}, [r1]! // then 3rd 16 elem in d26 & d28
vld1.s16 {q0, q1}, [r0]!
vmov q8, q2
NEWQUANT_COEF_EACH_16BITS_MAX q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
vst1.s16 {q8}, [r1]!
vmov q12, q2
NEWQUANT_COEF_EACH_16BITS_MAX q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
vst1.s16 {q12}, [r1]! // then 4th 16 elem in d27 & d29
SELECT_MAX_IN_ABS_COEF q13, q14, q0, d0, d1
vst1.s32 {d0[0]}, [r3]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
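// r0 = output of 16 transformed DC values; r1 = dct buffer, from which the DC
// term of each of the 16 4x4 blocks (one every 16 int16_t) is gathered;
// applies a 4x4 Hadamard transform and rounds the results by (x+1)>>1.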
push {r2,r3}
mov r2, #64 // 2*16*sizeof(int16_t)
add r3, r1, #32
vld1.s16 {d0}, [r1], r2
vld1.s16 {d1}, [r3], r2
vld1.s16 {d4}, [r1], r2
vld1.s16 {d5}, [r3], r2
vld1.s16 {d2}, [r1], r2
vld1.s16 {d3}, [r3], r2
vld1.s16 {d6}, [r1], r2
vld1.s16 {d7}, [r3], r2
vtrn.16 q0, q2 // d0[0 4], d1[1 5]
vtrn.16 q1, q3 // d2[2 6], d3[3 7]
vld1.s16 {d16}, [r1], r2
vld1.s16 {d17}, [r3], r2
vld1.s16 {d20}, [r1], r2
vld1.s16 {d21}, [r3], r2
vld1.s16 {d18}, [r1], r2
vld1.s16 {d19}, [r3], r2
vld1.s16 {d22}, [r1], r2
vld1.s16 {d23}, [r3], r2
vtrn.16 q8, q10 //d16[08 12],d17[09 13]
vtrn.16 q9, q11 //d18[10 14],d19[11 15]
vtrn.32 q0, q8 // d0 [0 4 08 12] = dct[idx], d1[1 5 09 13] = dct[idx+16]
vtrn.32 q1, q9 // d2 [2 6 10 14] = dct[idx+64], d3[3 7 11 15] = dct[idx+80]
ROW_TRANSFORM_0_STEP d0, d1, d3, d2, q8, q11, q10, q9
TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_0_STEP q0, q1, q3, q2, q8, q11, q10, q9
TRANSFORM_4BYTES q0, q1, q3, q2, q8, q11, q10, q9
vrshrn.s32 d16, q0, #1
vrshrn.s32 d17, q1, #1
vrshrn.s32 d18, q2, #1
vrshrn.s32 d19, q3, #1
vst1.16 {q8, q9}, [r0] //store
pop {r2,r3}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
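// r0 = residual buffer: the four DC terms at int16_t offsets 0/16/32/48 are
// read and zeroed in place; r1 = ff, r2 = mf; the 2x2 Hadamard-transformed and
// quantized DCs are stored to both [r3] and the pointer passed on the stack;
// returns the number of nonzero quantized coefficients.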
vdup.s16 d1, r1 //ff
vdup.s16 d2, r2 //mf
veor d3, d3
mov r1, #32
mov r2, r0
vld1.s16 {d0[0]}, [r0], r1 //rs[00]
vst1.s16 {d3[0]}, [r2], r1 //rs[00]=0
vld1.s16 {d0[1]}, [r0], r1 //rs[16]
vst1.s16 {d3[0]}, [r2], r1 //rs[16]=0
vld1.s16 {d0[2]}, [r0], r1 //rs[32]
vst1.s16 {d3[0]}, [r2], r1 //rs[32]=0
vld1.s16 {d0[3]}, [r0], r1 //rs[48]
vst1.s16 {d3[0]}, [r2], r1 //rs[48]=0
HDM_QUANT_2x2_TOTAL_16BITS d0, d4, d5 // output d5
HDM_QUANT_2x2_TOTAL_16BITS d5, d4, d0 // output d0
QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
vst1.s16 d1, [r3] // store to dct
ldr r2, [sp, #0]
vst1.s16 d1, [r2] // store to block
mov r1, #1
vdup.s16 d3, r1
DC_ZERO_COUNT_IN_DUALWORD d1, d0, d3
vmov r0, r1, d0
and r0, #0x07 // range [0~4]
rsb r0, #4
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
vdup.s16 d3, r1
mov r1, #32
vld1.s16 {d0[0]}, [r0], r1 //rs[00]
vld1.s16 {d0[1]}, [r0], r1 //rs[16]
vld1.s16 {d0[2]}, [r0], r1 //rs[32]
vld1.s16 {d0[3]}, [r0], r1 //rs[48]
HDM_QUANT_2x2_TOTAL_16BITS d0, d1, d2 // output d2
HDM_QUANT_2x2_TOTAL_16BITS d2, d1, d0 // output d0
vabs.s16 d1, d0
vcgt.s16 d1, d1, d3 // abs(dct[i])>threshold;
vmov r0, r1, d1
orr r0, r1
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
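// r0 = pointer to 16 int16_t coefficients; returns the number of nonzero
// coefficients (16 minus the zero count computed below).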
push {r1}
vld1.s16 {q0, q1}, [r0]
vmov.s16 q8, #1
ZERO_COUNT_IN_2_QUARWORD q0, q1, q8, d0, d1, d2, d3
vmov r0, r1, d0
and r0, #0x1F // range [0~16]
rsb r0, #16
pop {r1}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
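// r0 = 16 coefficients, dequantized in place; r1 = eight 16-bit dequant
// multipliers, applied to both halves of the block.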
vld1.s16 {q0, q1}, [r0]
vld1.u16 {q2}, [r1]
vmul.s16 q8, q0, q2
vmul.s16 q9, q1, q2
vst1.s16 {q8, q9}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
vld1.u16 {q12}, [r1]
mov r1, r0
vld1.s16 {q0, q1}, [r0]!
vld1.s16 {q2, q3}, [r0]!
vmul.s16 q0, q0, q12
vld1.s16 {q8, q9}, [r0]!
vmul.s16 q1, q1, q12
vld1.s16 {q10, q11}, [r0]!
vst1.s16 {q0, q1}, [r1]!
vmul.s16 q2, q2, q12
vmul.s16 q3, q3, q12
vmul.s16 q8, q8, q12
vst1.s16 {q2, q3}, [r1]!
vmul.s16 q9, q9, q12
vmul.s16 q10, q10, q12
vmul.s16 q11, q11, q12
vst1.s16 {q8, q9}, [r1]!
vst1.s16 {q10, q11}, [r1]!
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
vld1.s16 {q0, q1}, [r0]
vdup.s16 q8, r1
IHDM_4x4_TOTAL_16BITS q0, q2, q3
IHDM_4x4_TOTAL_16BITS q1, q2, q3
MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
IHDM_4x4_TOTAL_16BITS q0, q2, q3
vmul.s16 q0, q8
IHDM_4x4_TOTAL_16BITS q1, q2, q3
vmul.s16 q1, q8
MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
vst1.s16 {q0, q1}, [r0]
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
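// r0/r1 = reconstruction output and its stride, r2/r3 = 4x4 prediction and its
// stride, [sp] = 16 dct coefficients; adds the inverse-transformed residual,
// rounded by (x+32)>>6, to the prediction and clips to [0, 255].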
vld1.u32 {d16[0]}, [r2], r3
push {r4}
ldr r4, [sp, #4]
vld1.u32 {d16[1]}, [r2], r3
vld4.s16 {d0, d1, d2, d3}, [r4] // cost 3 cycles!
vld1.u32 {d17[0]}, [r2], r3
vld1.u32 {d17[1]}, [r2], r3 // q8 (d16/d17) is pred
ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
MATRIX_TRANSFORM_EACH_16BITS d0, d1, d2, d3
ROW_TRANSFORM_1_STEP_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
TRANSFORM_TOTAL_16BITS d0, d1, d2, d3, d4, d5, d6, d7
vrshr.s16 d0, d0, #6
vrshr.s16 d1, d1, #6
vrshr.s16 d2, d2, #6
vrshr.s16 d3, d3, #6
//after rounding 6, clip into [0, 255]
vmovl.u8 q2,d16
vadd.s16 q0,q2
vqmovun.s16 d16,q0
vst1.32 {d16[0]},[r0],r1
vst1.32 {d16[1]},[r0],r1
vmovl.u8 q2,d17
vadd.s16 q1,q2
vqmovun.s16 d17,q1
vst1.32 {d17[0]},[r0],r1
vst1.32 {d17[1]},[r0]
pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
vld1.u64 {d24}, [r2], r3
push {r4}
ldr r4, [sp, #4]
vld1.u64 {d25}, [r2], r3
vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
vld1.u64 {d26}, [r2], r3
vld1.u64 {d27}, [r2], r3
vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
vswp d1, d4
vswp d3, d6
vswp q1, q2 // q0~q3
ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
vrshr.s16 q0, q0, #6
vrshr.s16 q1, q1, #6
vrshr.s16 q2, q2, #6
vrshr.s16 q3, q3, #6
//after rounding 6, clip into [0, 255]
vmovl.u8 q8,d24
vadd.s16 q0,q8
vqmovun.s16 d24,q0
vst1.u8 {d24},[r0],r1
vmovl.u8 q8,d25
vadd.s16 q1,q8
vqmovun.s16 d25,q1
vst1.u8 {d25},[r0],r1
vmovl.u8 q8,d26
vadd.s16 q2,q8
vqmovun.s16 d26,q2
vst1.u8 {d26},[r0],r1
vmovl.u8 q8,d27
vadd.s16 q3,q8
vqmovun.s16 d27,q3
vst1.u8 {d27},[r0],r1
vld1.u64 {d24}, [r2], r3
vld1.u64 {d25}, [r2], r3
vld4.s16 {d0, d1, d2, d3}, [r4]! // cost 3 cycles!
vld1.u64 {d26}, [r2], r3
vld1.u64 {d27}, [r2], r3
vld4.s16 {d4, d5, d6, d7}, [r4]! // cost 3 cycles!
vswp d1, d4
vswp d3, d6
vswp q1, q2 // q0~q3
ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
MATRIX_TRANSFORM_EACH_16BITS q0, q1, q2, q3
ROW_TRANSFORM_1_STEP_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_TOTAL_16BITS q0, q1, q2, q3, q8, q9, q10, q11
vrshr.s16 q0, q0, #6
vrshr.s16 q1, q1, #6
vrshr.s16 q2, q2, #6
vrshr.s16 q3, q3, #6
//after rounding 6, clip into [0, 255]
vmovl.u8 q8,d24
vadd.s16 q0,q8
vqmovun.s16 d24,q0
vst1.u8 {d24},[r0],r1
vmovl.u8 q8,d25
vadd.s16 q1,q8
vqmovun.s16 d25,q1
vst1.u8 {d25},[r0],r1
vmovl.u8 q8,d26
vadd.s16 q2,q8
vqmovun.s16 d26,q2
vst1.u8 {d26},[r0],r1
vmovl.u8 q8,d27
vadd.s16 q3,q8
vqmovun.s16 d27,q3
vst1.u8 {d27},[r0],r1
pop {r4}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
push {r4}
ldr r4, [sp, #4]
vld1.s16 {q8,q9}, [r4]
vrshr.s16 q8, q8, #6
vrshr.s16 q9, q9, #6
vdup.s16 d20, d16[0]
vdup.s16 d21, d16[1]
vdup.s16 d22, d16[2]
vdup.s16 d23, d16[3]
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vdup.s16 d20, d17[0]
vdup.s16 d21, d17[1]
vdup.s16 d22, d17[2]
vdup.s16 d23, d17[3]
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vdup.s16 d20, d18[0]
vdup.s16 d21, d18[1]
vdup.s16 d22, d18[2]
vdup.s16 d23, d18[3]
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vdup.s16 d20, d19[0]
vdup.s16 d21, d19[1]
vdup.s16 d22, d19[2]
vdup.s16 d23, d19[3]
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
vld1.u8 {q0}, [r2], r3
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP d0, d1, q10, q11, q12, q13
vst1.u8 {q0}, [r0], r1
pop {r4}
WELS_ASM_FUNC_END
#endif