Merge pull request #1249 from dongzha/addArm32SCCNew

add arm 32/64 code and UT for SVC SCC motion estimation
This commit is contained in:
zhilwang 2014-08-08 09:19:51 +08:00
commit 439e51bc11
8 changed files with 532 additions and 1 deletions

View File

@ -45,6 +45,8 @@
4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446F818BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp */; }; 4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446F818BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp */; };
4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446FA18BC605C0017DF25 /* wels_preprocess.cpp */; }; 4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE446FA18BC605C0017DF25 /* wels_preprocess.cpp */; };
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; }; 4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; };
6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */; };
6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */; };
9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; }; 9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; };
9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; }; 9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; };
F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; }; F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; };
@ -154,6 +156,8 @@
4CE446FE18BC605C0017DF25 /* welsEncoderExt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = welsEncoderExt.h; sourceTree = "<group>"; }; 4CE446FE18BC605C0017DF25 /* welsEncoderExt.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = welsEncoderExt.h; sourceTree = "<group>"; };
4CE4470418BC605C0017DF25 /* wels_enc_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_enc_export.def; sourceTree = "<group>"; }; 4CE4470418BC605C0017DF25 /* wels_enc_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_enc_export.def; sourceTree = "<group>"; };
4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsEncoderExt.cpp; sourceTree = "<group>"; }; 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsEncoderExt.cpp; sourceTree = "<group>"; };
6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = svc_motion_estimation.S; sourceTree = "<group>"; };
6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = svc_motion_estimation_aarch64_neon.S; path = arm64/svc_motion_estimation_aarch64_neon.S; sourceTree = "<group>"; };
9AED664819469FAF009A3567 /* welsCodecTrace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = welsCodecTrace.h; path = ../../../common/inc/welsCodecTrace.h; sourceTree = "<group>"; }; 9AED664819469FAF009A3567 /* welsCodecTrace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = welsCodecTrace.h; path = ../../../common/inc/welsCodecTrace.h; sourceTree = "<group>"; };
9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; }; 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };
9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; }; 9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; };
@ -177,6 +181,7 @@
4C34066418C57D0400DFA14A /* arm */ = { 4C34066418C57D0400DFA14A /* arm */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
6CA38DA21991CACE003EAAE0 /* svc_motion_estimation.S */,
4C34066618C57D0400DFA14A /* intra_pred_neon.S */, 4C34066618C57D0400DFA14A /* intra_pred_neon.S */,
4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */, 4C34066718C57D0400DFA14A /* intra_pred_sad_3_opt_neon.S */,
4C34066918C57D0400DFA14A /* memory_neon.S */, 4C34066918C57D0400DFA14A /* memory_neon.S */,
@ -189,6 +194,7 @@
4CB8F2B219235FAC005D6386 /* arm64 */ = { 4CB8F2B219235FAC005D6386 /* arm64 */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
6CA38DA41991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S */,
F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */, F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */,
F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */, F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */,
4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */, 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,
@ -423,6 +429,7 @@
4CE4471D18BC605C0017DF25 /* property.cpp in Sources */, 4CE4471D18BC605C0017DF25 /* property.cpp in Sources */,
4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */, 4CE4471018BC605C0017DF25 /* decode_mb_aux.cpp in Sources */,
4CE4472018BC605C0017DF25 /* sample.cpp in Sources */, 4CE4472018BC605C0017DF25 /* sample.cpp in Sources */,
6CA38DA31991CACE003EAAE0 /* svc_motion_estimation.S in Sources */,
4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */, 4CE4471318BC605C0017DF25 /* encoder_data_tables.cpp in Sources */,
4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */, 4C34067118C57D0400DFA14A /* pixel_neon.S in Sources */,
9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */, 9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */,
@ -455,6 +462,7 @@
4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */, 4CE4471218BC605C0017DF25 /* encoder.cpp in Sources */,
4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */, 4CE4471618BC605C0017DF25 /* get_intra_predictor.cpp in Sources */,
4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */, 4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */,
6CA38DA51991D31A003EAAE0 /* svc_motion_estimation_aarch64_neon.S in Sources */,
4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */, 4CE4471418BC605C0017DF25 /* encoder_ext.cpp in Sources */,
4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */, 4C34067218C57D0400DFA14A /* reconstruct_neon.S in Sources */,
); );

View File

@ -0,0 +1,168 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
// int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride)
// Returns in r0 the sum of the 64 bytes of one 8x8 block.
// r0 is used as the row pointer and r1 as the byte step between rows.
WELS_ASM_FUNC_BEGIN SumOf8x8SingleBlock_neon
vld1.64 {d0}, [r0], r1 // rows 0..6: load 8 bytes, then advance r0 by r1
vld1.64 {d1}, [r0], r1
vld1.64 {d2}, [r0], r1
vld1.64 {d3}, [r0], r1
vld1.64 {d4}, [r0], r1
vld1.64 {d5}, [r0], r1
vld1.64 {d6}, [r0], r1
vld1.64 {d7}, [r0] // row 7: last row, pointer is not advanced
vpaddl.u8 q0, q0 // rows 0-1: add adjacent byte pairs into eight 16-bit sums
vpadal.u8 q0, q1 // accumulate the pairwise byte sums of rows 2-3
vpadal.u8 q0, q2 // accumulate the pairwise byte sums of rows 4-5
vpadal.u8 q0, q3 // accumulate the pairwise byte sums of rows 6-7
vpaddl.u16 q0, q0 // eight 16-bit sums -> four 32-bit sums
vpadd.i32 d0, d1 // four 32-bit sums -> two
vpadd.i32 d0, d0 // two -> block total, present in both lanes of d0
vmov r0, r1, d0 // r0 = block total (return value); r1 receives the duplicate lane
WELS_ASM_FUNC_END
// int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride)
// Returns in r0 the sum of the 256 bytes of one 16x16 block.
// r0 is used as the row pointer and r1 as the byte step between rows.
WELS_ASM_FUNC_BEGIN SumOf16x16SingleBlock_neon
vld1.64 {q0}, [r0], r1 // row 0: load 16 bytes, then advance r0 by r1
vpaddl.u8 q0, q0 // row 0: add adjacent byte pairs into eight 16-bit sums
.rept 15 // rows 1..15, unrolled by the assembler
vld1.64 {q1}, [r0], r1
vpadal.u8 q0, q1 // accumulate the pairwise byte sums of this row
.endr
vpaddl.u16 q0, q0 // eight 16-bit sums -> four 32-bit sums
vpadd.i32 d0, d1 // four 32-bit sums -> two
vpadd.i32 d0, d0 // two -> block total, present in both lanes of d0
vmov r0, r1, d0 // r0 = block total (return value); r1 receives the duplicate lane
WELS_ASM_FUNC_END
// For every (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight: sums the 8x8 block whose
// top-left byte is pRefPicture[y * kiRefStride + x], stores the low 16 bits of the sum in
// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
// NOTE(review): both loops test their counters only at the bottom, so kiWidth and
// kiHeight are expected to be at least 1.
WELS_ASM_FUNC_BEGIN SumOf8x8BlockOfFrame_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
stmdb sp!, {r4-r8} // five registers pushed (20 bytes), so the stack arguments now start at sp+20
ldr r5, [sp, #24] //pTimesOfFeatureValue
ldr r4, [sp, #20] //pFeatureOfBlock
mov r8, r0
mov r6, r1 // r6 = kiWidth, kept for the whole function
add r8, r6 // r8 = picture row start + kiWidth
add r4, r6, lsl #1 // r4 = feature row start + kiWidth 16-bit entries
_height_loop8x8:
mov r7, r6 // r7 = columns left in this row (kiWidth down to 1)
_width_loop8x8:
subs r0, r8, r7 // r0 = address of the block at column x = kiWidth - r7
vld1.64 {d0}, [r0], r3 // load the eight 8-byte rows of the block, stepping by kiRefStride
vld1.64 {d1}, [r0], r3
vld1.64 {d2}, [r0], r3
vld1.64 {d3}, [r0], r3
vld1.64 {d4}, [r0], r3
vld1.64 {d5}, [r0], r3
vld1.64 {d6}, [r0], r3
vld1.64 {d7}, [r0]
vpaddl.u8 q0, q0 // pairwise byte sums of rows 0-1 as eight 16-bit values
vpadal.u8 q0, q1 // accumulate rows 2-3
vpadal.u8 q0, q2 // accumulate rows 4-5
vpadal.u8 q0, q3 // accumulate rows 6-7
vpaddl.u16 q0, q0 // eight 16-bit sums -> four 32-bit sums
vpadd.i32 d0, d1 // four -> two
vpadd.i32 d0, d0 // two -> block total in both lanes of d0
subs r1, r4, r7, lsl #1 // r1 = address of the feature entry for column x
vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
vmov r0, r1, d0 // r0 = block total
add r1, r5, r0, lsl #2 // r1 = address of pTimesOfFeatureValue[total] (32-bit entries)
ldr r0, [r1]
add r0, #1 // increment the occurrence counter of this feature value
str r0, [r1]
subs r7, #1
bne _width_loop8x8
add r8, r3 // next picture row: advance by kiRefStride
add r4, r6, lsl #1 // next feature row: advance by kiWidth entries
subs r2, #1 // r2 = rows left (kiHeight down to 1)
bne _height_loop8x8
ldmia sp!, {r4-r8} // restore the registers saved on entry
WELS_ASM_FUNC_END
// For every (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight: sums the 16x16 block whose
// top-left byte is pRefPicture[y * kiRefStride + x], stores the low 16 bits of the sum in
// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
// NOTE(review): both loops test their counters only at the bottom, so kiWidth and
// kiHeight are expected to be at least 1.
WELS_ASM_FUNC_BEGIN SumOf16x16BlockOfFrame_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
stmdb sp!, {r4-r8} // five registers pushed (20 bytes), so the stack arguments now start at sp+20
ldr r5, [sp, #24] //pTimesOfFeatureValue
ldr r4, [sp, #20] //pFeatureOfBlock
mov r8, r0
mov r6, r1 // r6 = kiWidth, kept for the whole function
add r8, r6 // r8 = picture row start + kiWidth
add r4, r6, lsl #1 // r4 = feature row start + kiWidth 16-bit entries
_height_loop16x16:
mov r7, r6 // r7 = columns left in this row (kiWidth down to 1)
_width_loop16x16:
subs r0, r8, r7 // r0 = address of the block at column x = kiWidth - r7
vld1.64 {q0}, [r0], r3 // row 0 of the block, then step by kiRefStride
vpaddl.u8 q0, q0 // pairwise byte sums of row 0 as eight 16-bit values
.rept 15 // rows 1..15, unrolled by the assembler
vld1.64 {q1}, [r0], r3
vpadal.u8 q0, q1 // accumulate the pairwise byte sums of this row
.endr
vpaddl.u16 q0, q0 // eight 16-bit sums -> four 32-bit sums
vpadd.i32 d0, d1 // four -> two
vpadd.i32 d0, d0 // two -> block total in both lanes of d0
subs r1, r4, r7, lsl #1 // r1 = address of the feature entry for column x
vst1.16 {d0[0]}, [r1] // sum -> pFeatureOfBlock[i]
vmov r0, r1, d0 // r0 = block total
add r1, r5, r0, lsl #2 // r1 = address of pTimesOfFeatureValue[total] (32-bit entries)
ldr r0, [r1]
add r0, #1 // increment the occurrence counter of this feature value
str r0, [r1]
subs r7, #1
bne _width_loop16x16
add r8, r3 // next picture row: advance by kiRefStride
add r4, r6, lsl #1 // next feature row: advance by kiWidth entries
subs r2, #1 // r2 = rows left (kiHeight down to 1)
bne _height_loop16x16
ldmia sp!, {r4-r8} // restore the registers saved on entry
WELS_ASM_FUNC_END
#endif

View File

@ -0,0 +1,151 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON_AARCH64
.text
#include "arm_arch64_common_macro.S"
// int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride)
// Returns in w0 the sum of the 64 bytes of one 8x8 block.
// x0 is used as the row pointer and x1 as the byte step between rows.
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
sxtw x1, w1 // kiRefStride is a 32-bit int: the upper half of x1 is not guaranteed by the AAPCS64, so sign-extend before using it as a 64-bit post-index
ld1 {v0.d}[0], [x0], x1 // rows 0..6: load 8 bytes into one 64-bit lane, then advance x0 by the stride
ld1 {v0.d}[1], [x0], x1
ld1 {v1.d}[0], [x0], x1
ld1 {v1.d}[1], [x0], x1
ld1 {v2.d}[0], [x0], x1
ld1 {v2.d}[1], [x0], x1
ld1 {v3.d}[0], [x0], x1
ld1 {v3.d}[1], [x0] // row 7: last row, pointer is not advanced
uaddlp v0.8h, v0.16b // rows 0-1: add adjacent byte pairs into eight 16-bit sums
uadalp v0.8h, v1.16b // accumulate the pairwise byte sums of rows 2-3
uadalp v0.8h, v2.16b // accumulate the pairwise byte sums of rows 4-5
uadalp v0.8h, v3.16b // accumulate the pairwise byte sums of rows 6-7
uaddlv s0, v0.8h // add the eight 16-bit sums into one 32-bit total
mov x0, v0.d[0] // return value: block total in the low 32 bits
WELS_ASM_AARCH64_FUNC_END
// int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride)
// Returns in w0 the sum of the 256 bytes of one 16x16 block.
// x0 is used as the row pointer and x1 as the byte step between rows.
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16SingleBlock_AArch64_neon
sxtw x1, w1 // kiRefStride is a 32-bit int: the upper half of x1 is not guaranteed by the AAPCS64, so sign-extend before using it as a 64-bit post-index
ld1 {v0.16b}, [x0], x1 // row 0: load 16 bytes, then advance x0 by the stride
uaddlp v0.8h, v0.16b // row 0: add adjacent byte pairs into eight 16-bit sums
.rept 15 // rows 1..15, unrolled by the assembler
ld1 {v1.16b}, [x0], x1
uadalp v0.8h, v1.16b // accumulate the pairwise byte sums of this row
.endr
uaddlv s0, v0.8h // add the eight 16-bit sums into one 32-bit total
mov x0, v0.d[0] // return value: block total in the low 32 bits
WELS_ASM_AARCH64_FUNC_END
// For every (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight: sums the 8x8 block whose
// top-left byte is pRefPicture[y * kiRefStride + x], stores the low 16 bits of the sum in
// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
// Both loops test their counters only at the bottom, so kiWidth and kiHeight must be at least 1.
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
//x5: pTimesOfFeatureValue
//x4: pFeatureOfBlock
// The three int32_t arguments are used below as 64-bit loop counters and address offsets.
// The AAPCS64 does not guarantee the upper halves of x1..x3, so sign-extend them first.
sxtw x1, w1 // kiWidth
sxtw x2, w2 // kiHeight
sxtw x3, w3 // kiRefStride
mov x8, x0
mov x6, x1 // x6 = kiWidth, kept for the whole function
add x8, x8, x6 // x8 = picture row start + kiWidth
add x4, x4, x6, lsl #1 // x4 = feature row start + kiWidth 16-bit entries
_height_loop8x8:
mov x7, x6 // x7 = columns left in this row (kiWidth down to 1)
_width_loop8x8:
subs x0, x8, x7 // x0 = address of the block at column x = kiWidth - x7
ld1 {v0.d}[0], [x0], x3 // load the eight 8-byte rows of the block, stepping by kiRefStride
ld1 {v0.d}[1], [x0], x3
ld1 {v1.d}[0], [x0], x3
ld1 {v1.d}[1], [x0], x3
ld1 {v2.d}[0], [x0], x3
ld1 {v2.d}[1], [x0], x3
ld1 {v3.d}[0], [x0], x3
ld1 {v3.d}[1], [x0]
uaddlp v0.8h, v0.16b // pairwise byte sums of rows 0-1 as eight 16-bit values
uadalp v0.8h, v1.16b // accumulate rows 2-3
uadalp v0.8h, v2.16b // accumulate rows 4-5
uadalp v0.8h, v3.16b // accumulate rows 6-7
uaddlv s0, v0.8h // add the eight 16-bit sums into one 32-bit total
subs x1, x4, x7, lsl #1 // x1 = address of the feature entry for column x
st1 {v0.h}[0], [x1] // sum -> pFeatureOfBlock[i]
mov w0, #0
ins v0.s[1], w0 // clear lane 1 so the 64-bit move below yields the zero-extended total
mov x0, v0.d[0] // x0 = block total
add x1, x5, x0, lsl #2 // x1 = address of pTimesOfFeatureValue[total] (32-bit entries)
ldr w0, [x1]
add w0, w0, #1 // increment the occurrence counter of this feature value
str w0, [x1]
subs x7, x7, #1
cbnz x7, _width_loop8x8
add x8, x8, x3 // next picture row: advance by kiRefStride
add x4, x4, x6, lsl #1 // next feature row: advance by kiWidth entries
subs x2, x2, #1 // x2 = rows left (kiHeight down to 1)
cbnz x2, _height_loop8x8
WELS_ASM_AARCH64_FUNC_END
// For every (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight: sums the 16x16 block whose
// top-left byte is pRefPicture[y * kiRefStride + x], stores the low 16 bits of the sum in
// pFeatureOfBlock[y * kiWidth + x] and increments pTimesOfFeatureValue[sum].
// Both loops test their counters only at the bottom, so kiWidth and kiHeight must be at least 1.
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
//(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,const int32_t kiRefStride,uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
//x5: pTimesOfFeatureValue
//x4: pFeatureOfBlock
// The three int32_t arguments are used below as 64-bit loop counters and address offsets.
// The AAPCS64 does not guarantee the upper halves of x1..x3, so sign-extend them first.
sxtw x1, w1 // kiWidth
sxtw x2, w2 // kiHeight
sxtw x3, w3 // kiRefStride
mov x8, x0
mov x6, x1 // x6 = kiWidth, kept for the whole function
add x8, x8, x6 // x8 = picture row start + kiWidth
add x4, x4, x6, lsl #1 // x4 = feature row start + kiWidth 16-bit entries
_height_loop16x16:
mov x7, x6 // x7 = columns left in this row (kiWidth down to 1)
_width_loop16x16:
subs x0, x8, x7 // x0 = address of the block at column x = kiWidth - x7
ld1 {v0.16b}, [x0], x3 // row 0 of the block, then step by kiRefStride
uaddlp v0.8h, v0.16b // pairwise byte sums of row 0 as eight 16-bit values
.rept 15 // rows 1..15, unrolled by the assembler
ld1 {v1.16b}, [x0], x3
uadalp v0.8h, v1.16b // accumulate the pairwise byte sums of this row
.endr
uaddlv s0, v0.8h // add the eight 16-bit sums into one 32-bit total
subs x1, x4, x7, lsl #1 // x1 = address of the feature entry for column x
st1 {v0.h}[0], [x1] // sum -> pFeatureOfBlock[i]
mov w0, #0
ins v0.s[1], w0 // clear lane 1 so the 64-bit move below yields the zero-extended total
mov x0, v0.d[0] // x0 = block total
add x1, x5, x0, lsl #2 // x1 = address of pTimesOfFeatureValue[total] (32-bit entries)
ldr w0, [x1]
add w0, w0, #1 // increment the occurrence counter of this feature value
str w0, [x1]
subs x7, x7, #1
cbnz x7, _width_loop16x16
add x8, x8, x3 // next picture row: advance by kiRefStride
add x4, x4, x6, lsl #1 // next feature row: advance by kiWidth entries
subs x2, x2, #1 // x2 = rows left (kiHeight down to 1)
cbnz x2, _height_loop16x16
WELS_ASM_AARCH64_FUNC_END
#endif

View File

@ -244,6 +244,33 @@ void SumOf8x8BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const
void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, void SumOf16x16BlockOfFrame_c (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride, const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
#ifdef HAVE_NEON
// 32-bit ARM NEON variants of the block-feature routines, compiled only when HAVE_NEON is defined.
// Declared extern "C" so the names are not C++-mangled.
extern "C"
{
// Return the sum of the pixels of one 8x8 / 16x16 block starting at pRef, rows kiRefStride bytes apart.
int32_t SumOf8x8SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_neon (uint8_t* pRef, const int32_t kiRefStride);
// Same parameter list as SumOf16x16BlockOfFrame_c: write per-position block sums to pFeatureOfBlock
// and count each sum value in pTimesOfFeatureValue.
void SumOf8x8BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
void SumOf16x16BlockOfFrame_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
}
#endif
#ifdef HAVE_NEON_AARCH64
// AArch64 NEON variants of the block-feature routines, compiled only when HAVE_NEON_AARCH64 is defined.
// Declared extern "C" so the names are not C++-mangled.
extern "C"
{
// Return the sum of the pixels of one 8x8 / 16x16 block starting at pRef, rows kiRefStride bytes apart.
int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
// Same parameter list as SumOf16x16BlockOfFrame_c: write per-position block sums to pFeatureOfBlock
// and count each sum value in pTimesOfFeatureValue.
void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
void SumOf16x16BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
}
#endif
int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight, int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight,
const int32_t iNeedFeatureStorage, const int32_t iNeedFeatureStorage,
SScreenBlockFeatureStorage* pScreenBlockFeatureStorage); SScreenBlockFeatureStorage* pScreenBlockFeatureStorage);

View File

@ -102,6 +102,23 @@ void WelsInitMeFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8? //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c; pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c; pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
#if defined (HAVE_NEON)
// 32-bit ARM NEON build: replace the C feature-search routines assigned above with the NEON versions.
// NOTE(review): these assignments are made whenever HAVE_NEON is defined; no runtime test of
// uiCpuFlag is visible in this hunk -- confirm that is intended.
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_neon;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_neon;
#endif
#if defined (HAVE_NEON_AARCH64)
// AArch64 NEON build: same replacement with the AArch64 versions (index 0 = 8x8, index 1 = 16x16).
// NOTE(review): as above, no runtime test of uiCpuFlag is visible here -- confirm that is intended.
//for feature search
pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_AArch64_neon;
pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_AArch64_neon;
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_AArch64_neon;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_AArch64_neon;
#endif
} }
} }

View File

@ -53,6 +53,7 @@ ENCODER_ASM_ARM_SRCS=\
$(ENCODER_SRCDIR)/core/arm/memory_neon.S\ $(ENCODER_SRCDIR)/core/arm/memory_neon.S\
$(ENCODER_SRCDIR)/core/arm/pixel_neon.S\ $(ENCODER_SRCDIR)/core/arm/pixel_neon.S\
$(ENCODER_SRCDIR)/core/arm/reconstruct_neon.S\ $(ENCODER_SRCDIR)/core/arm/reconstruct_neon.S\
$(ENCODER_SRCDIR)/core/arm/svc_motion_estimation.S\
ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ)) ENCODER_OBJS += $(ENCODER_ASM_ARM_SRCS:.S=.$(OBJ))
endif endif
@ -64,6 +65,7 @@ ENCODER_ASM_ARM64_SRCS=\
$(ENCODER_SRCDIR)/core/arm64/memory_aarch64_neon.S\ $(ENCODER_SRCDIR)/core/arm64/memory_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\ $(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\ $(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\
$(ENCODER_SRCDIR)/core/arm64/svc_motion_estimation_aarch64_neon.S\
ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ)) ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))
endif endif

View File

@ -0,0 +1,157 @@
#include <gtest/gtest.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "cpu_core.h"
#include "cpu.h"
#include "macros.h"
#include "svc_motion_estimate.h"
using namespace WelsSVCEnc;
#define SVC_ME_TEST_NUM 10
// Writes Len pseudo-random bytes (rand() reduced modulo 256) to p[0 .. Len-1].
// Nothing is written when Len is zero or negative.
static void FillWithRandomData (uint8_t* p, int32_t Len) {
  int32_t iPos = 0;
  while (iPos < Len) {
    p[iPos] = rand() % 256;
    ++iPos;
  }
}
//preprocess related
// Reference implementation: returns the sum of the 8x8 block of bytes whose top-left
// element is pRef[0]; consecutive rows are kiRefStride bytes apart.
int32_t SumOf8x8SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
  int32_t iTotal = 0;
  const uint8_t* pRow = pRef;
  for (int32_t iRow = 0; iRow < 8; ++iRow) {
    for (int32_t iCol = 0; iCol < 8; ++iCol) {
      iTotal += pRow[iCol];
    }
    pRow += kiRefStride;
  }
  return iTotal;
}
// Reference implementation: returns the sum of the 16x16 block of bytes whose top-left
// element is pRef[0]; consecutive rows are kiRefStride bytes apart.
int32_t SumOf16x16SingleBlock_ref (uint8_t* pRef, const int32_t kiRefStride) {
  int32_t iTotal = 0;
  const uint8_t* pRow = pRef;
  for (int32_t iRow = 0; iRow < 16; ++iRow) {
    for (int32_t iCol = 0; iCol < 16; ++iCol) {
      iTotal += pRow[iCol];
    }
    pRow += kiRefStride;
  }
  return iTotal;
}
// Reference implementation used as the test anchor. For every position (x, y) with
// 0 <= x < kiWidth and 0 <= y < kiHeight it sums the 8x8 block whose top-left byte is
// pRefPicture[y * kiRefStride + x], stores the sum in pFeatureOfBlock[y * kiWidth + x]
// and increments pTimesOfFeatureValue[sum].
// The block sum is computed inline so that this reference does not depend on
// SumOf8x8SingleBlock_c, which is itself one of the functions under test.
void SumOf8x8BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
                               const int32_t kiRefStride,
                               uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
  for (int32_t y = 0; y < kiHeight; y++) {
    const uint8_t* pRowStart = pRefPicture + kiRefStride * y;
    uint16_t* pBuffer = pFeatureOfBlock + kiWidth * y;
    for (int32_t x = 0; x < kiWidth; x++) {
      const uint8_t* pBlockRow = pRowStart + x;
      int32_t iSum = 0;
      for (int32_t iRow = 0; iRow < 8; iRow++) {
        for (int32_t iCol = 0; iCol < 8; iCol++) {
          iSum += pBlockRow[iCol];
        }
        pBlockRow += kiRefStride;
      }
      pBuffer[x] = (uint16_t) iSum; // at most 64 * 255, always fits in 16 bits
      pTimesOfFeatureValue[iSum]++;
    }
  }
}
// Reference implementation used as the test anchor. For every position (x, y) with
// 0 <= x < kiWidth and 0 <= y < kiHeight it sums the 16x16 block whose top-left byte is
// pRefPicture[y * kiRefStride + x], stores the sum in pFeatureOfBlock[y * kiWidth + x]
// and increments pTimesOfFeatureValue[sum].
// The block sum is computed inline so that this reference does not depend on
// SumOf16x16SingleBlock_c, which is itself one of the functions under test.
void SumOf16x16BlockOfFrame_ref (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
                                 const int32_t kiRefStride,
                                 uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
  //TODO: this is similar to SumOf8x8BlockOfFrame_ref except for the block size, refactor-able?
  for (int32_t y = 0; y < kiHeight; y++) {
    const uint8_t* pRowStart = pRefPicture + kiRefStride * y;
    uint16_t* pBuffer = pFeatureOfBlock + kiWidth * y;
    for (int32_t x = 0; x < kiWidth; x++) {
      const uint8_t* pBlockRow = pRowStart + x;
      int32_t iSum = 0;
      for (int32_t iRow = 0; iRow < 16; iRow++) {
        for (int32_t iCol = 0; iCol < 16; iCol++) {
          iSum += pBlockRow[iCol];
        }
        pBlockRow += kiRefStride;
      }
      pBuffer[x] = (uint16_t) iSum; // at most 256 * 255 = 65280, always fits in 16 bits
      pTimesOfFeatureValue[iSum]++;
    }
  }
}
// Generates a gtest case named after `method` in suite SVC_ME_FunTest.
// Each of the SVC_ME_TEST_NUM rounds fills a 16-row buffer with a stride of 320 bytes with
// random data and asserts that `method` returns the same block sum as `anchor` for the
// block at the start of the buffer.
#define GENERATE_SumOfSingleBlock(anchor, method) \
TEST (SVC_ME_FunTest, method) {\
ENFORCE_STACK_ALIGN_1D (uint8_t, uiRefBuf, 16*320, 16);\
int32_t iRes[2];\
for (int32_t k = 0; k < SVC_ME_TEST_NUM; k++) {\
FillWithRandomData (uiRefBuf,16*320);\
iRes[0] = anchor (uiRefBuf,320);\
iRes[1] = method (uiRefBuf,320);\
ASSERT_EQ (iRes[0], iRes[1]);\
}\
}
// C implementations are always tested.
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_c)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_c)
#ifdef HAVE_NEON
// 32-bit ARM NEON implementations.
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_neon)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_neon)
#endif
#ifdef HAVE_NEON_AARCH64
// AArch64 NEON implementations.
GENERATE_SumOfSingleBlock (SumOf8x8SingleBlock_ref, SumOf8x8SingleBlock_AArch64_neon)
GENERATE_SumOfSingleBlock (SumOf16x16SingleBlock_ref, SumOf16x16SingleBlock_AArch64_neon)
#endif
// Declares two variables: `_nbuff`, a new[]-allocated array of (_sz + _al - 1) elements of
// type _tp, and `_nm`, a pointer into that array with at least _sz elements available after it.
// `_nm` is obtained by advancing (_al - 1) elements and stepping back by the byte remainder
// modulo _al (converted to elements), so it is _al-byte aligned provided _al is a power of two
// and the allocation itself is aligned to sizeof(_tp).
// The macro expands to two statements and already ends with a semicolon.
// Release the memory with delete[] on `_nbuff`, never on `_nm`.
#define ENFORCE_NEW_ALIGN_1D(_tp, _nm, _nbuff, _sz, _al) \
_tp *_nbuff = new _tp[(_sz)+(_al)-1]; \
_tp *_nm = _nbuff + ((_al)-1) - (((uintptr_t)(_nbuff + ((_al)-1)) & ((_al)-1))/sizeof(_tp));
// Generates a gtest case named <method>_<kiWidth>x<kiHeight> in suite SVC_ME_FunTest.
// Each of the SVC_ME_TEST_NUM rounds fills a reference picture with random data, runs
// `anchor` and `method` on it and checks that both produce the same pFeatureOfBlock
// contents and the same pTimesOfFeatureValue histogram.
// The picture has kiHeight + 16 rows and a stride of kiWidth rounded up to a multiple of 16,
// plus 16, so that a 16x16 block can be read at every tested position.
// The two 65536-entry histograms (512 KiB in total) are allocated on the heap rather than
// on the stack, which may be too small for them on some targets.
// Comparisons use the non-fatal EXPECT_EQ and stop at the first mismatch, so that the
// delete[] calls at the end always run and a failure reports a single index.
#define GENERATE_SumOfFrame(anchor, method, kiWidth, kiHeight) \
TEST (SVC_ME_FunTest, method##_##kiWidth##x##kiHeight) {\
ENFORCE_NEW_ALIGN_1D (uint8_t, pRefPicture, pRefPictureBuff, ((kiHeight+16)*((((kiWidth+15)>>4)<<4)+16)), 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock1, pFeatureOfBlockBuff1, (kiWidth*kiHeight), 16) \
ENFORCE_NEW_ALIGN_1D (uint16_t, pFeatureOfBlock2, pFeatureOfBlockBuff2, (kiWidth*kiHeight), 16) \
uint32_t* pTimesOfFeatureValue[2]; \
pTimesOfFeatureValue[0] = new uint32_t[65536]; \
pTimesOfFeatureValue[1] = new uint32_t[65536]; \
bool bAllMatch = true; \
for (int32_t k = 0; bAllMatch && k < SVC_ME_TEST_NUM; k++) {\
FillWithRandomData (pRefPicture,(kiHeight+16)*((((kiWidth+15)>>4)<<4)+16));\
memset(pTimesOfFeatureValue[0], 0, 65536*sizeof(uint32_t)); \
memset(pTimesOfFeatureValue[1], 0, 65536*sizeof(uint32_t)); \
anchor (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock1,pTimesOfFeatureValue[0]); \
method (pRefPicture,kiWidth,kiHeight,((((kiWidth+15)>>4)<<4)+16),pFeatureOfBlock2,pTimesOfFeatureValue[1]); \
for(int32_t j=0;bAllMatch && j<kiWidth*kiHeight;j++){\
EXPECT_EQ (pFeatureOfBlock1[j], pFeatureOfBlock2[j]) << "pFeatureOfBlock index " << j;\
bAllMatch = (pFeatureOfBlock1[j] == pFeatureOfBlock2[j]);\
}\
for(int32_t j=0;bAllMatch && j<65536;j++){\
EXPECT_EQ (pTimesOfFeatureValue[0][j], pTimesOfFeatureValue[1][j]) << "pTimesOfFeatureValue index " << j;\
bAllMatch = (pTimesOfFeatureValue[0][j] == pTimesOfFeatureValue[1][j]);\
}\
}\
delete[] pRefPictureBuff; \
delete[] pFeatureOfBlockBuff1; \
delete[] pFeatureOfBlockBuff2; \
delete[] pTimesOfFeatureValue[0]; \
delete[] pTimesOfFeatureValue[1]; \
}
// Frame-level tests for each implementation at three sizes (kiWidth x kiHeight):
// 1x1 (single position), 1x320 (single column) and 640x320.
// C implementations are always tested.
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 1)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 1)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 1, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_c, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_c, 640, 320)
#ifdef HAVE_NEON
// 32-bit ARM NEON implementations.
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 1)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 1)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 1, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_neon, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_neon, 640, 320)
#endif
#ifdef HAVE_NEON_AARCH64
// AArch64 NEON implementations.
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 1)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 1)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 1, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 1, 320)
GENERATE_SumOfFrame (SumOf8x8BlockOfFrame_ref, SumOf8x8BlockOfFrame_AArch64_neon, 640, 320)
GENERATE_SumOfFrame (SumOf16x16BlockOfFrame_ref, SumOf16x16BlockOfFrame_AArch64_neon, 640, 320)
#endif

View File

@ -1,7 +1,7 @@
ENCODER_UNITTEST_SRCDIR=test/encoder ENCODER_UNITTEST_SRCDIR=test/encoder
ENCODER_UNITTEST_CPP_SRCS=\ ENCODER_UNITTEST_CPP_SRCS=\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_DecodeMbAux.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderExt.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMb.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMbAux.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_EncoderMbAux.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\
@ -13,6 +13,7 @@ ENCODER_UNITTEST_CPP_SRCS=\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\
$(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\
ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ)) ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))