From eace9b7b00bf5f87a0ace0e55ca5b13eb8c9407c Mon Sep 17 00:00:00 2001 From: dongzhang <dongzha@cisco.com> Date: Tue, 8 Jul 2014 11:18:45 +0800 Subject: [PATCH] add MemoryZero Arm64 code and UT --- .../welsenc/welsenc.xcodeproj/project.pbxproj | 4 + .../encoder/core/arm64/memory_aarch64_neon.S | 63 +++++++++++++ codec/encoder/core/inc/encoder.h | 2 + codec/encoder/core/src/encoder.cpp | 8 ++ codec/encoder/targets.mk | 1 + test/encoder/EncUT_MemoryZero.cpp | 94 +++++++++++++++++++ test/encoder/targets.mk | 1 + 7 files changed, 173 insertions(+) create mode 100644 codec/encoder/core/arm64/memory_aarch64_neon.S create mode 100644 test/encoder/EncUT_MemoryZero.cpp diff --git a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj index 0d780f19..776badf3 100644 --- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj +++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj @@ -48,6 +48,7 @@ 9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; }; 9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; }; F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; }; + F5BE8005196B913200ED02ED /* memory_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -157,6 +158,7 @@ 9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; }; 9AED66671946A2C4009A3567 /* utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.h; name = utils.h; path = ../../../common/inc/utils.h; sourceTree = "<group>"; }; F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = reconstruct_aarch64_neon.S; path = arm64/reconstruct_aarch64_neon.S; sourceTree = "<group>"; }; + F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = memory_aarch64_neon.S; path = arm64/memory_aarch64_neon.S; sourceTree = "<group>"; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -186,6 +188,7 @@ 4CB8F2B219235FAC005D6386 /* arm64 */ = { isa = PBXGroup; children = ( + F5BE8004196B913200ED02ED /* memory_aarch64_neon.S */, F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */, 4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */, 4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */, @@ -431,6 +434,7 @@ 4C23BC60195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S in Sources */, 4CE4472B18BC605C0017DF25 /* wels_preprocess.cpp in Sources */, 4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */, + F5BE8005196B913200ED02ED /* memory_aarch64_neon.S in Sources */, 4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */, 4CE4471718BC605C0017DF25 /* mc.cpp in Sources */, F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */, diff --git a/codec/encoder/core/arm64/memory_aarch64_neon.S b/codec/encoder/core/arm64/memory_aarch64_neon.S new file mode 100644 index 00000000..7901efd8 --- /dev/null +++ b/codec/encoder/core/arm64/memory_aarch64_neon.S @@ -0,0 +1,63 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// void WelsSetMemZero_AArch64_neon (void* pDst, int32_t iSize); pDst arrives in x0, iSize in w1.
+WELS_ASM_AARCH64_FUNC_BEGIN WelsSetMemZero_AArch64_neon
+    eor     v0.16b, v0.16b, v0.16b      // v0 = 0, source register for every store below
+    cmp     w1, #32                     // iSize is 32 bit wide; bits 63:32 of x1 are unspecified on entry
+    b.eq    mem_zero_32_neon_start      // exactly 32 bytes
+    b.lt    mem_zero_24_neon_start      // every size below 32 is written as 24 bytes
+mem_zero_loop:                          // sizes above 32: 64 bytes per pass
+    subs    w1, w1, #64                 // remaining byte count, sets the flags for the branch below
+    st1     {v0.16b}, [x0], #16
+    st1     {v0.16b}, [x0], #16
+    st1     {v0.16b}, [x0], #16
+    st1     {v0.16b}, [x0], #16
+    b.gt    mem_zero_loop               // same as b.ne for multiples of 64, but always terminates otherwise
+    b       mem_zero_end
+
+mem_zero_32_neon_start:
+    st1     {v0.16b}, [x0], #16
+    st1     {v0.16b}, [x0], #16
+    b       mem_zero_end
+mem_zero_24_neon_start:
+    st1     {v0.16b}, [x0], #16         // 16 bytes
+    st1     {v0.8b}, [x0], #8           // plus 8 bytes = 24
+mem_zero_end:
+
+WELS_ASM_AARCH64_FUNC_END
+
+#endif
diff --git a/codec/encoder/core/inc/encoder.h b/codec/encoder/core/inc/encoder.h
index 13afb5d2..fef9c7a4 100644
--- a/codec/encoder/core/inc/encoder.h
+++ b/codec/encoder/core/inc/encoder.h
@@ -129,6 +129,8 @@ void WelsSetMemZeroSize8_mmx (void* pDst, int32_t iSize);
 void WelsPrefetchZero_mmx (int8_t const* kpDst);
 #elif defined(HAVE_NEON)
 void WelsSetMemZero_neon (void* pDst, int32_t iSize);
+#elif defined(HAVE_NEON_AARCH64)
+void WelsSetMemZero_AArch64_neon (void* pDst, int32_t iSize);
 #endif
 
 #if defined(__cplusplus)
diff --git a/codec/encoder/core/src/encoder.cpp b/codec/encoder/core/src/encoder.cpp
index 40355912..fba3891f 100644
--- a/codec/encoder/core/src/encoder.cpp
+++ b/codec/encoder/core/src/encoder.cpp
@@ -179,6 +179,14 @@ int32_t InitFunctionPointers (SWelsFuncPtrList* pFuncList, SWelsSvcCodingParam*
   }
 #endif
 
+#if defined(HAVE_NEON_AARCH64)
+  if (uiCpuFlag & WELS_CPU_NEON) {
+    pFuncList->pfSetMemZeroSize8 = WelsSetMemZero_AArch64_neon;
+    pFuncList->pfSetMemZeroSize64Aligned16 = WelsSetMemZero_AArch64_neon;
+    pFuncList->pfSetMemZeroSize64 = WelsSetMemZero_AArch64_neon;
+  }
+#endif
+
   InitExpandPictureFunc (& (pFuncList->sExpandPicFunc), uiCpuFlag);
 
   /* Intra_Prediction_fn*/
diff --git a/codec/encoder/targets.mk b/codec/encoder/targets.mk
index bb1e843d..74f620d8 100644
--- a/codec/encoder/targets.mk
+++ 
b/codec/encoder/targets.mk
@@ -61,6 +61,7 @@ ifeq ($(ASM_ARCH), arm64)
 ENCODER_ASM_ARM64_SRCS=\
 	$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\
 	$(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S\
+	$(ENCODER_SRCDIR)/core/arm64/memory_aarch64_neon.S\
 	$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\
 	$(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\
 
diff --git a/test/encoder/EncUT_MemoryZero.cpp b/test/encoder/EncUT_MemoryZero.cpp
new file mode 100644
index 00000000..275ff16c
--- /dev/null
+++ b/test/encoder/EncUT_MemoryZero.cpp
@@ -0,0 +1,94 @@
+#include<gtest/gtest.h>
+#include<math.h>
+#include<stdlib.h>
+#include<time.h>
+
+#include "cpu_core.h"
+#include "cpu.h"
+#include "macros.h"
+#include "wels_func_ptr_def.h"
+#include "../../codec/encoder/core/src/encoder.cpp"
+
+using namespace WelsSVCEnc;
+#define MEMORYZEROTEST_NUM 1000
+
+// Cross-checks the memory-zero routines selected for the running CPU against
+// WelsSetMemZero_c. Two rows are filled with 0xFF, one is zeroed by the C
+// reference and the other by the routine under test, and the rows are then
+// compared over their full 64*101 bytes, not only over the zeroed range.
+TEST (SetMemZeroFunTest, WelsSetMemZero) {
+  const int32_t kiRowBytes = 64 * 101;
+  int32_t iCpuCores = 0;
+  SWelsFuncPtrList sFuncPtrList;
+  const uint32_t kuiCpuFlag = WelsCPUFeatureDetect (&iCpuCores);
+
+  /* Portable C routine first; the CPU-specific blocks below override it. */
+  sFuncPtrList.pfSetMemZeroSize8 = WelsSetMemZero_c; // confirmed_safe_unsafe_usage
+  sFuncPtrList.pfSetMemZeroSize64Aligned16 = sFuncPtrList.pfSetMemZeroSize64 = WelsSetMemZero_c; // confirmed_safe_unsafe_usage
+#if defined(X86_ASM)
+  if ((kuiCpuFlag & WELS_CPU_MMXEXT) != 0) {
+    sFuncPtrList.pfSetMemZeroSize8 = WelsSetMemZeroSize8_mmx; // confirmed_safe_unsafe_usage
+    sFuncPtrList.pfSetMemZeroSize64Aligned16 = sFuncPtrList.pfSetMemZeroSize64 = WelsSetMemZeroSize64_mmx; // confirmed_safe_unsafe_usage
+  }
+  if ((kuiCpuFlag & WELS_CPU_SSE2) != 0) {
+    sFuncPtrList.pfSetMemZeroSize64Aligned16 = WelsSetMemZeroAligned64_sse2; // confirmed_safe_unsafe_usage
+  }
+#endif//X86_ASM
+
+#if defined(HAVE_NEON)
+  if ((kuiCpuFlag & WELS_CPU_NEON) != 0) {
+    sFuncPtrList.pfSetMemZeroSize8 = WelsSetMemZero_neon;
+    sFuncPtrList.pfSetMemZeroSize64Aligned16 = sFuncPtrList.pfSetMemZeroSize64 = WelsSetMemZero_neon;
+  }
+#endif
+
+#if defined(HAVE_NEON_AARCH64)
+  if ((kuiCpuFlag & WELS_CPU_NEON) != 0) {
+    sFuncPtrList.pfSetMemZeroSize8 = WelsSetMemZero_AArch64_neon;
+    sFuncPtrList.pfSetMemZeroSize64Aligned16 = sFuncPtrList.pfSetMemZeroSize64 = WelsSetMemZero_AArch64_neon;
+  }
+#endif
+
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pInputAlign, 2, 64*101, 16)
+  uint8_t* const pExpected = pInputAlign[0]; // row zeroed by the C reference
+  uint8_t* const pActual = pInputAlign[1];   // row zeroed by the routine under test
+
+  // Size64Aligned16 entry: row start as destination, random length 64..6400 in steps of 64.
+  for (int32_t iRound = MEMORYZEROTEST_NUM; iRound > 0; --iRound) {
+    memset (pExpected, 255, kiRowBytes);
+    memset (pActual, 255, kiRowBytes);
+    const int32_t kiLen = 64 * (rand() % 100 + 1);
+    WelsSetMemZero_c (pExpected, kiLen);
+    sFuncPtrList.pfSetMemZeroSize64Aligned16 (pActual, kiLen);
+    for (int32_t i = 0; i < kiRowBytes; ++i) {
+      ASSERT_EQ (pInputAlign[0][i], pInputAlign[1][i]);
+    }
+  }
+
+  // Size64 entry: destination one byte past the row start, same random lengths.
+  for (int32_t iRound = MEMORYZEROTEST_NUM; iRound > 0; --iRound) {
+    memset (pExpected, 255, kiRowBytes);
+    memset (pActual, 255, kiRowBytes);
+    const int32_t kiLen = 64 * (rand() % 100 + 1);
+    WelsSetMemZero_c (pExpected + 1, kiLen);
+    sFuncPtrList.pfSetMemZeroSize64 (pActual + 1, kiLen);
+    for (int32_t i = 0; i < kiRowBytes; ++i) {
+      ASSERT_EQ (pInputAlign[0][i], pInputAlign[1][i]);
+    }
+  }
+
+  // Size8 entry: the two fixed short lengths, 32 first and then 24, with the
+  // destination again one byte past the row start.
+  const int32_t kiShortLen[2] = { 32, 24 };
+  for (int32_t n = 0; n < 2; ++n) {
+    memset (pExpected, 255, kiRowBytes);
+    memset (pActual, 255, kiRowBytes);
+    WelsSetMemZero_c (pExpected + 1, kiShortLen[n]);
+    sFuncPtrList.pfSetMemZeroSize8 (pActual + 1, kiShortLen[n]);
+    for (int32_t i = 0; i < kiRowBytes; ++i) {
+      ASSERT_EQ (pInputAlign[0][i], pInputAlign[1][i]);
+    }
+  }
+}
+
+
diff --git a/test/encoder/targets.mk b/test/encoder/targets.mk
index e8a1e7bb..565dc8da 100644
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -7,6 +7,7 
@@ ENCODER_UNITTEST_CPP_SRCS=\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_ExpGolomb.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_GetIntraPredictor.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryAlloc.cpp\ + $(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryZero.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\ $(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\