From 910c64ef22eab59856acfafbcfac8a14ed13b7cf Mon Sep 17 00:00:00 2001 From: HFVideoMac Date: Tue, 22 Jul 2014 13:06:34 +0800 Subject: [PATCH] add ARM64 Adaptative Quantization code and UT --- Makefile | 1 + .../processing.xcodeproj/project.pbxproj | 4 + .../AdaptiveQuantization.cpp | 5 ++ .../AdaptiveQuantization.h | 6 ++ .../adaptive_quantization_aarch64_neon.S | 88 +++++++++++++++++++ codec/processing/targets.mk | 1 + .../ProcessUT_AdaptiveQuantization.cpp | 76 ++++++++++++++++ test/processing/targets.mk | 1 + 8 files changed, 182 insertions(+) create mode 100644 codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S create mode 100644 test/processing/ProcessUT_AdaptiveQuantization.cpp diff --git a/Makefile b/Makefile index 369dd0b1..3c694866 100644 --- a/Makefile +++ b/Makefile @@ -89,6 +89,7 @@ ENCODER_INCLUDES += \ PROCESSING_INCLUDES += \ -I$(SRC_PATH)codec/processing/interface \ -I$(SRC_PATH)codec/processing/src/common \ + -I$(SRC_PATH)codec/processing/src/adaptivequantization \ -I$(SRC_PATH)codec/processing/src/scrolldetection GTEST_INCLUDES += \ diff --git a/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj b/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj index 2708758d..3182f3c7 100644 --- a/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj +++ b/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj @@ -31,6 +31,7 @@ 549947F2196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D5196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp */; }; 549947F3196A3FB400BA3D87 /* vaacalcfuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */; }; 549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D9196A3FB400BA3D87 /* vaacalculation.cpp */; }; + 6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 
6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -91,6 +92,7 @@ 549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalcfuncs.cpp; sourceTree = ""; }; 549947D9196A3FB400BA3D87 /* vaacalculation.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalculation.cpp; sourceTree = ""; }; 549947DA196A3FB400BA3D87 /* vaacalculation.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vaacalculation.h; sourceTree = ""; }; + 6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = adaptive_quantization_aarch64_neon.S; path = arm64/adaptive_quantization_aarch64_neon.S; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -108,6 +110,7 @@ 4CC6094D197E008B00BE8B8B /* arm64 */ = { isa = PBXGroup; children = ( + 6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */, 4CC6094E197E009D00BE8B8B /* down_sample_aarch64_neon.S */, ); name = arm64; @@ -337,6 +340,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */, 549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */, 549947E9196A3FB400BA3D87 /* ComplexityAnalysis.cpp in Sources */, 549947E3196A3FB400BA3D87 /* vaa_calc_neon.S in Sources */, diff --git a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp index 8bd327f8..cee854b9 100644 --- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp +++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp @@ -235,6 +235,11 @@ void 
CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag) pfVar = SampleVariance16x16_neon; } #endif +#ifdef HAVE_NEON_AARCH64 + if (iCpuFlag & WELS_CPU_NEON) { + pfVar = SampleVariance16x16_AArch64_neon; + } +#endif } void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride, diff --git a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h index 985daecd..f0d6f53e 100644 --- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h +++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h @@ -68,6 +68,12 @@ VarFunc SampleVariance16x16_neon; WELSVP_EXTERN_C_END #endif +#ifdef HAVE_NEON_AARCH64 +WELSVP_EXTERN_C_BEGIN +VarFunc SampleVariance16x16_AArch64_neon; +WELSVP_EXTERN_C_END +#endif + class CAdaptiveQuantization : public IStrategy { public: CAdaptiveQuantization (int32_t iCpuFlag); diff --git a/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S b/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S new file mode 100644 index 00000000..42711a10 --- /dev/null +++ b/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S @@ -0,0 +1,88 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef HAVE_NEON_AARCH64 +.text +#include "arm_arch64_common_macro.S" +WELS_ASM_AARCH64_FUNC_BEGIN SampleVariance16x16_AArch64_neon + ld1 {v1.16b}, [x0], x1 //save the ref data (16bytes) + ld1 {v0.16b}, [x2], x3 //save the src data (16bytes) + uabd v2.16b, v0.16b, v1.16b + umull v3.8h, v2.8b, v2.8b + umull2 v4.8h, v2.16b, v2.16b + uaddlp v4.4s, v4.8h + uadalp v4.4s, v3.8h //sqr + uaddlp v2.8h, v2.16b //sum + + uaddlp v1.8h, v0.16b //sum_cur + + umull v3.8h, v0.8b, v0.8b + umull2 v5.8h, v0.16b, v0.16b + uaddlp v3.4s, v3.8h + uadalp v3.4s, v5.8h //sqr_cur +.rept 15 + ld1 {v5.16b}, [x0], x1 //save the ref data (16bytes) + ld1 {v0.16b}, [x2], x3 //save the src data (16bytes) + + uabd v6.16b, v0.16b, v5.16b + + //v1 save sum_cur + uadalp v1.8h, v0.16b + + //v4 save sqr + umull v5.8h, v6.8b, v6.8b + umull2 v7.8h, v6.16b, v6.16b + uadalp v4.4s, v5.8h //sqr + uadalp v4.4s, v7.8h //sqr + + //v2 save sum + uadalp v2.8h, v6.16b + + //v3 save sqr_cur + umull v5.8h, v0.8b, v0.8b + umull2 v7.8h, v0.16b, v0.16b + uadalp v3.4s, v5.8h //sqr_cur + uadalp v3.4s, v7.8h //sqr_cur +.endr + uaddlv s2, v2.8h //sum + uaddlv s1, v1.8h //sum_cur + ins v2.s[1], v1.s[0] // sum, sum_cur + 
shrn v2.4h, v2.4s, #8 // sum, sum_cur>>8 + mul v2.4h, v2.4h, v2.4h//// sum*sum, sum_cur*sum_cur + uaddlv d4, v4.4s //sqr + uaddlv d3, v3.4s //sqr_cur + ins v4.s[1], v3.s[0] // sqr, sqr_cur + shrn v4.4h, v4.4s, #8 // sqr, sqr_cur>>8 + sub v4.4h, v4.4h, v2.4h + st1 {v4.s}[0], [x4] +WELS_ASM_AARCH64_FUNC_END +#endif diff --git a/codec/processing/targets.mk b/codec/processing/targets.mk index 02f4873c..268f414a 100644 --- a/codec/processing/targets.mk +++ b/codec/processing/targets.mk @@ -42,6 +42,7 @@ endif ifeq ($(ASM_ARCH), arm64) PROCESSING_ASM_ARM64_SRCS=\ + $(PROCESSING_SRCDIR)/src/arm64/adaptive_quantization_aarch64_neon.S\ $(PROCESSING_SRCDIR)/src/arm64/down_sample_aarch64_neon.S\ PROCESSING_OBJS += $(PROCESSING_ASM_ARM64_SRCS:.S=.$(OBJ)) diff --git a/test/processing/ProcessUT_AdaptiveQuantization.cpp b/test/processing/ProcessUT_AdaptiveQuantization.cpp new file mode 100644 index 00000000..8ca6ebb2 --- /dev/null +++ b/test/processing/ProcessUT_AdaptiveQuantization.cpp @@ -0,0 +1,76 @@ +#include <gtest/gtest.h> +#include <stdlib.h> +#include <string.h> +#include "cpu.h" +#include "cpu_core.h" +#include "util.h" +#include "macros.h" +#include "IWelsVP.h" +#include "AdaptiveQuantization.h" + + +using namespace nsWelsVP; + +static void FillWithRandomData (uint8_t* p, int32_t Len) { + for (int32_t i = 0; i < Len; i++) { + p[i] = rand() % 256; + } +} + +void SampleVariance16x16_ref (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride, + SMotionTextureUnit* pMotionTexture) { + uint32_t uiCurSquare = 0, uiSquare = 0; + uint16_t uiCurSum = 0, uiSum = 0; + + for (int32_t y = 0; y < MB_WIDTH_LUMA; y++) { + for (int32_t x = 0; x < MB_WIDTH_LUMA; x++) { + uint32_t uiDiff = WELS_ABS (pRefY[x] - pSrcY[x]); + uiSum += uiDiff; + uiSquare += uiDiff * uiDiff; + + uiCurSum += pSrcY[x]; + uiCurSquare += pSrcY[x] * pSrcY[x]; + } + pRefY += iRefStride; + pSrcY += iSrcStride; + } + + uiSum = uiSum >> 8; + pMotionTexture->uiMotionIndex = (uiSquare >> 8) - (uiSum * uiSum); + + uiCurSum = uiCurSum >> 8; + 
pMotionTexture->uiTextureIndex = (uiCurSquare >> 8) - (uiCurSum * uiCurSum); +} + +#define GENERATE_AQTEST(method) \ +TEST (AdaptiveQuantization, method) {\ + ENFORCE_STACK_ALIGN_1D (uint8_t, pRefY,32*16,16)\ + ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcY,48*16,16)\ + SMotionTextureUnit pMotionTexture[2];\ + FillWithRandomData (pRefY,32*16);\ + FillWithRandomData (pSrcY,48*16);\ + SampleVariance16x16_ref (pRefY,32,pSrcY,48,&pMotionTexture[0]);\ + method(pRefY,32,pSrcY,48,&pMotionTexture[1]);\ + ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\ + ASSERT_EQ(pMotionTexture[0].uiTextureIndex,pMotionTexture[1].uiTextureIndex);\ + memset (pRefY,0,32*16);\ + memset (pSrcY,255,48*16);\ + SampleVariance16x16_ref (pRefY,32,pSrcY,48,&pMotionTexture[0]);\ + method(pRefY,32,pSrcY,48,&pMotionTexture[1]);\ + ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\ + ASSERT_EQ(pMotionTexture[0].uiTextureIndex,pMotionTexture[1].uiTextureIndex);\ +} + +GENERATE_AQTEST (SampleVariance16x16_c) +#if defined(X86_ASM) +GENERATE_AQTEST (SampleVariance16x16_sse2) +#endif + +#if defined(HAVE_NEON) +GENERATE_AQTEST (SampleVariance16x16_neon) +#endif + +#if defined(HAVE_NEON_AARCH64) +GENERATE_AQTEST (SampleVariance16x16_AArch64_neon) +#endif + diff --git a/test/processing/targets.mk b/test/processing/targets.mk index fc38db09..50edbff3 100644 --- a/test/processing/targets.mk +++ b/test/processing/targets.mk @@ -1,5 +1,6 @@ PROCESSING_UNITTEST_SRCDIR=test/processing PROCESSING_UNITTEST_CPP_SRCS=\ + $(PROCESSING_UNITTEST_SRCDIR)/ProcessUT_AdaptiveQuantization.cpp\ $(PROCESSING_UNITTEST_SRCDIR)/ProcessUT_ScrollDetection.cpp\ PROCESSING_UNITTEST_OBJS += $(PROCESSING_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))