From 910c64ef22eab59856acfafbcfac8a14ed13b7cf Mon Sep 17 00:00:00 2001 From: HFVideoMac Date: Tue, 22 Jul 2014 13:06:34 +0800 Subject: [PATCH] add ARM64 Adaptative Quantization code and UT --- Makefile | 1 + .../processing.xcodeproj/project.pbxproj | 4 + .../AdaptiveQuantization.cpp | 5 ++ .../AdaptiveQuantization.h | 6 ++ .../adaptive_quantization_aarch64_neon.S | 88 +++++++++++++++++++ codec/processing/targets.mk | 1 + .../ProcessUT_AdaptiveQuantization.cpp | 76 ++++++++++++++++ test/processing/targets.mk | 1 + 8 files changed, 182 insertions(+) create mode 100644 codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S create mode 100644 test/processing/ProcessUT_AdaptiveQuantization.cpp diff --git a/Makefile b/Makefile index 369dd0b1..3c694866 100644 --- a/Makefile +++ b/Makefile @@ -89,6 +89,7 @@ ENCODER_INCLUDES += \ PROCESSING_INCLUDES += \ -I$(SRC_PATH)codec/processing/interface \ -I$(SRC_PATH)codec/processing/src/common \ + -I$(SRC_PATH)codec/processing/src/adaptivequantization \ -I$(SRC_PATH)codec/processing/src/scrolldetection GTEST_INCLUDES += \ diff --git a/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj b/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj index 2708758d..3182f3c7 100644 --- a/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj +++ b/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj @@ -31,6 +31,7 @@ 549947F2196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D5196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp */; }; 549947F3196A3FB400BA3D87 /* vaacalcfuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */; }; 549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D9196A3FB400BA3D87 /* vaacalculation.cpp */; }; + 6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 
6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -91,6 +92,7 @@ 549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalcfuncs.cpp; sourceTree = ""; }; 549947D9196A3FB400BA3D87 /* vaacalculation.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalculation.cpp; sourceTree = ""; }; 549947DA196A3FB400BA3D87 /* vaacalculation.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vaacalculation.h; sourceTree = ""; }; + 6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = adaptive_quantization_aarch64_neon.S; path = arm64/adaptive_quantization_aarch64_neon.S; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -108,6 +110,7 @@ 4CC6094D197E008B00BE8B8B /* arm64 */ = { isa = PBXGroup; children = ( + 6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */, 4CC6094E197E009D00BE8B8B /* down_sample_aarch64_neon.S */, ); name = arm64; @@ -337,6 +340,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + 6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */, 549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */, 549947E9196A3FB400BA3D87 /* ComplexityAnalysis.cpp in Sources */, 549947E3196A3FB400BA3D87 /* vaa_calc_neon.S in Sources */, diff --git a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp index 8bd327f8..cee854b9 100644 --- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp +++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp @@ -235,6 +235,11 @@ void 
CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag) pfVar = SampleVariance16x16_neon; } #endif +#ifdef HAVE_NEON_AARCH64 + if (iCpuFlag & WELS_CPU_NEON) { + pfVar = SampleVariance16x16_AArch64_neon; + } +#endif } void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride, diff --git a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h index 985daecd..f0d6f53e 100644 --- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h +++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h @@ -68,6 +68,12 @@ VarFunc SampleVariance16x16_neon; WELSVP_EXTERN_C_END #endif +#ifdef HAVE_NEON_AARCH64 +WELSVP_EXTERN_C_BEGIN +VarFunc SampleVariance16x16_AArch64_neon; +WELSVP_EXTERN_C_END +#endif + class CAdaptiveQuantization : public IStrategy { public: CAdaptiveQuantization (int32_t iCpuFlag); diff --git a/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S b/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S new file mode 100644 index 00000000..42711a10 --- /dev/null +++ b/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S @@ -0,0 +1,88 @@ +/*! + * \copy + * Copyright (c) 2013, Cisco Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifdef HAVE_NEON_AARCH64 +.text +#include "arm_arch64_common_macro.S" +WELS_ASM_AARCH64_FUNC_BEGIN SampleVariance16x16_AArch64_neon + ld1 {v1.16b}, [x0], x1 //save the ref data (16bytes) + ld1 {v0.16b}, [x2], x3 //save the src data (16bytes) + uabd v2.16b, v0.16b, v1.16b + umull v3.8h, v2.8b, v2.8b + umull2 v4.8h, v2.16b, v2.16b + uaddlp v4.4s, v4.8h + uadalp v4.4s, v3.8h //sqr + uaddlp v2.8h, v2.16b //sum + + uaddlp v1.8h, v0.16b //sum_cur + + umull v3.8h, v0.8b, v0.8b + umull2 v5.8h, v0.16b, v0.16b + uaddlp v3.4s, v3.8h + uadalp v3.4s, v5.8h //sqr_cur +.rept 15 + ld1 {v5.16b}, [x0], x1 //save the ref data (16bytes) + ld1 {v0.16b}, [x2], x3 //save the src data (16bytes) + + uabd v6.16b, v0.16b, v5.16b + + //v1 save sum_cur + uadalp v1.8h, v0.16b + + //v4 save sqr + umull v5.8h, v6.8b, v6.8b + umull2 v7.8h, v6.16b, v6.16b + uadalp v4.4s, v5.8h //sqr + uadalp v4.4s, v7.8h //sqr + + //v2 save sum + uadalp v2.8h, v6.16b + + //v3 save sqr_cur + umull v5.8h, v0.8b, v0.8b + umull2 v7.8h, v0.16b, v0.16b + uadalp v3.4s, v5.8h //sqr_cur + uadalp v3.4s, v7.8h //sqr_cur +.endr + uaddlv s2, v2.8h //sum + uaddlv s1, v1.8h //sum_cur + ins v2.s[1], v1.s[0] // sum, sum_cur + 
shrn v2.4h, v2.4s, #8 // sum, sum_cur>>8 + mul v2.4h, v2.4h, v2.4h//// sum*sum, sum_cur*sum_cur + uaddlv d4, v4.4s //sqr + uaddlv d3, v3.4s //sqr_cur + ins v4.s[1], v3.s[0] // sqr, sqr_cur + shrn v4.4h, v4.4s, #8 // sqr, sqr_cur>>8 + sub v4.4h, v4.4h, v2.4h + st1 {v4.s}[0], [x4] +WELS_ASM_AARCH64_FUNC_END +#endif diff --git a/codec/processing/targets.mk b/codec/processing/targets.mk index 02f4873c..268f414a 100644 --- a/codec/processing/targets.mk +++ b/codec/processing/targets.mk @@ -42,6 +42,7 @@ endif ifeq ($(ASM_ARCH), arm64) PROCESSING_ASM_ARM64_SRCS=\ + $(PROCESSING_SRCDIR)/src/arm64/adaptive_quantization_aarch64_neon.S\ $(PROCESSING_SRCDIR)/src/arm64/down_sample_aarch64_neon.S\ PROCESSING_OBJS += $(PROCESSING_ASM_ARM64_SRCS:.S=.$(OBJ)) diff --git a/test/processing/ProcessUT_AdaptiveQuantization.cpp b/test/processing/ProcessUT_AdaptiveQuantization.cpp new file mode 100644 index 00000000..8ca6ebb2 --- /dev/null +++ b/test/processing/ProcessUT_AdaptiveQuantization.cpp @@ -0,0 +1,76 @@ +#include <gtest/gtest.h> +#include <stdlib.h> +#include <string.h> +#include "cpu.h" +#include "cpu_core.h" +#include "util.h" +#include "macros.h" +#include "IWelsVP.h" +#include "AdaptiveQuantization.h" + + +using namespace nsWelsVP; + +static void FillWithRandomData (uint8_t* p, int32_t Len) { + for (int32_t i = 0; i < Len; i++) { + p[i] = rand() % 256; + } +} + +void SampleVariance16x16_ref (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride, + SMotionTextureUnit* pMotionTexture) { + uint32_t uiCurSquare = 0, uiSquare = 0; + uint16_t uiCurSum = 0, uiSum = 0; + + for (int32_t y = 0; y < MB_WIDTH_LUMA; y++) { + for (int32_t x = 0; x < MB_WIDTH_LUMA; x++) { + uint32_t uiDiff = WELS_ABS (pRefY[x] - pSrcY[x]); + uiSum += uiDiff; + uiSquare += uiDiff * uiDiff; + + uiCurSum += pSrcY[x]; + uiCurSquare += pSrcY[x] * pSrcY[x]; + } + pRefY += iRefStride; + pSrcY += iSrcStride; + } + + uiSum = uiSum >> 8; + pMotionTexture->uiMotionIndex = (uiSquare >> 8) - (uiSum * uiSum); + + uiCurSum = uiCurSum >> 8; + 
pMotionTexture->uiTextureIndex = (uiCurSquare >> 8) - (uiCurSum * uiCurSum); +} + +#define GENERATE_AQTEST(method) \ +TEST (AdaptiveQuantization, method) {\ + ENFORCE_STACK_ALIGN_1D (uint8_t, pRefY,32*16,16)\ + ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcY,48*16,16)\ + SMotionTextureUnit pMotionTexture[2];\ + FillWithRandomData (pRefY,32*16);\ + FillWithRandomData (pSrcY,48*16);\ + SampleVariance16x16_ref (pRefY,32,pSrcY,48,&pMotionTexture[0]);\ + method(pRefY,32,pSrcY,48,&pMotionTexture[1]);\ + ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\ + ASSERT_EQ(pMotionTexture[0].uiTextureIndex,pMotionTexture[1].uiTextureIndex);\ + memset (pRefY,0,32*16);\ + memset (pSrcY,255,48*16);\ + SampleVariance16x16_ref (pRefY,32,pSrcY,48,&pMotionTexture[0]);\ + method(pRefY,32,pSrcY,48,&pMotionTexture[1]);\ + ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\ + ASSERT_EQ(pMotionTexture[0].uiTextureIndex,pMotionTexture[1].uiTextureIndex);\ +} + +GENERATE_AQTEST (SampleVariance16x16_c) +#if defined(X86_ASM) +GENERATE_AQTEST (SampleVariance16x16_sse2) +#endif + +#if defined(HAVE_NEON) +GENERATE_AQTEST (SampleVariance16x16_neon) +#endif + +#if defined(HAVE_NEON_AARCH64) +GENERATE_AQTEST (SampleVariance16x16_AArch64_neon) +#endif + diff --git a/test/processing/targets.mk b/test/processing/targets.mk index fc38db09..50edbff3 100644 --- a/test/processing/targets.mk +++ b/test/processing/targets.mk @@ -1,5 +1,6 @@ PROCESSING_UNITTEST_SRCDIR=test/processing PROCESSING_UNITTEST_CPP_SRCS=\ + $(PROCESSING_UNITTEST_SRCDIR)/ProcessUT_AdaptiveQuantization.cpp\ $(PROCESSING_UNITTEST_SRCDIR)/ProcessUT_ScrollDetection.cpp\ PROCESSING_UNITTEST_OBJS += $(PROCESSING_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))