Refactored NEON optimization usage

2011-05-06 12:24:56 +00:00
parent 94a5bf88d0
commit 9f29506d2c
5 changed files with 45 additions and 39 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -447,6 +447,7 @@ if(UNIX)
        endif()
        if(WITH_V4L)

+
            CHECK_MODULE(libv4l1 HAVE_LIBV4L)
            CHECK_INCLUDE_FILE(linux/videodev.h HAVE_CAMV4L)
            CHECK_INCLUDE_FILE(linux/videodev2.h HAVE_CAMV4L2)
@@ -889,6 +890,7 @@ if(MSVC)
    # 64-bit portability warnings, in MSVC8
    if(MSVC80)
        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} /Wp64")
+
    endif()
    #if(MSVC90)
    #    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} /D _BIND_TO_CURRENT_CRT_VERSION=1 /D _BIND_TO_CURRENT_VCLIBS_VERSION=1")
@@ -1016,10 +1018,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
    set(EXTRA_C_FLAGS_RELEASE "${EXTRA_C_FLAGS_RELEASE} -DNDEBUG")
    set(EXTRA_C_FLAGS_DEBUG "${EXTRA_C_FLAGS_DEBUG} -O0 -ggdb3 -DDEBUG -D_DEBUG")

-    if(ANDROID)
-      #force compiler to interpret char as signed char
-      set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsigned-char")
-    endif()
 endif()

 # Extra link libs if the user selects building static libs:
--- a/android/android-opencv/cmake_android.sh
+++ b/android/android-opencv/cmake_android.sh
@@ -1,5 +1,9 @@
-opencv_dir=`pwd`/../build
-mkdir build
-cd build
-cmake -DOpenCVDIR=$opencv_dir -DCMAKE_TOOLCHAIN_FILE=$ANDTOOLCHAIN ..
+#!/bin/sh
+cd `dirname $0`
+
+opencv_build_dir=`pwd`/../build
+mkdir -p build
+cd build
+
+cmake -DOpenCVDIR=$opencv_build_dir -DCMAKE_TOOLCHAIN_FILE=../../android.toolchain.cmake ..

--- a/android/android-opencv/cmake_android_neon.sh
+++ b/android/android-opencv/cmake_android_neon.sh
@@ -1,5 +1,8 @@
-opencv_dir=`pwd`/../build_neon
-mkdir build_neon
-cd build_neon
-cmake -DOpenCV_DIR=$opencv_dir -DARM_TARGETS="armeabi-v7a with NEON" -DCMAKE_TOOLCHAIN_FILE=$ANDTOOLCHAIN ..
+#!/bin/sh
+cd `dirname $0`

+opencv_build_dir=`pwd`/../build_neon
+mkdir -p build_neon
+cd build_neon
+
+cmake -DOpenCVDIR=$opencv_build_dir -DARM_TARGET="armeabi-v7a with NEON" -DCMAKE_TOOLCHAIN_FILE=../../android.toolchain.cmake ..
--- a/modules/core/include/opencv2/core/internal.hpp
+++ b/modules/core/include/opencv2/core/internal.hpp
@@ -122,8 +122,13 @@ CV_INLINE IppiSize ippiSize(int width, int height)
 #if defined ANDROID && defined __ARM_NEON__
 #include "arm_neon.h"
 #define CV_NEON 1
+
+#define CPU_HAS_NEON_FEATURE (true)
+//TODO: make real check using stuff from "cpu-features.h"
+//((android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0)
 #else
 #define CV_NEON 0
+#define CPU_HAS_NEON_FEATURE (false)
 #endif

 #ifndef IPPI_CALL
--- a/modules/features2d/src/brief.cpp
+++ b/modules/features2d/src/brief.cpp
@@ -44,11 +44,6 @@
 #include <algorithm>
 #include <vector>

-#if ANDROID  && HAVE_NEON
-#include <cpu-features.h>
-#include <arm_neon.h>
-#endif
-
 #include <iostream>
 #include <iomanip>

@@ -115,9 +110,8 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c
 {
 #if __GNUC__
  ResultType result = 0;
-#if ANDROID && HAVE_NEON
-  static uint64_t features = android_getCpuFeatures();
-  if ((features & ANDROID_CPU_ARM_FEATURE_NEON))
+#if CV_NEON
+  if (CPU_HAS_NEON_FEATURE)
  {
    for (size_t i = 0; i < size; i += 16)
    {
@@ -126,7 +120,7 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c
      //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
      uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);

-      uint8x16_t bitsSet += vcntq_u8 (AxorB);
+      uint8x16_t bitsSet = vcntq_u8 (AxorB);
      //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
      uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
      uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
@@ -138,25 +132,27 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c
  }
  else
 #endif  
-  //for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
-  typedef unsigned long long pop_t;
-  const size_t modulo = size % sizeof(pop_t);
-  const pop_t * a2 = reinterpret_cast<const pop_t*> (a);
-  const pop_t * b2 = reinterpret_cast<const pop_t*> (b);
-  const pop_t * a2_end = a2 + (size/sizeof(pop_t));
-
-  for (; a2 != a2_end; ++a2, ++b2)
-    result += __builtin_popcountll((*a2) ^ (*b2));
-
-  if (modulo)
  {
-    //in the case where size is not divisible by sizeof(size_t)
-    //need to mask off the bits at the end  
-    pop_t a_final=0,b_final=0;
-    memcpy(&a_final,a2,modulo);
-    memcpy(&b_final,b2,modulo);
-    result += __builtin_popcountll(a_final ^ b_final);
-  }  
+    //for portability just use unsigned long long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
+    typedef unsigned long long pop_t;
+    const size_t modulo = size % sizeof(pop_t);
+    const pop_t * a2 = reinterpret_cast<const pop_t*> (a);
+    const pop_t * b2 = reinterpret_cast<const pop_t*> (b);
+    const pop_t * a2_end = a2 + (size/sizeof(pop_t));
+
+    for (; a2 != a2_end; ++a2, ++b2)
+      result += __builtin_popcountll((*a2) ^ (*b2));
+
+    if (modulo)
+    {
+      //in the case where size is not divisible by sizeof(pop_t)
+      //need to mask off the bits at the end  
+      pop_t a_final=0,b_final=0;
+      memcpy(&a_final,a2,modulo);
+      memcpy(&b_final,b2,modulo);
+      result += __builtin_popcountll(a_final ^ b_final);
+    }  
+  }
  return result;
 #else
  return HammingLUT()(a,b,size);