Refactored NEON optimization usage
This commit is contained in:
parent
94a5bf88d0
commit
9f29506d2c
@ -447,6 +447,7 @@ if(UNIX)
|
|||||||
endif()
|
endif()
|
||||||
if(WITH_V4L)
|
if(WITH_V4L)
|
||||||
|
|
||||||
|
|
||||||
CHECK_MODULE(libv4l1 HAVE_LIBV4L)
|
CHECK_MODULE(libv4l1 HAVE_LIBV4L)
|
||||||
CHECK_INCLUDE_FILE(linux/videodev.h HAVE_CAMV4L)
|
CHECK_INCLUDE_FILE(linux/videodev.h HAVE_CAMV4L)
|
||||||
CHECK_INCLUDE_FILE(linux/videodev2.h HAVE_CAMV4L2)
|
CHECK_INCLUDE_FILE(linux/videodev2.h HAVE_CAMV4L2)
|
||||||
@ -889,6 +890,7 @@ if(MSVC)
|
|||||||
# 64-bit portability warnings, in MSVC8
|
# 64-bit portability warnings, in MSVC8
|
||||||
if(MSVC80)
|
if(MSVC80)
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} /Wp64")
|
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} /Wp64")
|
||||||
|
|
||||||
endif()
|
endif()
|
||||||
#if(MSVC90)
|
#if(MSVC90)
|
||||||
# set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} /D _BIND_TO_CURRENT_CRT_VERSION=1 /D _BIND_TO_CURRENT_VCLIBS_VERSION=1")
|
# set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} /D _BIND_TO_CURRENT_CRT_VERSION=1 /D _BIND_TO_CURRENT_VCLIBS_VERSION=1")
|
||||||
@ -1016,10 +1018,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
|
|||||||
set(EXTRA_C_FLAGS_RELEASE "${EXTRA_C_FLAGS_RELEASE} -DNDEBUG")
|
set(EXTRA_C_FLAGS_RELEASE "${EXTRA_C_FLAGS_RELEASE} -DNDEBUG")
|
||||||
set(EXTRA_C_FLAGS_DEBUG "${EXTRA_C_FLAGS_DEBUG} -O0 -ggdb3 -DDEBUG -D_DEBUG")
|
set(EXTRA_C_FLAGS_DEBUG "${EXTRA_C_FLAGS_DEBUG} -O0 -ggdb3 -DDEBUG -D_DEBUG")
|
||||||
|
|
||||||
if(ANDROID)
|
|
||||||
#force compiler to interpret char as signed char
|
|
||||||
set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsigned-char")
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Extra link libs if the user selects building static libs:
|
# Extra link libs if the user selects building static libs:
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
opencv_dir=`pwd`/../build
|
#!/bin/sh
|
||||||
mkdir build
|
cd `dirname $0`
|
||||||
cd build
|
|
||||||
cmake -DOpenCVDIR=$opencv_dir -DCMAKE_TOOLCHAIN_FILE=$ANDTOOLCHAIN ..
|
opencv_build_dir=`pwd`/../build
|
||||||
|
mkdir -p build
|
||||||
|
cd build
|
||||||
|
|
||||||
|
cmake -DOpenCVDIR=$opencv_build_dir -DCMAKE_TOOLCHAIN_FILE=../../android.toolchain.cmake ..
|
||||||
|
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
opencv_dir=`pwd`/../build_neon
|
#!/bin/sh
|
||||||
mkdir build_neon
|
cd `dirname $0`
|
||||||
cd build_neon
|
|
||||||
cmake -DOpenCV_DIR=$opencv_dir -DARM_TARGETS="armeabi-v7a with NEON" -DCMAKE_TOOLCHAIN_FILE=$ANDTOOLCHAIN ..
|
|
||||||
|
|
||||||
|
opencv_build_dir=`pwd`/../build_neon
|
||||||
|
mkdir -p build_neon
|
||||||
|
cd build_neon
|
||||||
|
|
||||||
|
cmake -DOpenCVDIR=$opencv_build_dir -DARM_TARGET="armeabi-v7a with NEON" -DCMAKE_TOOLCHAIN_FILE=../../android.toolchain.cmake ..
|
||||||
|
@ -122,8 +122,13 @@ CV_INLINE IppiSize ippiSize(int width, int height)
|
|||||||
#if defined ANDROID && defined __ARM_NEON__
|
#if defined ANDROID && defined __ARM_NEON__
|
||||||
#include "arm_neon.h"
|
#include "arm_neon.h"
|
||||||
#define CV_NEON 1
|
#define CV_NEON 1
|
||||||
|
|
||||||
|
#define CPU_HAS_NEON_FEATURE (true)
|
||||||
|
//TODO: make real check using stuff from "cpu-features.h"
|
||||||
|
//((bool)android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)
|
||||||
#else
|
#else
|
||||||
#define CV_NEON 0
|
#define CV_NEON 0
|
||||||
|
#define CPU_HAS_NEON_FEATURE (false)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef IPPI_CALL
|
#ifndef IPPI_CALL
|
||||||
|
@ -44,11 +44,6 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#if ANDROID && HAVE_NEON
|
|
||||||
#include <cpu-features.h>
|
|
||||||
#include <arm_neon.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
|
|
||||||
@ -115,9 +110,8 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c
|
|||||||
{
|
{
|
||||||
#if __GNUC__
|
#if __GNUC__
|
||||||
ResultType result = 0;
|
ResultType result = 0;
|
||||||
#if ANDROID && HAVE_NEON
|
#if CV_NEON
|
||||||
static uint64_t features = android_getCpuFeatures();
|
if (CPU_HAS_NEON_FEATURE)
|
||||||
if ((features & ANDROID_CPU_ARM_FEATURE_NEON))
|
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < size; i += 16)
|
for (size_t i = 0; i < size; i += 16)
|
||||||
{
|
{
|
||||||
@ -126,7 +120,7 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c
|
|||||||
//uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
|
//uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
|
||||||
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
|
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
|
||||||
|
|
||||||
uint8x16_t bitsSet += vcntq_u8 (AxorB);
|
uint8x16_t bitsSet = vcntq_u8 (AxorB);
|
||||||
//uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
|
//uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
|
||||||
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
|
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
|
||||||
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
|
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
|
||||||
@ -138,25 +132,27 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
//for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
|
|
||||||
typedef unsigned long long pop_t;
|
|
||||||
const size_t modulo = size % sizeof(pop_t);
|
|
||||||
const pop_t * a2 = reinterpret_cast<const pop_t*> (a);
|
|
||||||
const pop_t * b2 = reinterpret_cast<const pop_t*> (b);
|
|
||||||
const pop_t * a2_end = a2 + (size/sizeof(pop_t));
|
|
||||||
|
|
||||||
for (; a2 != a2_end; ++a2, ++b2)
|
|
||||||
result += __builtin_popcountll((*a2) ^ (*b2));
|
|
||||||
|
|
||||||
if (modulo)
|
|
||||||
{
|
{
|
||||||
//in the case where size is not divisible by sizeof(size_t)
|
//for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
|
||||||
//need to mask off the bits at the end
|
typedef unsigned long long pop_t;
|
||||||
pop_t a_final=0,b_final=0;
|
const size_t modulo = size % sizeof(pop_t);
|
||||||
memcpy(&a_final,a2,modulo);
|
const pop_t * a2 = reinterpret_cast<const pop_t*> (a);
|
||||||
memcpy(&b_final,b2,modulo);
|
const pop_t * b2 = reinterpret_cast<const pop_t*> (b);
|
||||||
result += __builtin_popcountll(a_final ^ b_final);
|
const pop_t * a2_end = a2 + (size/sizeof(pop_t));
|
||||||
}
|
|
||||||
|
for (; a2 != a2_end; ++a2, ++b2)
|
||||||
|
result += __builtin_popcountll((*a2) ^ (*b2));
|
||||||
|
|
||||||
|
if (modulo)
|
||||||
|
{
|
||||||
|
//in the case where size is not divisible by sizeof(size_t)
|
||||||
|
//need to mask off the bits at the end
|
||||||
|
pop_t a_final=0,b_final=0;
|
||||||
|
memcpy(&a_final,a2,modulo);
|
||||||
|
memcpy(&b_final,b2,modulo);
|
||||||
|
result += __builtin_popcountll(a_final ^ b_final);
|
||||||
|
}
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
return HammingLUT()(a,b,size);
|
return HammingLUT()(a,b,size);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user