Merge branch 'master' into better_png_transparency

* master: (468 commits)
  added suppression for TBB valgrind issue
  update CUDA architecture flags initialization
  increase minimal supported CUDA toolkit to 6.5
  check the CPU flag correctly
  opencv_visualization: check cmdline args
  provide better error messages
  stop search of markers in Exif reader to prevent infinite loop
  Fix calibration fail on python with CALIB_THIN_PRISM_MODEL flag
  clarify CUDA arithm operations usage with mask
  fixed empty image condition in resize
  fixed memory leak in flann tests
  fisheye: add CALIB_FIX_PRINCIPAL_POINT
  get/put: more type-safety and code unification using templates
  py_tutorials: fix cv2.findContours return val
  imgproc: speed up threshold of 64F version using NEON and SSE   * use NEON under aarch64 only   * check 64F version correctly
  bigdata: add test, resolve split/merge issue
  Improved Carotene library linear resize evaluation precision and enabled it as HAL implementation.
  persistence: fixing crash with space-only values
  Removed unnecessary check for Android API level and unused flags.
  Fix for median blur of 2-channel images
  ...
This commit is contained in:
thierry 2016-07-14 14:05:16 +02:00
commit d40e46bc9b
597 changed files with 62455 additions and 8567 deletions

30
.github/ISSUE_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,30 @@
<!--
If you have a question rather than reporting a bug please go to http://answers.opencv.org where you get much faster responses.
If you need further assistance please read [How To Contribute](https://github.com/Itseez/opencv/wiki/How_to_contribute).
This is a template helping you to create an issue which can be processed as quickly as possible. This is the bug reporting section for the OpenCV library.
-->
##### System information (version)
<!-- Example
- OpenCV => 3.1
- Operating System / Platform => Windows 64 Bit
- Compiler => Visual Studio 2015
-->
- OpenCV => :grey_question:
- Operating System / Platform => :grey_question:
- Compiler => :grey_question:
##### Detailed description
<!-- your description -->
##### Steps to reproduce
<!-- to add code example fence it with triple backticks and optional file extension
```.cpp
// C++ code example
```
or attach as .txt or .zip file
-->

9
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,9 @@
<!-- Please use this line to close one or multiple issues when this pullrequest gets merged
You can add another line right under the first one:
resolves #1234
resolves #1235
-->
### This pullrequest changes
<!-- Please describe what your pullrequest is changing -->

8
3rdparty/carotene/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Gedit temp files
*~
# Qt Creator file
*.user
# MacOS-specific (Desktop Services Store)
.DS_Store

42
3rdparty/carotene/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,42 @@
cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
project(Carotene)
set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
set(CAROTENE_INCLUDE_DIR include)
set(CAROTENE_SOURCE_DIR src)
file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
"${CAROTENE_SOURCE_DIR}/*.hpp")
include_directories(${CAROTENE_INCLUDE_DIR})
if(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
# allow more inlines - these parameters improve performance for:
# - matchTemplate about 5-10%
# - goodFeaturesToTrack 10-20%
# - cornerHarris 30% for some cases
set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()
add_library(carotene_objs OBJECT
${carotene_headers}
${carotene_sources}
)
if(NOT CAROTENE_NS STREQUAL "carotene")
target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
endif()
if(WITH_NEON)
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
endif()
set_target_properties(carotene_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")

2
3rdparty/carotene/README.md vendored Normal file
View File

@ -0,0 +1,2 @@
This is Carotene, a low-level library containing optimized CPU routines
that are useful for computer vision algorithms.

112
3rdparty/carotene/hal/CMakeLists.txt vendored Normal file
View File

@ -0,0 +1,112 @@
cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM TRUE)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
set(AARCH64 TRUE)
endif()
set(TEGRA_COMPILER_FLAGS "")
if(CMAKE_COMPILER_IS_GNUCXX)
# Generate unwind information even for functions that can't throw/propagate exceptions.
# This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
else()
list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched2-use-superblocks -fsched2-use-traces
-fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
endif()
if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d" )
list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
-floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
endif()
endif()
string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
if(ARMEABI_V7A)
if (CMAKE_COMPILER_IS_GNUCXX)
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
endif()
endif()
if(WITH_LOGS)
add_definitions(-DHAVE_LOGS)
endif()
set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
function(compile_carotene)
if(ENABLE_NEON)
set(WITH_NEON ON)
endif()
add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")
if(ARM OR AARCH64)
if(CMAKE_BUILD_TYPE)
set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
endif()
check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)
if(${CXX_HAS_MFPU_NEON} AND ${C_HAS_MFPU_NEON})
get_target_property(old_flags "carotene_objs" COMPILE_FLAGS)
if(old_flags)
set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
else()
set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "-mfpu=neon")
endif()
endif()
endif()
endfunction()
compile_carotene()
include_directories("${CAROTENE_DIR}/include")
get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
if (CMAKE_COMPILER_IS_GNUCXX)
# allow more inlines - these parameters improve performance for:
# matchTemplate about 5-10%
# goodFeaturesToTrack 10-20%
# cornerHarris 30% for some cases
set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
# set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
endif()
add_library(tegra_hal STATIC $<TARGET_OBJECTS:carotene_objs>)
set_target_properties(tegra_hal PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}")
if(NOT BUILD_SHARED_LIBS)
ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
endif()
target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include)
set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE)
set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE)
set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE)
set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE)
configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY)
configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY)

1851
3rdparty/carotene/hal/tegra_hal.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,47 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_DEFINITIONS_HPP
#define CAROTENE_DEFINITIONS_HPP
#ifndef CAROTENE_NS
#define CAROTENE_NS carotene
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,125 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_TYPES_HPP
#define CAROTENE_TYPES_HPP
#include <carotene/definitions.hpp>
#include <stdint.h>
#include <cstddef>
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
namespace CAROTENE_NS {
using std::size_t;
using std::ptrdiff_t;
typedef int8_t s8;
typedef uint8_t u8;
typedef int16_t s16;
typedef uint16_t u16;
typedef int32_t s32;
typedef uint32_t u32;
typedef float f32;
typedef int64_t s64;
typedef uint64_t u64;
typedef double f64;
typedef ptrdiff_t stride_t;
enum CONVERT_POLICY
{
CONVERT_POLICY_WRAP,
CONVERT_POLICY_SATURATE
};
enum BORDER_MODE
{
BORDER_MODE_UNDEFINED,
BORDER_MODE_CONSTANT,
BORDER_MODE_REPLICATE,
BORDER_MODE_REFLECT,
BORDER_MODE_REFLECT101,
BORDER_MODE_WRAP
};
enum FLIP_MODE
{
FLIP_HORIZONTAL_MODE = 1,
FLIP_VERTICAL_MODE = 2,
FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE
};
enum COLOR_SPACE
{
COLOR_SPACE_BT601,
COLOR_SPACE_BT709
};
struct Size2D {
Size2D() : width(0), height(0) {}
Size2D(size_t width_, size_t height_) : width(width_), height(height_) {}
size_t width;
size_t height;
inline size_t total() const
{
return width * height;
}
};
struct Margin {
Margin() : left(0), right(0), top(0), bottom(0) {}
Margin(size_t left_, size_t right_, size_t top_, size_t bottom_)
: left(left_), right(right_), top(top_), bottom(bottom_) {}
// these are measured in elements
size_t left, right, top, bottom;
};
struct KeypointStore {
virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0;
virtual ~KeypointStore() {};
};
}
#endif

241
3rdparty/carotene/src/absdiff.cpp vendored Normal file
View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
struct AbsDiff
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vabdq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vabd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = src0[0] >= src1[0] ? src0[0] - src1[0] : src1[0] - src0[0];
}
};
template <typename T>
struct AbsDiffSigned
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1);
typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1);
v_dst = internal::vqsubq(v_max, v_min);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1);
typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1);
v_dst = internal::vqsub(v_max, v_min);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ? (s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]);
}
};
} // namespace
#endif
void absDiff(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiff<u8>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const u16 *src0Base, ptrdiff_t src0Stride,
const u16 *src1Base, ptrdiff_t src1Stride,
u16 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiff<u16>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const s8 *src0Base, ptrdiff_t src0Stride,
const s8 *src1Base, ptrdiff_t src1Stride,
s8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiffSigned<s8>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const s16 *src0Base, ptrdiff_t src0Stride,
const s16 *src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiffSigned<s16>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const s32 *src0Base, ptrdiff_t src0Stride,
const s32 *src1Base, ptrdiff_t src1Stride,
s32 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiffSigned<s32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void absDiff(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, AbsDiff<f32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

408
3rdparty/carotene/src/accumulate.cpp vendored Normal file
View File

@ -0,0 +1,408 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
void accumulate(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j);
int16x8_t v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
v_dst0 = vqaddq_s16(v_dst0, v_src0);
v_dst1 = vqaddq_s16(v_dst1, v_src1);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src));
int16x8_t v_dst = vld1q_s16(dst + j);
v_dst = vqaddq_s16(v_dst, v_src16);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>(src[j] + dst[j]);
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
#ifdef CAROTENE_NEON
namespace {
template <int shift>
void accumulateSquareConst(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + ((srcVal * srcVal) >> shift));
}
}
}
template <>
void accumulateSquareConst<0>(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
internal::prefetch(dst + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8);
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0);
v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0))));
v_srclo = vget_low_s16(v_src1);
v_srchi = vget_high_s16(v_src1);
v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1))));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
int16x8_t v_dst = vld1q_s16(dst + j);
int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src);
v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))),
vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst))));
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
{
s32 srcVal = src[j];
dst[j] = internal::saturate_cast<s16>(dst[j] + srcVal * srcVal);
}
}
}
typedef void (* accumulateSquareConstFunc)(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride);
} // namespace
#endif
void accumulateSquare(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
s16 *dstBase, ptrdiff_t dstStride,
u32 shift)
{
if (shift >= 16)
{
for (size_t i = 0; i < size.height; ++i)
{
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memset(dst, 0, sizeof(s16) * size.width);
}
return;
}
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// this ugly contruction is needed to avoid:
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
accumulateSquareConstFunc funcs[16] =
{
accumulateSquareConst<0>,
accumulateSquareConst<1>,
accumulateSquareConst<2>,
accumulateSquareConst<3>,
accumulateSquareConst<4>,
accumulateSquareConst<5>,
accumulateSquareConst<6>,
accumulateSquareConst<7>,
accumulateSquareConst<8>,
accumulateSquareConst<9>,
accumulateSquareConst<10>,
accumulateSquareConst<11>,
accumulateSquareConst<12>,
accumulateSquareConst<13>,
accumulateSquareConst<14>,
accumulateSquareConst<15>
}, func = funcs[shift];
func(size, srcBase, srcStride, dstBase, dstStride);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)shift;
#endif
}
#ifdef CAROTENE_NEON
namespace {
struct AccumulateWeightedHalf
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = vhaddq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = vhadd_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = ((u16)(src0[0]) + src1[0]) >> 1;
}
};
struct AccumulateWeighted
{
typedef u8 type;
float alpha, beta;
float32x4_t v_alpha, v_beta;
explicit AccumulateWeighted(float _alpha) :
alpha(_alpha), beta(1 - _alpha)
{
v_alpha = vdupq_n_f32(alpha);
v_beta = vdupq_n_f32(beta);
}
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0));
uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1));
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_src0_p = vmovl_u8(vget_high_u8(v_src0));
v_src1_p = vmovl_u8(vget_high_u8(v_src1));
v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))));
v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))));
uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1));
}
void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1,
uint8x8_t & v_dst) const
{
uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1);
float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))));
float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta),
v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))));
uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)),
vmovn_u32(vcvtq_u32_f32(v_dst1f)));
v_dst = vmovn_u16(_v_dst);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = beta * src1[0] + alpha * src0[0];
}
};
} // namespace
#endif
void accumulateWeighted(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
u8 *dstBase, ptrdiff_t dstStride,
f32 alpha)
{
if (alpha == 0.0f)
return;
if (alpha == 1.0f)
{
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memcpy(dst, src, sizeof(u8) * size.width);
}
return;
}
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
// in this case we can use the following scheme:
// dst[p] = (src[p] + dst[p]) >> 1
// which is faster
if (alpha == 0.5f)
{
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeightedHalf());
return;
}
internal::vtransform(size,
srcBase, srcStride,
dstBase, dstStride,
dstBase, dstStride,
AccumulateWeighted(alpha));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)alpha;
#endif
}
} //namespace CAROTENE_NS

475
3rdparty/carotene/src/add.cpp vendored Normal file
View File

@ -0,0 +1,475 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct AddWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] + (WT)src1[0]);
}
};
template <typename T, typename WT>
struct AddSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqaddq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqadd(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] + (WT)src1[0]);
}
};
} // namespace
#endif
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u8, u16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u8, u16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s8, s16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s8, s16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src0 = vld1_u8(src0 + j);
uint8x8_t v_src1 = vld1_u8(src1 + j);
vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1));
}
for (; j < size.width; j++)
dst[j] = (u16)src0[j] + (u16)src1[j];
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void add(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vqaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] + (s32)src1[j]);
}
else
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vaddq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] + (s32)src1[j]);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s16, s32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s16, s32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u16, u32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u16, u32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<s32, s64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<s32, s64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const u32 * src0Base, ptrdiff_t src0Stride,
const u32 * src1Base, ptrdiff_t src1Stride,
u32 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddSaturate<u32, u64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<u32, u64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void add(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
AddWrap<f32, f32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

265
3rdparty/carotene/src/add_weighted.cpp vendored Normal file
View File

@ -0,0 +1,265 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
using namespace internal;
template <typename T> struct TypeTraits;
template <> struct TypeTraits< u8> { typedef u16 wide; typedef u8 unsign; typedef uint8x16_t vec128; };
template <> struct TypeTraits< s8> { typedef s16 wide; typedef u8 unsign; typedef int8x16_t vec128; };
template <> struct TypeTraits<u16> { typedef u32 wide; typedef u8 narrow; typedef u16 unsign; typedef uint16x8_t vec128; };
template <> struct TypeTraits<s16> { typedef s32 wide; typedef s8 narrow; typedef u16 unsign; typedef int16x8_t vec128; };
template <> struct TypeTraits<u32> { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef uint32x4_t vec128; };
template <> struct TypeTraits<s32> { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef int32x4_t vec128; };
template <> struct TypeTraits<f32> { typedef f64 wide; typedef float32x4_t vec128; };
template <typename T> struct wAdd
{
typedef T type;
f32 alpha, beta, gamma;
typedef typename TypeTraits<T>::wide wtype;
wAdd<wtype> wideAdd;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma),
wideAdd(_alpha, _beta, _gamma) {}
void operator() (const typename VecTraits<T>::vec128 & v_src0,
const typename VecTraits<T>::vec128 & v_src1,
typename VecTraits<T>::vec128 & v_dst) const
{
typename VecTraits<wtype>::vec128 vrl, vrh;
wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl);
wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh);
v_dst = vcombine(vqmovn(vrl), vqmovn(vrh));
}
void operator() (const typename VecTraits<T>::vec64 & v_src0,
const typename VecTraits<T>::vec64 & v_src1,
typename VecTraits<T>::vec64 & v_dst) const
{
typename VecTraits<wtype>::vec128 vr;
wideAdd(vmovl(v_src0), vmovl(v_src1), vr);
v_dst = vqmovn(vr);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = saturate_cast<T>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<s32>
{
typedef s32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
}
void operator() (const typename VecTraits<s32>::vec128 & v_src0,
const typename VecTraits<s32>::vec128 & v_src1,
typename VecTraits<s32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_s32(v_src0);
float32x4_t vs2 = vcvtq_f32_s32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_s32_f32(vs1);
}
void operator() (const typename VecTraits<s32>::vec64 & v_src0,
const typename VecTraits<s32>::vec64 & v_src1,
typename VecTraits<s32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_s32(v_src0);
float32x2_t vs2 = vcvt_f32_s32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_s32_f32(vs1);
}
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
{
dst[0] = saturate_cast<s32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<u32>
{
typedef u32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
}
void operator() (const typename VecTraits<u32>::vec128 & v_src0,
const typename VecTraits<u32>::vec128 & v_src1,
typename VecTraits<u32>::vec128 & v_dst) const
{
float32x4_t vs1 = vcvtq_f32_u32(v_src0);
float32x4_t vs2 = vcvtq_f32_u32(v_src1);
vs1 = vmlaq_f32(vgamma, vs1, valpha);
vs1 = vmlaq_f32(vs1, vs2, vbeta);
v_dst = vcvtq_u32_f32(vs1);
}
void operator() (const typename VecTraits<u32>::vec64 & v_src0,
const typename VecTraits<u32>::vec64 & v_src1,
typename VecTraits<u32>::vec64 & v_dst) const
{
float32x2_t vs1 = vcvt_f32_u32(v_src0);
float32x2_t vs2 = vcvt_f32_u32(v_src1);
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
v_dst = vcvt_u32_f32(vs1);
}
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
{
dst[0] = saturate_cast<u32>(alpha*src0[0] + beta*src1[0] + gamma);
}
};
template <> struct wAdd<f32>
{
typedef f32 type;
f32 alpha, beta, gamma;
float32x4_t valpha, vbeta, vgamma;
wAdd(f32 _alpha, f32 _beta, f32 _gamma):
alpha(_alpha), beta(_beta), gamma(_gamma)
{
valpha = vdupq_n_f32(_alpha);
vbeta = vdupq_n_f32(_beta);
vgamma = vdupq_n_f32(_gamma + 0.5);
}
void operator() (const typename VecTraits<f32>::vec128 & v_src0,
const typename VecTraits<f32>::vec128 & v_src1,
typename VecTraits<f32>::vec128 & v_dst) const
{
float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha);
v_dst = vmlaq_f32(vs1, v_src1, vbeta);
}
void operator() (const typename VecTraits<f32>::vec64 & v_src0,
const typename VecTraits<f32>::vec64 & v_src1,
typename VecTraits<f32>::vec64 & v_dst) const
{
float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha));
v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta));
}
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
{
dst[0] = alpha*src0[0] + beta*src1[0] + gamma;
}
};
} // namespace
#define IMPL_ADDWEIGHTED(type) \
void addWeighted(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride, \
f32 alpha, f32 beta, f32 gamma) \
{ \
internal::assertSupportedConfiguration(); \
wAdd<type> wgtAdd(alpha, \
beta, \
gamma); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, \
wgtAdd); \
}
#else
#define IMPL_ADDWEIGHTED(type) \
void addWeighted(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t, \
f32, f32, f32) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
IMPL_ADDWEIGHTED(u8)
IMPL_ADDWEIGHTED(s8)
IMPL_ADDWEIGHTED(u16)
IMPL_ADDWEIGHTED(s16)
IMPL_ADDWEIGHTED(u32)
IMPL_ADDWEIGHTED(s32)
IMPL_ADDWEIGHTED(f32)
} // namespace CAROTENE_NS

225
3rdparty/carotene/src/bitwise.cpp vendored Normal file
View File

@ -0,0 +1,225 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
struct BitwiseAnd
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = vandq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = vand_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = src0[0] & src1[0];
}
};
struct BitwiseOr
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = vorrq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = vorr_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = src0[0] | src1[0];
}
};
struct BitwiseXor
{
typedef u8 type;
void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1,
uint8x16_t & v_dst) const
{
v_dst = veorq_u8(v_src0, v_src1);
}
void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1,
uint8x8_t & v_dst) const
{
v_dst = veor_u8(v_src0, v_src1);
}
void operator() (const u8 * src0, const u8 * src1, u8 * dst) const
{
dst[0] = src0[0] ^ src1[0];
}
};
#endif
void bitwiseNot(const Size2D &size,
const u8 *srcBase, ptrdiff_t srcStride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8* src = internal::getRowPtr(srcBase, srcStride, i);
u8* dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src + j);
uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16);
uint8x16_t v_dst0 = vmvnq_u8(v_src0), v_dst1 = vmvnq_u8(v_src1);
vst1q_u8(dst + j, v_dst0);
vst1q_u8(dst + j + 16, v_dst1);
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
uint8x8_t v_dst = vmvn_u8(v_src);
vst1_u8(dst + j, v_dst);
}
for (; j < size.width; j++)
{
dst[j] = ~src[j];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
void bitwiseAnd(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, BitwiseAnd());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void bitwiseOr(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, BitwiseOr());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void bitwiseXor(const Size2D &size,
const u8 *src0Base, ptrdiff_t src0Stride,
const u8 *src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride, BitwiseXor());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

1337
3rdparty/carotene/src/blur.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

773
3rdparty/carotene/src/canny.cpp vendored Normal file
View File

@ -0,0 +1,773 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
struct RowFilter3x3Canny
{
inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr)
{
vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
lookLeft = offsetk - borderxl;
lookRight = offsetk - borderxr;
}
inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width)
{
uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
ptrdiff_t i = 0;
for (; i < width - 8 + lookRight; i += 8)
{
internal::prefetch(src + i);
uint8x8_t l18u = vld1_u8(src + i + 1);
uint8x8_t l2 = l18u;
uint8x8_t l0 = vext_u8(l, l18u, 6);
int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1));
l = l18u;
int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0));
int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0));
int16x8_t ldy = vaddq_s16(l02, l1x2);
vst1q_s16(dstx + i, ldx);
vst1q_s16(dsty + i, ldy);
}
//tail
if (lookRight == 0 || i != width)
{
uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0));
int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
int16x8_t taildy = vqaddq_s16(tail02, tail1x2);
vst1q_s16(dstx + (width - 8), taildx);
vst1q_s16(dsty + (width - 8), taildy);
}
}
uint8x8_t vfmask;
uint8x8_t vtmask;
enum { offsetk = 1};
ptrdiff_t lookLeft;
ptrdiff_t lookRight;
};
template <bool L2gradient>
inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
ptrdiff_t j = 0;
for (; j <= width - 8; j += 8)
{
ColFilter3x3CannyL1Loop:
int16x8_t line0x = vld1q_s16(src0 + j);
int16x8_t line1x = vld1q_s16(src1 + j);
int16x8_t line2x = vld1q_s16(src2 + j);
int16x8_t line0y = vld1q_s16(src0 + j + width);
int16x8_t line2y = vld1q_s16(src2 + j + width);
int16x8_t l02 = vaddq_s16(line0x, line2x);
int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
int16x8_t dy = vsubq_s16(line2y, line0y);
int16x8_t dx = vaddq_s16(l1x2, l02);
int16x8_t dya = vabsq_s16(dy);
int16x8_t dxa = vabsq_s16(dx);
int16x8_t norm = vaddq_s16(dya, dxa);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s16(dsty + j, dy);
vst1q_s16(dstx + j, dx);
vst1q_s32(mag + j + 4, normh);
vst1q_s32(mag + j, norml);
}
if (j != width)
{
j = width - 8;
goto ColFilter3x3CannyL1Loop;
}
}
template <>
inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width)
{
ptrdiff_t j = 0;
for (; j <= width - 8; j += 8)
{
ColFilter3x3CannyL2Loop:
int16x8_t line0x = vld1q_s16(src0 + j);
int16x8_t line1x = vld1q_s16(src1 + j);
int16x8_t line2x = vld1q_s16(src2 + j);
int16x8_t line0y = vld1q_s16(src0 + j + width);
int16x8_t line2y = vld1q_s16(src2 + j + width);
int16x8_t l02 = vaddq_s16(line0x, line2x);
int16x8_t l1x2 = vshlq_n_s16(line1x, 1);
int16x8_t dy = vsubq_s16(line2y, line0y);
int16x8_t dx = vaddq_s16(l1x2, l02);
int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx));
int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy));
norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy));
normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx));
vst1q_s16(dsty + j, dy);
vst1q_s16(dstx + j, dx);
vst1q_s32(mag + j, norml);
vst1q_s32(mag + j + 4, normh);
}
if (j != width)
{
j = width - 8;
goto ColFilter3x3CannyL2Loop;
}
}
template <bool L2gradient>
inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
ptrdiff_t j = 0;
if (colscn >= 8)
{
int16x8_t vx = vld1q_s16(_dx);
int16x8_t vy = vld1q_s16(_dy);
for (; j <= colscn - 16; j+=8)
{
internal::prefetch(_dx);
internal::prefetch(_dy);
int16x8_t vx2 = vld1q_s16(_dx + j + 8);
int16x8_t vy2 = vld1q_s16(_dy + j + 8);
int16x8_t vabsx = vabsq_s16(vx);
int16x8_t vabsy = vabsq_s16(vy);
int16x8_t norm = vaddq_s16(vabsx, vabsy);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s32(_norm + j + 4, normh);
vst1q_s32(_norm + j + 0, norml);
vx = vx2;
vy = vy2;
}
int16x8_t vabsx = vabsq_s16(vx);
int16x8_t vabsy = vabsq_s16(vy);
int16x8_t norm = vaddq_s16(vabsx, vabsy);
int32x4_t normh = vmovl_s16(vget_high_s16(norm));
int32x4_t norml = vmovl_s16(vget_low_s16(norm));
vst1q_s32(_norm + j + 4, normh);
vst1q_s32(_norm + j + 0, norml);
}
for (; j < colscn; j++)
_norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j]));
}
template <>
inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm)
{
ptrdiff_t j = 0;
if (colscn >= 8)
{
int16x8_t vx = vld1q_s16(_dx);
int16x8_t vy = vld1q_s16(_dy);
for (; j <= colscn - 16; j+=8)
{
internal::prefetch(_dx);
internal::prefetch(_dy);
int16x8_t vxnext = vld1q_s16(_dx + j + 8);
int16x8_t vynext = vld1q_s16(_dy + j + 8);
int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
vst1q_s32(_norm + j + 0, norml);
vst1q_s32(_norm + j + 4, normh);
vx = vxnext;
vy = vynext;
}
int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx));
int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy));
norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy));
normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx));
vst1q_s32(_norm + j + 0, norml);
vst1q_s32(_norm + j + 4, normh);
}
for (; j < colscn; j++)
_norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j];
}
template <bool L2gradient>
inline void prepareThresh(f64 low_thresh, f64 high_thresh,
s32 &low, s32 &high)
{
if (low_thresh > high_thresh)
std::swap(low_thresh, high_thresh);
#if defined __GNUC__
low = (s32)low_thresh;
high = (s32)high_thresh;
low -= (low > low_thresh);
high -= (high > high_thresh);
#else
low = internal::round(low_thresh);
high = internal::round(high_thresh);
f32 ldiff = (f32)(low_thresh - low);
f32 hdiff = (f32)(high_thresh - high);
low -= (ldiff < 0);
high -= (hdiff < 0);
#endif
}
template <>
inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh,
s32 &low, s32 &high)
{
if (low_thresh > high_thresh)
std::swap(low_thresh, high_thresh);
if (low_thresh > 0) low_thresh *= low_thresh;
if (high_thresh > 0) high_thresh *= high_thresh;
#if defined __GNUC__
low = (s32)low_thresh;
high = (s32)high_thresh;
low -= (low > low_thresh);
high -= (high > high_thresh);
#else
low = internal::round(low_thresh);
high = internal::round(high_thresh);
f32 ldiff = (f32)(low_thresh - low);
f32 hdiff = (f32)(high_thresh - high);
low -= (ldiff < 0);
high -= (hdiff < 0);
#endif
}
template <bool L2gradient, bool externalSobel>
struct _normEstimator
{
ptrdiff_t magstep;
ptrdiff_t dxOffset;
ptrdiff_t dyOffset;
ptrdiff_t shxOffset;
ptrdiff_t shyOffset;
std::vector<u8> buffer;
const ptrdiff_t offsetk;
ptrdiff_t borderyt, borderyb;
RowFilter3x3Canny sobelRow;
inline _normEstimator(const Size2D &size, s32, Margin borderMargin,
ptrdiff_t &mapstep, s32** mag_buf, u8* &map):
offsetk(1),
sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left),
std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right))
{
mapstep = size.width + 2;
magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32));
dxOffset = mapstep * sizeof(s32)/sizeof(s16);
dyOffset = dxOffset + size.width * 1;
shxOffset = dxOffset + size.width * 2;
shyOffset = dxOffset + size.width * 3;
buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) );
mag_buf[0] = (s32*)&buffer[0];
mag_buf[1] = mag_buf[0] + magstep;
mag_buf[2] = mag_buf[1] + magstep;
memset(mag_buf[0], 0, mapstep * sizeof(s32));
map = (u8*)(mag_buf[2] + magstep);
memset(map, 1, mapstep);
memset(map + mapstep*(size.height + 1), 1, mapstep);
borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
}
inline void firstRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
s32** mag_buf)
{
//sobelH row #0
const u8* _src = internal::getRowPtr(srcBase, srcStride, 0);
sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width);
//sobelH row #1
_src = internal::getRowPtr(srcBase, srcStride, 1);
sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width);
mag_buf[1][0] = mag_buf[1][size.width+1] = 0;
if (borderyt == 0)
{
//sobelH row #-1
_src = internal::getRowPtr(srcBase, srcStride, -1);
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
else
{
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width);
}
}
inline void nextRow(const Size2D &size, s32,
const u8 *srcBase, ptrdiff_t srcStride,
s16*, ptrdiff_t,
s16*, ptrdiff_t,
const ptrdiff_t &mapstep, s32** mag_buf,
size_t i, const s16* &_x, const s16* &_y)
{
mag_buf[2][0] = mag_buf[2][size.width+1] = 0;
if (i < size.height - borderyb)
{
const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1);
//sobelH row #i+1
sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width);
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else if (i < size.height)
{
ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset,
((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width);
}
else
memset(mag_buf[2], 0, mapstep*sizeof(s32));
_x = ((s16*)mag_buf[1]) + dxOffset;
_y = ((s16*)mag_buf[1]) + dyOffset;
}
};
template <bool L2gradient>
struct _normEstimator<L2gradient, true>
{
std::vector<u8> buffer;
inline _normEstimator(const Size2D &size, s32 cn, Margin,
ptrdiff_t &mapstep, s32** mag_buf, u8* &map)
{
mapstep = size.width + 2;
buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) );
mag_buf[0] = (s32*)&buffer[0];
mag_buf[1] = mag_buf[0] + mapstep*cn;
mag_buf[2] = mag_buf[1] + mapstep*cn;
memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32));
map = (u8*)(mag_buf[2] + mapstep*cn);
memset(map, 1, mapstep);
memset(map + mapstep*(size.height + 1), 1, mapstep);
}
inline void firstRow(const Size2D &size, s32 cn,
const u8 *, ptrdiff_t,
s16* dxBase, ptrdiff_t dxStride,
s16* dyBase, ptrdiff_t dyStride,
s32** mag_buf)
{
s32* _norm = mag_buf[1] + 1;
s16* _dx = internal::getRowPtr(dxBase, dxStride, 0);
s16* _dy = internal::getRowPtr(dyBase, dyStride, 0);
NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
if(cn > 1)
{
for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
{
size_t maxIdx = jn;
for(s32 k = 1; k < cn; ++k)
if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
_norm[j] = _norm[maxIdx];
_dx[j] = _dx[maxIdx];
_dy[j] = _dy[maxIdx];
}
}
_norm[-1] = _norm[size.width] = 0;
}
inline void nextRow(const Size2D &size, s32 cn,
const u8 *, ptrdiff_t,
s16* dxBase, ptrdiff_t dxStride,
s16* dyBase, ptrdiff_t dyStride,
const ptrdiff_t &mapstep, s32** mag_buf,
size_t i, const s16* &_x, const s16* &_y)
{
s32* _norm = mag_buf[(i > 0) + 1] + 1;
if (i < size.height)
{
s16* _dx = internal::getRowPtr(dxBase, dxStride, i);
s16* _dy = internal::getRowPtr(dyBase, dyStride, i);
NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm);
if(cn > 1)
{
for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn)
{
size_t maxIdx = jn;
for(s32 k = 1; k < cn; ++k)
if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
_norm[j] = _norm[maxIdx];
_dx[j] = _dx[maxIdx];
_dy[j] = _dy[maxIdx];
}
}
_norm[-1] = _norm[size.width] = 0;
}
else
memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32));
_x = internal::getRowPtr(dxBase, dxStride, i-1);
_y = internal::getRowPtr(dyBase, dyStride, i-1);
}
};
template <bool L2gradient, bool externalSobel>
inline void Canny3x3(const Size2D &size, s32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
s16 * dxBase, ptrdiff_t dxStride,
s16 * dyBase, ptrdiff_t dyStride,
f64 low_thresh, f64 high_thresh,
Margin borderMargin)
{
s32 low, high;
prepareThresh<L2gradient>(low_thresh, high_thresh, low, high);
ptrdiff_t mapstep;
s32* mag_buf[3];
u8* map;
_normEstimator<L2gradient, externalSobel> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map);
size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 );
std::vector<u8*> stack( maxsize );
u8 **stack_top = &stack[0];
u8 **stack_bottom = &stack[0];
/* sector numbers
(Top-Left Origin)
1 2 3
* * *
* * *
0*******0
* * *
* * *
3 2 1
*/
#define CANNY_PUSH(d) *(d) = u8(2), *stack_top++ = (d)
#define CANNY_POP(d) (d) = *--stack_top
//i == 0
normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf);
// calculate magnitude and angle of gradient, perform non-maxima supression.
// fill the map with one of the following values:
// 0 - the pixel might belong to an edge
// 1 - the pixel can not belong to an edge
// 2 - the pixel does belong to an edge
for (size_t i = 1; i <= size.height; i++)
{
const s16 *_x, *_y;
normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y);
u8* _map = map + mapstep*i + 1;
_map[-1] = _map[size.width] = 1;
s32* _mag = mag_buf[1] + 1; // take the central row
ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
if ((stack_top - stack_bottom) + size.width > maxsize)
{
ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
maxsize = maxsize * 3/2;
stack.resize(maxsize);
stack_bottom = &stack[0];
stack_top = stack_bottom + sz;
}
s32 prev_flag = 0;
for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++)
{
#define CANNY_SHIFT 15
const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
s32 m = _mag[j];
if (m > low)
{
s32 xs = _x[j];
s32 ys = _y[j];
s32 x = abs(xs);
s32 y = abs(ys) << CANNY_SHIFT;
s32 tg22x = x * TG22;
if (y < tg22x)
{
if (m > _mag[j-1] && m >= _mag[j+1]) goto __push;
}
else
{
s32 tg67x = tg22x + (x << (CANNY_SHIFT+1));
if (y > tg67x)
{
if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push;
}
else
{
s32 s = (xs ^ ys) < 0 ? -1 : 1;
if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push;
}
}
}
prev_flag = 0;
_map[j] = u8(1);
continue;
__push:
if (!prev_flag && m > high && _map[j-mapstep] != 2)
{
CANNY_PUSH(_map + j);
prev_flag = 1;
}
else
_map[j] = 0;
}
// scroll the ring buffer
_mag = mag_buf[0];
mag_buf[0] = mag_buf[1];
mag_buf[1] = mag_buf[2];
mag_buf[2] = _mag;
}
// now track the edges (hysteresis thresholding)
while (stack_top > stack_bottom)
{
u8* m;
if ((size_t)(stack_top - stack_bottom) + 8u > maxsize)
{
ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom);
maxsize = maxsize * 3/2;
stack.resize(maxsize);
stack_bottom = &stack[0];
stack_top = stack_bottom + sz;
}
CANNY_POP(m);
if (!m[-1]) CANNY_PUSH(m - 1);
if (!m[1]) CANNY_PUSH(m + 1);
if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
if (!m[-mapstep]) CANNY_PUSH(m - mapstep);
if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
if (!m[mapstep-1]) CANNY_PUSH(m + mapstep - 1);
if (!m[mapstep]) CANNY_PUSH(m + mapstep);
if (!m[mapstep+1]) CANNY_PUSH(m + mapstep + 1);
}
// the final pass, form the final image
uint8x16_t v2 = vmovq_n_u8(2);
const u8* ptrmap = map + mapstep + 1;
for (size_t i = 0; i < size.height; i++, ptrmap += mapstep)
{
u8* _dst = internal::getRowPtr(dstBase, dstStride, i);
ptrdiff_t j = 0;
for (; j < (ptrdiff_t)size.width - 16; j += 16)
{
internal::prefetch(ptrmap);
uint8x16_t vmap = vld1q_u8(ptrmap + j);
uint8x16_t vdst = vceqq_u8(vmap, v2);
vst1q_u8(_dst+j, vdst);
}
for (; j < (ptrdiff_t)size.width; j++)
_dst[j] = (u8)-(ptrmap[j] >> 1);
}
}
} // namespace
#endif
bool isCanny3x3Supported(const Size2D &size)
{
return isSupportedConfiguration() &&
size.height >= 2 && size.width >= 9;
}
void Canny3x3L1(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh,
Margin borderMargin)
{
internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
Canny3x3<false, false>(size, 1,
srcBase, srcStride,
dstBase, dstStride,
NULL, 0,
NULL, 0,
low_thresh, high_thresh,
borderMargin);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)low_thresh;
(void)high_thresh;
(void)borderMargin;
#endif
}
void Canny3x3L2(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh,
Margin borderMargin)
{
internal::assertSupportedConfiguration(isCanny3x3Supported(size));
#ifdef CAROTENE_NEON
Canny3x3<true, false>(size, 1,
srcBase, srcStride,
dstBase, dstStride,
NULL, 0,
NULL, 0,
low_thresh, high_thresh,
borderMargin);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)low_thresh;
(void)high_thresh;
(void)borderMargin;
#endif
}
void Canny3x3L1(const Size2D &size, s32 cn,
s16 * dxBase, ptrdiff_t dxStride,
s16 * dyBase, ptrdiff_t dyStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Canny3x3<false, true>(size, cn,
NULL, 0,
dstBase, dstStride,
dxBase, dxStride,
dyBase, dyStride,
low_thresh, high_thresh,
Margin());
#else
(void)size;
(void)cn;
(void)dstBase;
(void)dstStride;
(void)dxBase;
(void)dxStride;
(void)dyBase;
(void)dyStride;
(void)low_thresh;
(void)high_thresh;
#endif
}
void Canny3x3L2(const Size2D &size, s32 cn,
s16 * dxBase, ptrdiff_t dxStride,
s16 * dyBase, ptrdiff_t dyStride,
u8 * dstBase, ptrdiff_t dstStride,
f64 low_thresh, f64 high_thresh)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Canny3x3<true, true>(size, cn,
NULL, 0,
dstBase, dstStride,
dxBase, dxStride,
dyBase, dyStride,
low_thresh, high_thresh,
Margin());
#else
(void)size;
(void)cn;
(void)dstBase;
(void)dstStride;
(void)dxBase;
(void)dxStride;
(void)dyBase;
(void)dyStride;
(void)low_thresh;
(void)high_thresh;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,486 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
void extract2(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 coi)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t sj = 0u, dj = 0u;
#ifndef ANDROID
for (; dj < roiw32; sj += 64, dj += 32)
{
internal::prefetch(src + sj);
uint8x16x2_t v_src = vld2q_u8(src + sj);
vst1q_u8(dst + dj, v_src.val[coi]);
v_src = vld2q_u8(src + sj + 32);
vst1q_u8(dst + dj + 16, v_src.val[coi]);
}
#endif
for (; dj < roiw8; sj += 16, dj += 8)
{
uint8x8x2_t v_src = vld2_u8(src + sj);
vst1_u8(dst + dj, v_src.val[coi]);
}
for (; dj < size.width; sj += 2, ++dj)
{
dst[dj] = src[sj + coi];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)coi;
#endif
}
void extract3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 coi)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t sj = 0u, dj = 0u;
#ifndef ANDROID
for (; dj < roiw32; sj += 96, dj += 32)
{
internal::prefetch(src + sj);
uint8x16x3_t v_src = vld3q_u8(src + sj);
vst1q_u8(dst + dj, v_src.val[coi]);
v_src = vld3q_u8(src + sj + 48);
vst1q_u8(dst + dj + 16, v_src.val[coi]);
}
#endif
for (; dj < roiw8; sj += 24, dj += 8)
{
uint8x8x3_t v_src = vld3_u8(src + sj);
vst1_u8(dst + dj, v_src.val[coi]);
}
for (; dj < size.width; sj += 3, ++dj)
{
dst[dj] = src[sj + coi];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)coi;
#endif
}
void extract4(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 coi)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t sj = 0u, dj = 0u;
#ifndef ANDROID
for (; dj < roiw32; sj += 128, dj += 32)
{
internal::prefetch(src + sj);
uint8x16x4_t v_src = vld4q_u8(src + sj);
vst1q_u8(dst + dj, v_src.val[coi]);
v_src = vld4q_u8(src + sj + 64);
vst1q_u8(dst + dj + 16, v_src.val[coi]);
}
#endif
for (; dj < roiw8; sj += 32, dj += 8)
{
uint8x8x4_t v_src = vld4_u8(src + sj);
vst1_u8(dst + dj, v_src.val[coi]);
}
for (; dj < size.width; sj += 4, ++dj)
{
dst[dj] = src[sj + coi];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)coi;
#endif
}
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride
#ifdef CAROTENE_NEON
#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i);
#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]);
#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]);
#define SST_LINE(type, n) dst##n[dj] = src[sj + n];
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
#define CONTDST2 srcStride == dst0Stride && \
srcStride == dst1Stride &&
#define CONTDST3 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride &&
#define CONTDST4 srcStride == dst0Stride && \
srcStride == dst1Stride && \
srcStride == dst2Stride && \
srcStride == dst3Stride &&
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define SPLIT_ASM2(sgn, bits) __asm__ ( \
"vld2." #bits " {d0, d2}, [%[in0]] \n\t" \
"vld2." #bits " {d1, d3}, [%[in1]] \n\t" \
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
: \
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3" \
);
#define SPLIT_ASM3(sgn, bits) __asm__ ( \
"vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \
"vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
: \
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5" \
);
#define SPLIT_ASM4(sgn, bits) __asm__ ( \
"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
"vst1." #bits " {d0-d1}, [%[out0]] \n\t" \
"vst1." #bits " {d2-d3}, [%[out1]] \n\t" \
"vst1." #bits " {d4-d5}, [%[out2]] \n\t" \
"vst1." #bits " {d6-d7}, [%[out3]] \n\t" \
: \
: [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \
[in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5","d6","d7" \
);
#define SPLIT_QUAD(sgn, bits, n) { \
internal::prefetch(src + sj); \
SPLIT_ASM##n(sgn, bits) \
}
#else
#define SPLIT_QUAD(sgn, bits, n) { \
internal::prefetch(src + sj); \
vec128 v_src = vld##n##q_##sgn##bits(src + sj); \
FILL_LINES##n(VST1Q, sgn##bits) \
}
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##bits) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \
SPLIT_QUAD(sgn, bits, n) \
\
if (dj < roiw8) \
{ \
vec64 v_src = vld##n##_##sgn##bits(src + sj); \
FILL_LINES##n(VST1, sgn##bits) \
sj += MUL##n(8)/sizeof(sgn##bits); \
dj += 8/sizeof(sgn##bits); \
} \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
FILL_LINES##n(SST, sgn##bits) \
} \
} \
}
#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \
const sgn##64 * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##64) ) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTDST##n \
dst0Stride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \
FILL_LINES##n(VROW, sgn##64) \
size_t sj = 0u, dj = 0u; \
\
for (; dj < size.width; sj += n, ++dj) \
{ \
vec64 v_src = vld##n##_##sgn##64(src + sj); \
FILL_LINES##n(VST1, sgn##64) \
} \
} \
}
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define ALPHA_QUAD(sgn, bits) { \
internal::prefetch(src + sj); \
__asm__ ( \
"vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \
"vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \
"vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \
"vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \
"vst1." #bits " {d6-d7}, [%[out1]] \n\t" \
: \
: [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \
[in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5","d6","d7" \
); \
}
#else
#define ALPHA_QUAD(sgn, bits) { \
internal::prefetch(src + sj); \
union { vec128_4 v4; vec128_3 v3; } vals; \
vals.v4 = vld4q_##sgn##bits(src + sj); \
vst3q_##sgn##bits(dst3 + d3j, vals.v3); \
vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \
}
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \
const sgn##bits * srcBase, ptrdiff_t srcStride, \
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (srcStride == dst3Stride && \
srcStride == dst1Stride && \
srcStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, 4>::vec128 vec128_4; \
typedef internal::VecTraits<sgn##bits, 3>::vec128 vec128_3; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \
typedef internal::VecTraits<sgn##bits, 4>::vec64 vec64_4; \
typedef internal::VecTraits<sgn##bits, 3>::vec64 vec64_3; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \
sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \
sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \
size_t sj = 0u, d3j = 0u, d1j = 0u; \
\
for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \
d1j += 16/sizeof(sgn##bits)) \
ALPHA_QUAD(sgn, bits) \
\
if (d1j < roiw8) \
{ \
union { vec64_4 v4; vec64_3 v3; } vals; \
vals.v4 = vld4_##sgn##bits(src + sj); \
vst3_u8(dst3 + d3j, vals.v3); \
vst1_u8(dst1 + d1j, vals.v4.val[3]); \
sj += MUL4(8)/sizeof(sgn##bits); \
d3j += MUL3(8)/sizeof(sgn##bits); \
d1j += 8/sizeof(sgn##bits); \
} \
\
for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \
{ \
dst3[d3j+0] = src[sj + 0]; \
dst3[d3j+1] = src[sj + 1]; \
dst3[d3j+2] = src[sj + 2]; \
dst1[d1j] = src[sj + 3]; \
} \
} \
}
#else
#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride;
#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride \
FILL_LINES##n(FARG, sgn##bits) ) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
FILL_LINES##n(VOID, sgn##bits) \
}
#define SPLIT64(sgn,n) SPLIT(sgn,64,n)
#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \
const sgn##bits * srcBase, ptrdiff_t srcStride, \
sgn##bits * dst3Base, ptrdiff_t dst3Stride, \
sgn##bits * dst1Base, ptrdiff_t dst1Stride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)srcBase; \
(void)srcStride; \
(void)dst3Base; \
(void)dst3Stride; \
(void)dst1Base; \
(void)dst1Stride; \
}
#endif //CAROTENE_NEON
SPLIT(u, 8,2)
SPLIT(u, 8,3)
SPLIT(u, 8,4)
SPLIT(u,16,2)
SPLIT(u,16,3)
SPLIT(u,16,4)
SPLIT(s,32,2)
SPLIT(s,32,3)
SPLIT(s,32,4)
SPLIT64(s, 2)
SPLIT64(s, 3)
SPLIT64(s, 4)
SPLIT4ALPHA(u,8)
} // namespace CAROTENE_NS

View File

@ -0,0 +1,389 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#define FILL_LINES2(macro,type) \
macro##_LINE(type,0) \
macro##_LINE(type,1)
#define FILL_LINES3(macro,type) \
FILL_LINES2(macro,type) \
macro##_LINE(type,2)
#define FILL_LINES4(macro,type) \
FILL_LINES3(macro,type) \
macro##_LINE(type,3)
#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride
#ifdef CAROTENE_NEON
#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i);
#define PREF_LINE(type, n) internal::prefetch(src##n + sj);
#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj);
#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj);
#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj);
#define SLD_LINE(type, n) dst[dj + n] = src##n[sj];
#define MUL2(val) (val << 1)
#define MUL3(val) (MUL2(val) + val)
#define MUL4(val) (val << 2)
#define CONTSRC2 dstStride == src0Stride && \
dstStride == src1Stride &&
#define CONTSRC3 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride &&
#define CONTSRC4 dstStride == src0Stride && \
dstStride == src1Stride && \
dstStride == src2Stride && \
dstStride == src3Stride &&
#if __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define MERGE_ASM2(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vst2." #bits " {d0, d2}, [%[out0]] \n\t" \
"vst2." #bits " {d1, d3}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3" \
);
#define MERGE_ASM3(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
"vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \
"vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5" \
);
#define MERGE_ASM4(sgn, bits) __asm__ ( \
"vld1." #bits " {d0-d1}, [%[in0]] \n\t" \
"vld1." #bits " {d2-d3}, [%[in1]] \n\t" \
"vld1." #bits " {d4-d5}, [%[in2]] \n\t" \
"vld1." #bits " {d6-d7}, [%[in3]] \n\t" \
"vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \
"vst4." #bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \
: \
: [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \
[out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \
: "d0","d1","d2","d3","d4","d5","d6","d7" \
);
#define MERGE_QUAD(sgn, bits, n) { \
FILL_LINES##n(PREF, sgn##bits) \
MERGE_ASM##n(sgn, bits) \
}
#else
#define MERGE_QUAD(sgn, bits, n) { \
vec128 v_dst; \
/*FILL_LINES##n(PREF, sgn##bits) \
FILL_LINES##n(VLD1Q, sgn##bits)*/ \
FILL_LINES##n(PRLD, sgn##bits) \
vst##n##q_##sgn##bits(dst + dj, v_dst); \
}
#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \
FILL_LINES##n(FARG, sgn##bits), \
sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTSRC##n \
dstStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##bits, n>::vec128 vec128; \
size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \
typedef internal::VecTraits<sgn##bits, n>::vec64 vec64; \
size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
FILL_LINES##n(VROW, sgn##bits) \
sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \
size_t sj = 0u, dj = 0u; \
\
for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \
MERGE_QUAD(sgn, bits, n) \
\
if ( sj < roiw8 ) \
{ \
vec64 v_dst; \
FILL_LINES##n(VLD1, sgn##bits) \
vst##n##_##sgn##bits(dst + dj, v_dst); \
sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \
} \
\
for (; sj < size.width; ++sj, dj += n) \
{ \
FILL_LINES##n(SLD, sgn##bits) \
} \
} \
}
#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \
FILL_LINES##n(FARG, sgn##64), \
sgn##64 * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
Size2D size(_size); \
if (CONTSRC##n \
dstStride == (ptrdiff_t)(size.width)) \
{ \
size.width *= size.height; \
size.height = 1; \
} \
typedef internal::VecTraits<sgn##64, n>::vec64 vec64; \
\
for (size_t i = 0u; i < size.height; ++i) \
{ \
FILL_LINES##n(VROW, sgn##64) \
sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \
size_t sj = 0u, dj = 0u; \
\
for (; sj < size.width; ++sj, dj += n) \
{ \
vec64 v_dst; \
FILL_LINES##n(VLD1, sgn##64) \
vst##n##_##sgn##64(dst + dj, v_dst); \
/*FILL_LINES##n(SLD, sgn##64)*/ \
} \
} \
}
#else
#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride;
#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \
FILL_LINES##n(FARG, sgn##bits), \
sgn##bits * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
FILL_LINES##n(VOID, sgn##bits) \
(void)dstBase; \
(void)dstStride; \
}
#define COMBINE64(sgn,n) COMBINE(sgn,64,n)
#endif //CAROTENE_NEON
COMBINE(u, 8,2)
COMBINE(u, 8,3)
COMBINE(u, 8,4)
COMBINE(u,16,2)
COMBINE(u,16,3)
COMBINE(u,16,4)
COMBINE(s,32,2)
COMBINE(s,32,3)
COMBINE(s,32,4)
COMBINE64(s, 2)
COMBINE64(s, 3)
COMBINE64(s, 4)
void combineYUYV(const Size2D &size,
const u8 * srcyBase, ptrdiff_t srcyStride,
const u8 * srcuBase, ptrdiff_t srcuStride,
const u8 * srcvBase, ptrdiff_t srcvStride,
u8 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; i += 1)
{
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t syj = 0u, sj = 0u, dj = 0u;
#ifndef ANDROID
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
{
internal::prefetch(srcy + syj);
internal::prefetch(srcu + sj);
internal::prefetch(srcv + sj);
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
uint8x16x4_t v_dst;
v_dst.val[0] = v_y.val[0];
v_dst.val[1] = vld1q_u8(srcu + sj);
v_dst.val[2] = v_y.val[1];
v_dst.val[3] = vld1q_u8(srcv + sj);
vst4q_u8(dst + dj, v_dst);
v_y = vld2q_u8(srcy + syj + 32);
v_dst.val[0] = v_y.val[0];
v_dst.val[1] = vld1q_u8(srcu + sj + 16);
v_dst.val[2] = v_y.val[1];
v_dst.val[3] = vld1q_u8(srcv + sj + 16);
vst4q_u8(dst + dj + 64, v_dst);
}
#endif
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
{
uint8x8x2_t v_y = vld2_u8(srcy + syj);
uint8x8x4_t v_dst;
v_dst.val[0] = v_y.val[0];
v_dst.val[1] = vld1_u8(srcu + sj);
v_dst.val[2] = v_y.val[1];
v_dst.val[3] = vld1_u8(srcv + sj);
vst4_u8(dst + dj, v_dst);
}
for (; sj < size.width; ++sj, syj += 2, dj += 4)
{
dst[dj] = srcy[syj];
dst[dj + 1] = srcu[sj];
dst[dj + 2] = srcy[syj + 1];
dst[dj + 3] = srcv[sj];
}
}
#else
(void)size;
(void)srcyBase;
(void)srcyStride;
(void)srcuBase;
(void)srcuStride;
(void)srcvBase;
(void)srcvStride;
(void)dstBase;
(void)dstStride;
#endif
}
void combineUYVY(const Size2D &size,
const u8 * srcyBase, ptrdiff_t srcyStride,
const u8 * srcuBase, ptrdiff_t srcuStride,
const u8 * srcvBase, ptrdiff_t srcvStride,
u8 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef ANDROID
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
#endif
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0u; i < size.height; ++i)
{
const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i);
const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i);
const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t syj = 0u, sj = 0u, dj = 0u;
#ifndef ANDROID
for (; sj < roiw32; sj += 32, syj += 64, dj += 128)
{
internal::prefetch(srcy + syj);
internal::prefetch(srcu + sj);
internal::prefetch(srcv + sj);
uint8x16x2_t v_y = vld2q_u8(srcy + syj);
uint8x16x4_t v_dst;
v_dst.val[0] = vld1q_u8(srcu + sj);
v_dst.val[1] = v_y.val[0];
v_dst.val[2] = vld1q_u8(srcv + sj);
v_dst.val[3] = v_y.val[1];
vst4q_u8(dst + dj, v_dst);
v_y = vld2q_u8(srcy + syj + 32);
v_dst.val[0] = vld1q_u8(srcu + sj + 16);
v_dst.val[1] = v_y.val[0];
v_dst.val[2] = vld1q_u8(srcv + sj + 16);
v_dst.val[3] = v_y.val[1];
vst4q_u8(dst + dj + 64, v_dst);
}
#endif
for (; sj < roiw8; sj += 8, syj += 16, dj += 32)
{
uint8x8x2_t v_y = vld2_u8(srcy + syj);
uint8x8x4_t v_dst;
v_dst.val[0] = vld1_u8(srcu + sj);
v_dst.val[1] = v_y.val[0];
v_dst.val[2] = vld1_u8(srcv + sj);
v_dst.val[3] = v_y.val[1];
vst4_u8(dst + dj, v_dst);
}
for (; sj < size.width; ++sj, syj += 2, dj += 4)
{
dst[dj] = srcu[sj];
dst[dj + 1] = srcy[syj];
dst[dj + 2] = srcv[sj];
dst[dj + 3] = srcy[syj + 1];
}
}
#else
(void)size;
(void)srcyBase;
(void)srcyStride;
(void)srcuBase;
(void)srcuStride;
(void)srcvBase;
(void)srcvStride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

340
3rdparty/carotene/src/cmp.cpp vendored Normal file
View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
template <typename Op, int elsize> struct vtail
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
//do nothing since there couldn't be enough data
(void)src0;
(void)src1;
(void)dst;
(void)op;
(void)x;
(void)width;
}
};
template <typename Op> struct vtail<Op, 2>
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
if( x + 8 < width)
{
vec128 v_src0, v_src1;
uvec128 v_dst;
v_src0 = internal::vld1q(src0 + x);
v_src1 = internal::vld1q(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1(dst + x, internal::vmovn(v_dst));
x+=8;
}
}
};
template <typename Op> struct vtail<Op, 1>
{
static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
u8 * dst, const Op & op,
size_t &x, size_t width)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
typedef typename internal::VecTraits<type>::vec64 vec64;
typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
if( x + 16 < width)
{
vec128 v_src0, v_src1;
uvec128 v_dst;
v_src0 = internal::vld1q(src0 + x);
v_src1 = internal::vld1q(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1q(dst + x, v_dst);
x+=16;
}
if( x + 8 < width)
{
vec64 v_src0, v_src1;
uvec64 v_dst;
v_src0 = internal::vld1(src0 + x);
v_src1 = internal::vld1(src1 + x);
op(v_src0, v_src1, v_dst);
internal::vst1(dst + x, v_dst);
x+=8;
}
}
};
template <typename Op>
void vcompare(Size2D size,
const typename Op::type * src0Base, ptrdiff_t src0Stride,
const typename Op::type * src1Base, ptrdiff_t src1Stride,
u8 * dstBase, ptrdiff_t dstStride, const Op & op)
{
typedef typename Op::type type;
typedef typename internal::VecTraits<type>::vec128 vec128;
typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
if (src0Stride == src1Stride && src0Stride == dstStride &&
src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
{
size.width *= size.height;
size.height = 1;
}
const u32 step_base = 32 / sizeof(type);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
for (size_t y = 0; y < size.height; ++y)
{
const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for( ; x < roiw_base; x += step_base )
{
internal::prefetch(src0 + x);
internal::prefetch(src1 + x);
vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
uvec128 v_dst0;
uvec128 v_dst1;
op(v_src00, v_src10, v_dst0);
op(v_src01, v_src11, v_dst1);
vnst(dst + x, v_dst0, v_dst1);
}
vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);
for (; x < size.width; ++x)
{
op(src0 + x, src1 + x, dst + x);
}
}
}
template<typename T>
struct OpCmpEQ
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vceqq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vceq(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpNE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] == src1[0] ? 0 : 255;
}
};
template<typename T>
struct OpCmpGT
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgtq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcgt(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] > src1[0] ? 255 : 0;
}
};
template<typename T>
struct OpCmpGE
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
{
v_dst = internal::vcgeq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
{
v_dst = internal::vcge(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, u8 * dst) const
{
dst[0] = src0[0] >= src1[0] ? 255 : 0;
}
};
}
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
vcompare(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, \
OpCmp##op<type>()); \
}
#else
#define IMPL_CMPOP(op, type) \
void cmp##op(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
u8 *dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
(void)size; \
(void)src0Base; \
(void)src0Stride; \
(void)src1Base; \
(void)src1Stride; \
(void)dstBase; \
(void)dstStride; \
}
#endif
IMPL_CMPOP(EQ, u8)
IMPL_CMPOP(EQ, s8)
IMPL_CMPOP(EQ, u16)
IMPL_CMPOP(EQ, s16)
IMPL_CMPOP(EQ, u32)
IMPL_CMPOP(EQ, s32)
IMPL_CMPOP(EQ, f32)
IMPL_CMPOP(NE, u8)
IMPL_CMPOP(NE, s8)
IMPL_CMPOP(NE, u16)
IMPL_CMPOP(NE, s16)
IMPL_CMPOP(NE, u32)
IMPL_CMPOP(NE, s32)
IMPL_CMPOP(NE, f32)
IMPL_CMPOP(GT, u8)
IMPL_CMPOP(GT, s8)
IMPL_CMPOP(GT, u16)
IMPL_CMPOP(GT, s16)
IMPL_CMPOP(GT, u32)
IMPL_CMPOP(GT, s32)
IMPL_CMPOP(GT, f32)
IMPL_CMPOP(GE, u8)
IMPL_CMPOP(GE, s8)
IMPL_CMPOP(GE, u16)
IMPL_CMPOP(GE, s16)
IMPL_CMPOP(GE, u32)
IMPL_CMPOP(GE, s32)
IMPL_CMPOP(GE, f32)
} // namespace CAROTENE_NS

2846
3rdparty/carotene/src/colorconvert.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

108
3rdparty/carotene/src/common.cpp vendored Normal file
View File

@ -0,0 +1,108 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cstdlib>
#include <iostream>
#include "common.hpp"
namespace CAROTENE_NS {
bool isSupportedConfiguration()
{
#ifdef CAROTENE_NEON
return true;
#else
return false;
#endif
}
namespace internal {
void assertSupportedConfiguration(bool parametersSupported)
{
if (!isSupportedConfiguration()) {
std::cerr << "internal error: attempted to use an unavailable function" << std::endl;
std::abort();
}
if (!parametersSupported) {
std::cerr << "internal error: attempted to use a function with unsupported parameters" << std::endl;
std::abort();
}
}
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin)
{
ptrdiff_t p = _p + (ptrdiff_t)startMargin;
size_t len = _len + startMargin + endMargin;
if( (size_t)p < len )
return _p;
else if( borderType == BORDER_MODE_REPLICATE )
p = p < 0 ? 0 : (ptrdiff_t)len - 1;
else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 )
{
s32 delta = borderType == BORDER_MODE_REFLECT101;
if( len == 1 )
return 0;
do
{
if( p < 0 )
p = -p - 1 + delta;
else
p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta;
}
while( (size_t)p >= len );
}
else if( borderType == BORDER_MODE_WRAP )
{
if( p < 0 )
p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len;
if( p >= (ptrdiff_t)len )
p %= (ptrdiff_t)len;
}
else if( borderType == BORDER_MODE_CONSTANT )
p = -1;
else
internal::assertSupportedConfiguration(false);
return p - (ptrdiff_t)startMargin;
}
} // namespace internal
} // namespace CAROTENE_NS

96
3rdparty/carotene/src/common.hpp vendored Normal file
View File

@ -0,0 +1,96 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_COMMON_HPP
#define CAROTENE_SRC_COMMON_HPP
#include <cstddef>
#include <algorithm>
#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
#define CAROTENE_NEON
#endif
#ifdef CAROTENE_NEON
#include <arm_neon.h>
#include "intrinsics.hpp"
#endif
#include <carotene/functions.hpp>
#include "saturate_cast.hpp"
namespace CAROTENE_NS { namespace internal {
inline void prefetch(const void *ptr, size_t offset = 32*10)
{
#if defined __GNUC__
__builtin_prefetch(reinterpret_cast<const char*>(ptr) + offset);
#elif defined _MSC_VER && defined CAROTENE_NEON
__prefetch(reinterpret_cast<const char*>(ptr) + offset);
#else
(void)ptr;
(void)offset;
#endif
}
template <typename T>
inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
{
char *baseRaw = const_cast<char *>(reinterpret_cast<const char *>(base));
return reinterpret_cast<T *>(baseRaw + ptrdiff_t(row) * stride);
}
void assertSupportedConfiguration(bool parametersSupported = true);
ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
/*!
* Aligns pointer by the certain number of bytes
*
* This small inline function aligns the pointer by the certain number of bytes by shifting
* it forward by 0 or a positive offset.
*/
template<typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
{
return (T*)(((size_t)ptr + n-1) & -n);
}
}}
#endif

1331
3rdparty/carotene/src/convert.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

399
3rdparty/carotene/src/convert_depth.cpp vendored Normal file
View File

@ -0,0 +1,399 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <int shift>
void lshiftConst(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift));
vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift));
}
for (; j < size.width; j++)
{
dst[j] = ((s16)src[j] << shift);
}
}
}
template <>
void lshiftConst<0>(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src)));
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src)));
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j)));
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
{
dst[j] = (s16)src[j];
}
}
}
template <int shift>
void rshiftConst(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
vqmovun_s16(v_src1));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vqmovun_s16(v_src));
}
for (; j < size.width; j++)
{
dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
vmovn_s16(v_src1));
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
}
for (; j < size.width; j++)
{
dst[j] = (u8)((src[j] >> shift));
}
}
}
}
template <>
void rshiftConst<0>(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vld1q_s16(src + j);
vst1_u8(dst + j, vqmovun_s16(v_src));
}
for (; j < size.width; j++)
{
dst[j] = internal::saturate_cast<u8>(src[j]);
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1));
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vld1q_s16(src + j);
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
}
for (; j < size.width; j++)
{
dst[j] = (u8)src[j];
}
}
}
}
typedef void (* lshiftConstFunc)(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride);
typedef void (* rshiftConstFunc)(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
CONVERT_POLICY cpolicy);
} // namespace
#endif
void lshift(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
u32 shift)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (shift >= 16u)
{
for (size_t i = 0; i < size.height; ++i)
{
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memset(dst, 0, sizeof(s16) * size.width);
}
return;
}
// this ugly contruction is needed to avoid:
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
// return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
lshiftConstFunc funcs[16] =
{
lshiftConst<0>,
lshiftConst<1>,
lshiftConst<2>,
lshiftConst<3>,
lshiftConst<4>,
lshiftConst<5>,
lshiftConst<6>,
lshiftConst<7>,
lshiftConst<8>,
lshiftConst<9>,
lshiftConst<10>,
lshiftConst<11>,
lshiftConst<12>,
lshiftConst<13>,
lshiftConst<14>,
lshiftConst<15>
}, func = funcs[shift];
func(size, srcBase, srcStride, dstBase, dstStride);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)shift;
#endif
}
void rshift(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
u32 shift, CONVERT_POLICY cpolicy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (shift >= 16)
{
if (cpolicy == CONVERT_POLICY_WRAP)
{
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int16x8_t v_zero = vdupq_n_s16(0);
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
vmovn_u16(vcltq_s16(v_src1, v_zero)));
vst1q_u8(dst + j, v_dst);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src = vld1q_s16(src + j);
vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
}
for (; j < size.width; j++)
{
dst[j] = src[j] >= 0 ? 0 : 255;
}
}
}
else
{
for (size_t i = 0; i < size.height; ++i)
{
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
std::memset(dst, 0, sizeof(u8) * size.width);
}
}
return;
}
// this ugly contruction is needed to avoid:
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
rshiftConstFunc funcs[16] =
{
rshiftConst<0>,
rshiftConst<1>,
rshiftConst<2>,
rshiftConst<3>,
rshiftConst<4>,
rshiftConst<5>,
rshiftConst<6>,
rshiftConst<7>,
rshiftConst<8>,
rshiftConst<9>,
rshiftConst<10>,
rshiftConst<11>,
rshiftConst<12>,
rshiftConst<13>,
rshiftConst<14>,
rshiftConst<15>
}, func = funcs[shift];
func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)shift;
(void)cpolicy;
#endif
}
} // namespace CAROTENE_NS

2498
3rdparty/carotene/src/convert_scale.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

340
3rdparty/carotene/src/convolution.cpp vendored Normal file
View File

@ -0,0 +1,340 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
namespace CAROTENE_NS {
bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
BORDER_MODE border)
{
return isSupportedConfiguration() && size.width >= 8 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REPLICATE) &&
(ksize.width == 3) && (ksize.height == 3);
}
#ifdef CAROTENE_NEON
namespace {
template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
return vshrq_n_s32(value, shift);
}
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
return value;
}
} // namespace
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);
#endif
void convolution(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue,
const Size2D & ksize, s16 * kernelBase, u32 scale)
{
internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
const uint8x8_t v_zero_u8 = vdup_n_u8(0);
const uint8x8_t v_border = vdup_n_u8(borderValue);
const int32x4_t v_zero_s32 = vdupq_n_s32(0);
uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
static const vshrq_s32_func vshrq_s32_a[33] =
{
vshrq_s32<0>,
vshrq_s32<1>,
vshrq_s32<2>,
vshrq_s32<3>,
vshrq_s32<4>,
vshrq_s32<5>,
vshrq_s32<6>,
vshrq_s32<7>,
vshrq_s32<8>,
vshrq_s32<9>,
vshrq_s32<10>,
vshrq_s32<11>,
vshrq_s32<12>,
vshrq_s32<13>,
vshrq_s32<14>,
vshrq_s32<15>,
vshrq_s32<16>,
vshrq_s32<17>,
vshrq_s32<18>,
vshrq_s32<19>,
vshrq_s32<20>,
vshrq_s32<21>,
vshrq_s32<22>,
vshrq_s32<23>,
vshrq_s32<24>,
vshrq_s32<25>,
vshrq_s32<26>,
vshrq_s32<27>,
vshrq_s32<28>,
vshrq_s32<29>,
vshrq_s32<30>,
vshrq_s32<31>,
vshrq_s32<32>
};
vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];
for (ptrdiff_t y = 0; y < height; ++y)
{
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
u8 prevx[3] = { 0, 0, 0 },
currx[3] = { 0, 0, 0 },
nextx[3] = { 0, 0, 0 };
ptrdiff_t x = 0;
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
// perform vertical convolution
for ( ; x <= bwidth; x += 8)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
uint8x8_t x1 = vld1_u8(srow1 + x);
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 8 >= bwidth)
{
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx[0] = prevx[1] = prevx[2] = borderValue;
else
{
prevx[0] = srow0 ? srow0[x4] : borderValue;
prevx[1] = srow1[x4] ;
prevx[2] = srow2 ? srow2[x4] : borderValue;
}
currx[0] = srow0 ? srow0[x3] : borderValue;
currx[1] = srow1[x3] ;
currx[2] = srow2 ? srow2[x3] : borderValue;
}
// make shift
if (x)
{
tprev[0] = tcurr[0];
tcurr[0] = tnext[0];
tprev[1] = tcurr[1];
tcurr[1] = tnext[1];
tprev[2] = tcurr[2];
tcurr[2] = tnext[2];
}
tnext[0] = x0;
tnext[1] = x1;
tnext[2] = x2;
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr[0] = tcurr[1] = tcurr[2] = v_border;
else if (border == BORDER_MODE_REPLICATE)
{
tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
}
continue;
}
int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;
{
// combine 3 "shifted" vectors
t0 = vext_u8(tprev[0], tcurr[0], 7);
t1 = tcurr[0];
t2 = vext_u8(tcurr[0], tnext[0], 1);
int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
}
{
// combine 3 "shifted" vectors
t0 = vext_u8(tprev[1], tcurr[1], 7);
t1 = tcurr[1];
t2 = vext_u8(tcurr[1], tnext[1], 1);
int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
}
{
// combine 3 "shifted" vectors
t0 = vext_u8(tprev[2], tcurr[2], 7);
t1 = tcurr[2];
t2 = vext_u8(tcurr[2], tnext[2], 1);
int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
}
// make scale
v_dst0 = vshrq_s32_p(v_dst0);
v_dst1 = vshrq_s32_p(v_dst1);
// and add them
vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
vqmovun_s32(v_dst1))));
}
x -= 8;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
{
nextx[0] = borderValue;
nextx[1] = borderValue;
nextx[2] = borderValue;
}
else if (border == BORDER_MODE_REPLICATE)
{
nextx[0] = srow0[x];
nextx[1] = srow1[x];
nextx[2] = srow2[x];
}
}
else
{
nextx[0] = srow0 ? srow0[x + 1] : borderValue;
nextx[1] = srow1[x + 1] ;
nextx[2] = srow2 ? srow2[x + 1] : borderValue;
}
s32 val = 0;
for (s32 _y = 0; _y < 3; ++_y)
val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
nextx[_y] * kernelBase[(2 - _y) * 3 + 0];
drow[x] = internal::saturate_cast<u8>(val >> scale);
// make shift
prevx[0] = currx[0];
currx[0] = nextx[0];
prevx[1] = currx[1];
currx[1] = nextx[1];
prevx[2] = currx[2];
currx[2] = nextx[2];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
(void)ksize;
(void)kernelBase;
(void)scale;
#endif
}
} // namespace CAROTENE_NS

430
3rdparty/carotene/src/count_nonzero.cpp vendored Normal file
View File

@ -0,0 +1,430 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <limits>
namespace CAROTENE_NS {
s32 countNonZero(const Size2D &_size,
const u8 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw16 = size.width & ~15u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const u8* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
#define COUNTNONZERO8U_BLOCK_SIZE (16*255)
uint8x16_t vc1 = vmovq_n_u8(1);
for (; i < roiw16;)
{
size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16;
uint8x16_t vs = vmovq_n_u8(0);
for (; i <= lim; i+= 16)
{
internal::prefetch(src + i);
uint8x16_t vln = vld1q_u8(src + i);
uint8x16_t vnz = vminq_u8(vln, vc1);
vs = vaddq_u8(vs, vnz);
}
uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs));
uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
s32 s[2];
vst1_u32((u32*)s, vs2);
if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros...
{
return 0x7fFFffFF;
}
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)
{
return 0x7fFFffFF;
}
}
for (; i < size.width; i++)
result += (src[i] != 0)?1:0;
if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros...
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const u16 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw8 = size.width & ~7u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const u16* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
#define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1))
uint16x8_t vc1 = vmovq_n_u16(1);
for (; i < roiw8;)
{
size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8;
uint16x8_t vs = vmovq_n_u16(0);
for (; i <= lim; i+= 8)
{
internal::prefetch(src + i);
uint16x8_t vln = vld1q_u16(src + i);
uint16x8_t vnz = vminq_u16(vln, vc1);
vs = vaddq_u16(vs, vnz);
}
uint32x4_t vs4 = vpaddlq_u16(vs);
uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4));
s32 s[2];
vst1_u32((u32*)s, vs2);
if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros...
{
return 0x7fFFffFF;
}
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)
{
return 0x7fFFffFF;
}
}
for (; i < size.width; i++)
result += (src[i] != 0)?1:0;
if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const s32 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw4 = size.width & ~3u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k);
u32 i = 0;
uint32x4_t vc1 = vmovq_n_u32(1);
uint32x4_t vs = vmovq_n_u32(0);
for (; i < roiw4; i += 4 )
{
internal::prefetch(src + i);
uint32x4_t vln = vld1q_u32(src + i);
uint32x4_t vnz = vminq_u32(vln, vc1);
vs = vqaddq_u32(vs, vnz);
}
uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
s32 s[2];
vst1_u32((u32*)s, vs2);
if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros...
{
return 0x7fFFffFF;
}
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)
{
return 0x7fFFffFF;
}
for (; i < size.width; i++)
result += (src[i] != 0)?1:0;
if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros...
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const f32 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw4 = size.width & ~3u;
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const f32* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
float32x4_t vc0 = vmovq_n_f32(0);
int32x4_t vs = vmovq_n_s32(0);
for (; i < roiw4; i += 4 )
{
internal::prefetch(src + i);
float32x4_t vln = vld1q_f32(src + i);
int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0)));
vs = vqaddq_s32(vs, vnz);
}
int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs)));
int s[2];
vst1_s32(s, vs2);
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros...
{
return 0x7fFFffFF;
}
for (; i < size.width; i++)
result += (src[i] < std::numeric_limits<float>::min() && src[i] > -std::numeric_limits<float>::min())?0:1;
if (result < 0)
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
s32 countNonZero(const Size2D &_size,
const f64 * srcBase, ptrdiff_t srcStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
size_t roiw8 = size.width & ~7u;
size_t roiw4 = size.width & ~3u;
size_t roiw2 = size.width & ~1u;
uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero
uint32x4_t vc0 = vmovq_n_u32(0);
s32 result = 0;
for(size_t k = 0; k < size.height; ++k)
{
const f64* src = internal::getRowPtr( srcBase, srcStride, k);
size_t i = 0;
int32x2_t vs1 = vmov_n_s32(0);
int32x2_t vs2 = vmov_n_s32(0);
int32x2_t vs3 = vmov_n_s32(0);
int32x2_t vs4 = vmov_n_s32(0);
for (; i < roiw8; i += 8 )
{
internal::prefetch(src + i + 6);
uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4));
uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6));
uint64x2_t vm1 = vandq_u64(vln1, vmask1);
uint64x2_t vm2 = vandq_u64(vln2, vmask1);
uint64x2_t vm3 = vandq_u64(vln3, vmask1);
uint64x2_t vm4 = vandq_u64(vln4, vmask1);
uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0);
uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0);
uint32x4_t vlx1 = vmvnq_u32(vequ1);
uint32x4_t vlx2 = vmvnq_u32(vequ2);
uint32x4_t vlx3 = vmvnq_u32(vequ3);
uint32x4_t vlx4 = vmvnq_u32(vequ4);
int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3)));
int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4)));
vs1 = vqadd_s32(vs1, vnz1);
vs2 = vqadd_s32(vs2, vnz2);
vs3 = vqadd_s32(vs3, vnz3);
vs4 = vqadd_s32(vs4, vnz4);
}
if (i < roiw4)
{
internal::prefetch(src + i + 2);
uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2));
uint64x2_t vm1 = vandq_u64(vln1, vmask1);
uint64x2_t vm2 = vandq_u64(vln2, vmask1);
uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0);
uint32x4_t vlx1 = vmvnq_u32(vequ1);
uint32x4_t vlx2 = vmvnq_u32(vequ2);
int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2)));
vs1 = vqadd_s32(vs1, vnz1);
vs2 = vqadd_s32(vs2, vnz2);
i += 4;
}
if (i < roiw2)
{
internal::prefetch(src + i);
uint64x2_t vln1 = vld1q_u64((const u64*)(src + i));
uint64x2_t vm1 = vandq_u64(vln1, vmask1);
uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0);
uint32x4_t vlx1 = vmvnq_u32(vequ1);
int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1)));
vs1 = vqadd_s32(vs1, vnz1);
i += 2;
}
vs1 = vqadd_s32(vs1, vs2);
vs3 = vqadd_s32(vs3, vs4);
vs1 = vqadd_s32(vs1, vs3);
int32x2_t vsneg = vqneg_s32(vs1);
s32 s[2];
vst1_s32(s, vsneg);
result += (s[0] += s[1]);
if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros...
{
return 0x7fFFffFF;
}
for (; i < size.width; i++)
result += (src[i] < std::numeric_limits<double>::min() && src[i] > -std::numeric_limits<double>::min())?0:1;
if (result < 0)
{
return 0x7fFFffFF;
}
}
return result;
#else
(void)_size;
(void)srcBase;
(void)srcStride;
return 0;
#endif
}
} // namespace CAROTENE_NS

694
3rdparty/carotene/src/div.cpp vendored Normal file
View File

@ -0,0 +1,694 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2016, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
#include <cfloat>
#include <cmath>
#include <limits>
namespace CAROTENE_NS {
namespace {
#ifdef CAROTENE_NEON
template <typename T>
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
{
return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)),
internal::vmovl(internal::vget_low(v2)), scale)),
internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)),
internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
template <typename T>
inline T divSaturate(const T &v1, const T &v2, const float scale)
{
return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
template <>
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
template <typename T>
inline T divWrapQ(const T &v1, const T &v2, const float scale)
{
return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)),
internal::vmovl(internal::vget_low(v2)), scale)),
internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)),
internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t divWrapQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
template <>
inline uint32x4_t divWrapQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
template <typename T>
inline T divWrap(const T &v1, const T &v2, const float scale)
{
return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
}
template <>
inline int32x2_t divWrap<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
template <>
inline uint32x2_t divWrap<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); }
inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); }
inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); }
inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); }
inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); }
inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); }
inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); }
inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); }
inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); }
inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); }
inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); }
inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); }
#endif
template <typename T>
void div(const Size2D &size,
const T * src0Base, ptrdiff_t src0Stride,
const T * src1Base, ptrdiff_t src1Stride,
T * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::vec64 vec64;
if (scale == 0.0f ||
(std::numeric_limits<T>::is_integer &&
(scale * std::numeric_limits<T>::max()) < 1.0f &&
(scale * std::numeric_limits<T>::max()) > -1.0f))
{
for (size_t y = 0; y < size.height; ++y)
{
T * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(T) * size.width);
}
return;
}
const size_t step128 = 16 / sizeof(T);
size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
const size_t step64 = 8 / sizeof(T);
size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
T * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
vec128 v_src0 = internal::vld1q(src0 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src0 = internal::vld1(src0 + j);
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0;
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
vec128 v_src0 = internal::vld1q(src0 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src0 = internal::vld1(src0 + j);
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
}
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)cpolicy;
(void)scale;
#endif
}
#ifdef CAROTENE_NEON
template <typename T>
inline T recipSaturateQ(const T &v2, const float scale)
{
return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)),
internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t recipSaturateQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipSaturateQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
template <typename T>
inline T recipSaturate(const T &v2, const float scale)
{
return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipSaturate<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipSaturate<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
template <typename T>
inline T recipWrapQ(const T &v2, const float scale)
{
return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)),
internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale))
);
}
template <>
inline int32x4_t recipWrapQ<int32x4_t>(const int32x4_t &v2, const float scale)
{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); }
template <>
inline uint32x4_t recipWrapQ<uint32x4_t>(const uint32x4_t &v2, const float scale)
{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); }
template <typename T>
inline T recipWrap(const T &v2, const float scale)
{
return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale));
}
template <>
inline int32x2_t recipWrap<int32x2_t>(const int32x2_t &v2, const float scale)
{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); }
template <>
inline uint32x2_t recipWrap<uint32x2_t>(const uint32x2_t &v2, const float scale)
{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); }
#endif
template <typename T>
void recip(const Size2D &size,
const T * src1Base, ptrdiff_t src1Stride,
T * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::vec64 vec64;
if (scale == 0.0f ||
(std::numeric_limits<T>::is_integer &&
scale < 1.0f &&
scale > -1.0f))
{
for (size_t y = 0; y < size.height; ++y)
{
T * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(T) * size.width);
}
return;
}
const size_t step128 = 16 / sizeof(T);
size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
const size_t step64 = 8 / sizeof(T);
size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
T * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (cpolicy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src1 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0;
}
}
else // CONVERT_POLICY_WRAP
{
for (; j < roiw128; j += step128)
{
internal::prefetch(src1 + j);
vec128 v_src1 = internal::vld1q(src1 + j);
vec128 v_mask = vtstq(v_src1,v_src1);
internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale)));
}
for (; j < roiw64; j += step64)
{
vec64 v_src1 = internal::vld1(src1 + j);
vec64 v_mask = vtst(v_src1,v_src1);
internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0;
}
}
}
#else
(void)size;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)cpolicy;
(void)scale;
#endif
}
}
void div(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy);
}
void div(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (scale == 0.0f)
{
for (size_t y = 0; y < size.height; ++y)
{
f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(f32) * size.width);
}
return;
}
float32x4_t v_zero = vdupq_n_f32(0.0f);
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
if (std::fabs(scale - 1.0f) < FLT_EPSILON)
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src0 = vld1_f32(src0 + j);
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f;
}
}
}
else
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale),
internal::vrecpq_f32(v_src1))), v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src0 = vld1_f32(src0 + j);
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale),
internal::vrecp_f32(v_src1))), v_mask)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? src0[j] * scale / src1[j] : 0.0f;
}
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
void reciprocal(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const s8 * srcBase, ptrdiff_t srcStride,
s8 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const s32 * srcBase, ptrdiff_t srcStride,
s32 * dstBase, ptrdiff_t dstStride,
f32 scale,
CONVERT_POLICY cpolicy)
{
recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy);
}
void reciprocal(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (scale == 0.0f)
{
for (size_t y = 0; y < size.height; ++y)
{
f32 * dst = internal::getRowPtr(dstBase, dstStride, y);
std::memset(dst, 0, sizeof(f32) * size.width);
}
return;
}
float32x4_t v_zero = vdupq_n_f32(0.0f);
size_t roiw128 = size.width >= 3 ? size.width - 3 : 0;
size_t roiw64 = size.width >= 1 ? size.width - 1 : 0;
if (std::fabs(scale - 1.0f) < FLT_EPSILON)
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src1 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? 1.0f / src1[j] : 0;
}
}
}
else
{
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw128; j += 4)
{
internal::prefetch(src1 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
uint32x4_t v_mask = vceqq_f32(v_src1,v_zero);
vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32(
vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1),
scale)),v_mask)));
}
for (; j < roiw64; j += 2)
{
float32x2_t v_src1 = vld1_f32(src1 + j);
uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero));
vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32(
vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1),
scale)), v_mask)));
}
for (; j < size.width; j++)
{
dst[j] = src1[j] ? scale / src1[j] : 0;
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
} // namespace CAROTENE_NS

260
3rdparty/carotene/src/dot_product.cpp vendored Normal file
View File

@ -0,0 +1,260 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
f64 dotProduct(const Size2D &_size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (src0Stride == src1Stride &&
src0Stride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
#define DOT_UINT_BLOCKSIZE 66050*8
f64 result = 0.0;
for (size_t row = 0; row < size.height; ++row)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
size_t i = 0;
uint64x2_t ws = vmovq_n_u64(0);
while(i + 16 <= size.width)
{
size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;
uint32x4_t s1 = vmovq_n_u32(0);
uint32x4_t s2 = vmovq_n_u32(0);
for (; i <= lim; i += 16)
{
internal::prefetch(src0 + i);
internal::prefetch(src1 + i);
uint8x16_t vs1 = vld1q_u8(src0 + i);
uint8x16_t vs2 = vld1q_u8(src1 + i);
uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));
uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));
s1 = vpadalq_u16(s1, vdot1);
s2 = vpadalq_u16(s2, vdot2);
}
ws = vpadalq_u32(ws, s1);
ws = vpadalq_u32(ws, s2);
}
if(i + 8 <= size.width)
{
uint8x8_t vs1 = vld1_u8(src0 + i);
uint8x8_t vs2 = vld1_u8(src1 + i);
ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));
i += 8;
}
result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);
for (; i < size.width; ++i)
result += s32(src0[i]) * s32(src1[i]);
}
return result;
#else
(void)_size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
return 0;
#endif
}
f64 dotProduct(const Size2D &_size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (src0Stride == src1Stride &&
src0Stride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
f64 result = 0.0;
for (size_t row = 0; row < size.height; ++row)
{
const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
size_t i = 0;
int64x2_t ws = vmovq_n_s64(0);
while(i + 16 <= size.width)
{
size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;
int32x4_t s1 = vmovq_n_s32(0);
int32x4_t s2 = vmovq_n_s32(0);
for (; i <= lim; i += 16)
{
internal::prefetch(src0 + i);
internal::prefetch(src1 + i);
int8x16_t vs1 = vld1q_s8(src0 + i);
int8x16_t vs2 = vld1q_s8(src1 + i);
int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));
s1 = vpadalq_s16(s1, vdot1);
s2 = vpadalq_s16(s2, vdot2);
}
ws = vpadalq_s32(ws, s1);
ws = vpadalq_s32(ws, s2);
}
if(i + 8 <= size.width)
{
int8x8_t vs1 = vld1_s8(src0 + i);
int8x8_t vs2 = vld1_s8(src1 + i);
ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
i += 8;
}
result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);
for (; i < size.width; ++i)
result += s32(src0[i]) * s32(src1[i]);
}
return result;
#else
(void)_size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
return 0;
#endif
}
f64 dotProduct(const Size2D &_size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
Size2D size(_size);
if (src0Stride == src1Stride &&
src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
{
size.width *= size.height;
size.height = 1;
}
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
f64 result = 0.0;
for (size_t row = 0; row < size.height; ++row)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
size_t i = 0;
while(i + 4 <= size.width)
{
size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
float32x4_t v_sum = vdupq_n_f32(0.0f);
for( ; i <= lim; i += 4 )
{
internal::prefetch(src0 + i);
internal::prefetch(src1 + i);
v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
}
float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));
result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
}
if(i + 2 <= size.width)
{
float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
i += 2;
}
for (; i < size.width; ++i)
result += src0[i] * src1[i];
}
return result;
#else
(void)_size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
return 0;
#endif
}
} // namespace CAROTENE_NS

428
3rdparty/carotene/src/fast.cpp vendored Normal file
View File

@ -0,0 +1,428 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten.
Below is the original copyright and the references */
/*
Copyright (c) 2006, 2008 Edward Rosten
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
*Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
*Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
*Neither the name of the University of Cambridge nor the names of
its contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
The references are:
* Machine learning for high-speed corner detection,
E. Rosten and T. Drummond, ECCV 2006
* Faster and better: A machine learning approach to corner detection
E. Rosten, R. Porter and T. Drummond, PAMI, 2009
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace
{
void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride)
{
pixel[0] = 0 + row_stride * 3;
pixel[1] = 1 + row_stride * 3;
pixel[2] = 2 + row_stride * 2;
pixel[3] = 3 + row_stride * 1;
pixel[4] = 3 + row_stride * 0;
pixel[5] = 3 + row_stride * -1;
pixel[6] = 2 + row_stride * -2;
pixel[7] = 1 + row_stride * -3;
pixel[8] = 0 + row_stride * -3;
pixel[9] = -1 + row_stride * -3;
pixel[10] = -2 + row_stride * -2;
pixel[11] = -3 + row_stride * -1;
pixel[12] = -3 + row_stride * 0;
pixel[13] = -3 + row_stride * 1;
pixel[14] = -2 + row_stride * 2;
pixel[15] = -1 + row_stride * 3;
}
u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[])
{
const s32 K = 8, N = 16 + K + 1;
s32 k, v = ptr[0];
s16 d[(N + 7) & ~7];
for( k = 0; k < N; k++ )
d[k] = (s16)(v - ptr[pixel[k]]);
int16x8_t q0 = vdupq_n_s16((s16)(-1000));
int16x8_t q1 = vdupq_n_s16((s16)(1000));
int16x8_t d0_7 = vld1q_s16(d + 0);
int16x8_t d8_15 = vld1q_s16(d + 8);
int16x8_t d16_23 = vld1q_s16(d + 16);
int16x8_t d24 = vld1q_s16(d + 24);
//k == 0
int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1);
int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2);
int16x8_t ak0 = vminq_s16(v0k0, v1k0);
int16x8_t bk0 = vmaxq_s16(v0k0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 3);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 4);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 5);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
v1k0 = vextq_s16(d0_7, d8_15, 6);
ak0 = vminq_s16(ak0, v1k0);
bk0 = vmaxq_s16(bk0, v1k0);
v0k0 = vextq_s16(d0_7, d8_15, 7);
ak0 = vminq_s16(ak0, v0k0);
bk0 = vmaxq_s16(bk0, v0k0);
ak0 = vminq_s16(ak0, d8_15);
bk0 = vmaxq_s16(bk0, d8_15);
q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7));
q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7));
v1k0 = vextq_s16(d8_15, d16_23, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0));
q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0));
//k == 8
int16x8_t v0k8 = v1k0;
int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2);
int16x8_t ak8 = vminq_s16(v0k8, v1k8);
int16x8_t bk8 = vmaxq_s16(v0k8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 3);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 4);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 5);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
v1k8 = vextq_s16(d8_15, d16_23, 6);
ak8 = vminq_s16(ak8, v1k8);
bk8 = vmaxq_s16(bk8, v1k8);
v0k8 = vextq_s16(d8_15, d16_23, 7);
ak8 = vminq_s16(ak8, v0k8);
bk8 = vmaxq_s16(bk8, v0k8);
ak8 = vminq_s16(ak8, d16_23);
bk8 = vmaxq_s16(bk8, d16_23);
q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15));
q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15));
v1k8 = vextq_s16(d16_23, d24, 1);
q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8));
q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8));
//fin
int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1));
int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q));
int32x4_t q2w = vmovl_s16(q2);
int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w));
int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32)));
return (u8)(vget_lane_s32(q8, 0) - 1);
}
} //namespace
#endif
void FAST(const Size2D &size,
u8 *srcBase, ptrdiff_t srcStride,
KeypointStore *keypoints,
u8 threshold, bool nonmax_suppression)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
//keypoints.clear();
const s32 K = 8, N = 16 + K + 1;
ptrdiff_t i, j, k, pixel[N];
makeOffsets(pixel, srcStride);
for(k = 16; k < N; k++)
pixel[k] = pixel[k - 16];
uint8x16_t delta = vdupq_n_u8(128);
uint8x16_t t = vdupq_n_u8(threshold);
uint8x16_t K16 = vdupq_n_u8((u8)K);
u8 threshold_tab[512];
for( i = -255; i <= 255; i++ )
threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0);
std::vector<u8> _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128);
u8* buf[3];
buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width;
ptrdiff_t* cpbuf[3];
cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1;
cpbuf[1] = cpbuf[0] + size.width + 1;
cpbuf[2] = cpbuf[1] + size.width + 1;
memset(buf[0], 0, size.width*3);
for(i = 3; i < (ptrdiff_t)size.height-2; i++)
{
const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3;
u8* curr = buf[(i - 3)%3];
ptrdiff_t* cornerpos = cpbuf[(i - 3)%3];
memset(curr, 0, size.width);
ptrdiff_t ncorners = 0;
if( i < (ptrdiff_t)size.height - 3 )
{
j = 3;
for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16)
{
internal::prefetch(ptr);
internal::prefetch(ptr + pixel[0]);
internal::prefetch(ptr + pixel[2]);
uint8x16_t v0 = vld1q_u8(ptr);
int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta));
int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta));
int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta));
int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta));
int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta));
int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta));
uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2));
uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3)));
m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2)));
m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0)));
m0 = vorrq_u8(m0, m1);
u64 mask[2];
vst1q_u64(mask, vreinterpretq_u64_u8(m0));
if( mask[0] == 0 )
{
if (mask[1] != 0)
{
j -= 8;
ptr -= 8;
}
continue;
}
uint8x16_t c0 = vmovq_n_u8(0);
uint8x16_t c1 = vmovq_n_u8(0);
uint8x16_t max0 = vmovq_n_u8(0);
uint8x16_t max1 = vmovq_n_u8(0);
for( k = 0; k < N; k++ )
{
int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta));
m0 = vcgtq_s8(x, v2);
m1 = vcgtq_s8(v1, x);
c0 = vandq_u8(vsubq_u8(c0, m0), m0);
c1 = vandq_u8(vsubq_u8(c1, m1), m1);
max0 = vmaxq_u8(max0, c0);
max1 = vmaxq_u8(max1, c1);
}
max0 = vmaxq_u8(max0, max1);
u8 m[16];
vst1q_u8(m, vcgtq_u8(max0, K16));
for( k = 0; k < 16; ++k )
if(m[k])
{
cornerpos[ncorners++] = j+k;
if(nonmax_suppression)
curr[j+k] = cornerScore(ptr+k, pixel);
}
}
for( ; j < (s32)size.width - 3; j++, ptr++ )
{
s32 v = ptr[0];
const u8* tab = &threshold_tab[0] - v + 255;
s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]];
d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]];
d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]];
if( d == 0 )
continue;
d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]];
d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]];
d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]];
d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]];
if( d & 1 )
{
s32 vt = v - threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x < vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
if( d & 2 )
{
s32 vt = v + threshold, count = 0;
for( k = 0; k < N; k++ )
{
s32 x = ptr[pixel[k]];
if(x > vt)
{
if( ++count > K )
{
cornerpos[ncorners++] = j;
if(nonmax_suppression)
curr[j] = cornerScore(ptr, pixel);
break;
}
}
else
count = 0;
}
}
}
}
cornerpos[-1] = ncorners;
if( i == 3 )
continue;
const u8* prev = buf[(i - 4 + 3)%3];
const u8* pprev = buf[(i - 5 + 3)%3];
cornerpos = cpbuf[(i - 4 + 3)%3];
ncorners = cornerpos[-1];
for( k = 0; k < ncorners; k++ )
{
j = cornerpos[k];
s32 score = prev[j];
if( !nonmax_suppression ||
(score > prev[j+1] && score > prev[j-1] &&
score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] &&
score > curr[j-1] && score > curr[j] && score > curr[j+1]) )
{
keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score);
}
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)keypoints;
(void)threshold;
(void)nonmax_suppression;
#endif
}
} // namespace CAROTENE_NS

442
3rdparty/carotene/src/fill_minmaxloc.cpp vendored Normal file
View File

@ -0,0 +1,442 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void process(const T * src, size_t j0, size_t j1, size_t i,
T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
for (size_t j = j0; j < j1; ++j)
{
T val = src[j];
if (val == maxVal)
{
if (maxLocCount < maxLocCapacity)
{
maxLocPtr[maxLocCount] = j;
maxLocPtr[maxLocCount + 1] = i;
}
maxLocCount += 2;
}
if (val == minVal)
{
if (minLocCount < minLocCapacity)
{
minLocPtr[minLocCount] = j;
minLocPtr[minLocCount + 1] = i;
}
minLocCount += 2;
}
}
}
} // namespace
#endif
void fillMinMaxLocs(const Size2D & size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal);
uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal);
u64 mask[2] = { 0ul };
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint8x16_t v_src = vld1q_u8(src + j);
uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16);
uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16);
uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask);
vst1q_u8((u8 *)&mask[0], v_mask);
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
uint8x8_t v_src = vld1_u8(src + j);
uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8);
uint8x8_t v_minmask = vceq_u8(v_src, v_minval8);
uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], v_mask);
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const u16 * srcBase, ptrdiff_t srcStride,
u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint16x8_t v_maxval8 = vdupq_n_u16(maxVal),
v_minval8 = vdupq_n_u16(minVal);
u64 mask[2] = { 0ul };
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8));
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint16x8_t v_src = vld1q_u16(src + j);
uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const s16 * srcBase, ptrdiff_t srcStride,
s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int16x8_t v_maxval8 = vdupq_n_s16(maxVal),
v_minval8 = vdupq_n_s16(minVal);
u64 mask[2] = { 0ul };
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw16; j += 16)
{
internal::prefetch(src + j);
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8));
uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8));
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1)));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
if (mask[1])
process(src, j + 8, j + 16, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
int16x8_t v_src = vld1q_s16(src + j);
uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8);
uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8);
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask);
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask));
if (mask[0])
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const s32 * srcBase, ptrdiff_t srcStride,
s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
int32x4_t v_maxval4 = vdupq_n_s32(maxVal),
v_minval4 = vdupq_n_s32(minVal);
u64 mask = 0ul;
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const s32 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4);
uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4));
uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4));
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
if (mask)
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
void fillMinMaxLocs(const Size2D & size,
const u32 * srcBase, ptrdiff_t srcStride,
u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity,
u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
uint32x4_t v_maxval4 = vdupq_n_u32(maxVal),
v_minval4 = vdupq_n_u32(minVal);
u64 mask = 0ul;
minLocCapacity <<= 1;
maxLocCapacity <<= 1;
for (size_t i = 0; i < size.height; ++i)
{
const u32 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0;
for ( ; j < roiw8; j += 8)
{
internal::prefetch(src + j);
uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4);
uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4));
uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4));
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1))));
if (mask)
process(src, j, j + 8, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
process(src, j, size.width, i,
minVal, minLocPtr, minLocCount, minLocCapacity,
maxVal, maxLocPtr, maxLocCount, maxLocCapacity);
}
minLocCount >>= 1;
maxLocCount >>= 1;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)minVal;
(void)minLocPtr;
(void)minLocCount;
(void)minLocCapacity;
(void)maxVal;
(void)maxLocPtr;
(void)maxLocCount;
(void)maxLocCapacity;
#endif
}
} // namespace CAROTENE_NS

222
3rdparty/carotene/src/flip.cpp vendored Normal file
View File

@ -0,0 +1,222 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cstring>
namespace CAROTENE_NS {
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
{
bool supportedElemSize = (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4);
return isSupportedConfiguration() &&
((supportedElemSize && ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE))) ||
(flipMode == FLIP_VERTICAL_MODE));
}
#ifdef CAROTENE_NEON
namespace {
template <typename T>
void flip(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
typedef typename VecTraits<T>::vec128 vec128;
typedef typename VecTraits<T>::vec64 vec64;
u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t js = 0, jd = size.width;
for (; js < roiw_base; js += step_base, jd -= step_base)
{
prefetch(src + js);
vec128 v_src = vld1q(src + js);
vec128 v_dst = vrev64q(v_src);
v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
vst1q(dst + jd - step_base, v_dst);
}
for (; js < roiw_tail; js += step_tail, jd -= step_tail)
{
vec64 v_src = vld1(src + js);
vst1(dst + jd - step_tail, vrev64(v_src));
}
for (--jd; js < size.width; ++js, --jd)
dst[jd] = src[js];
}
}
template <typename T>
void flip3(const Size2D & size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode)
{
using namespace internal;
#ifndef ANDROID
typedef typename VecTraits<T, 3>::vec128 vec128;
#endif
typedef typename VecTraits<T, 3>::vec64 vec64;
#ifndef ANDROID
u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
#endif
u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const T * src = getRowPtr((const T *)srcBase, srcStride, i);
T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
size_t j = 0, js = 0, jd = size.width * 3;
#ifndef ANDROID
for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
{
prefetch(src + js);
vec128 v_src = vld3q(src + js), v_dst;
v_src.val[0] = vrev64q(v_src.val[0]);
v_src.val[1] = vrev64q(v_src.val[1]);
v_src.val[2] = vrev64q(v_src.val[2]);
v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));
vst3q(dst + jd - step_base3, v_dst);
}
#endif // ANDROID
for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
{
vec64 v_src = vld3(src + js), v_dst;
v_dst.val[0] = vrev64(v_src.val[0]);
v_dst.val[1] = vrev64(v_src.val[1]);
v_dst.val[2] = vrev64(v_src.val[2]);
vst3(dst + jd - step_tail3, v_dst);
}
for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
{
dst[jd] = src[js];
dst[jd + 1] = src[js + 1];
dst[jd + 2] = src[js + 2];
}
}
}
typedef void (* flipFunc)(const Size2D &size,
const void * srcBase, ptrdiff_t srcStride,
void * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode);
} // namespace
#endif
void flip(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
FLIP_MODE flipMode, u32 elemSize)
{
internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
#ifdef CAROTENE_NEON
if (flipMode == FLIP_VERTICAL_MODE)
{
for (size_t y = 0; y < size.height; ++y)
{
const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1);
std::memcpy(dst_row, src_row, elemSize * size.width);
}
return;
}
flipFunc func = NULL;
if (elemSize == (u32)sizeof(u8))
func = &flip<u8>;
if (elemSize == (u32)sizeof(u16))
func = &flip<u16>;
if (elemSize == (u32)sizeof(u32))
func = &flip<u32>;
if (elemSize == (u32)sizeof(u8) * 3)
func = &flip3<u8>;
if (func == NULL)
return;
func(size,
srcBase, srcStride,
dstBase, dstStride,
flipMode);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)flipMode;
(void)elemSize;
#endif
}
} // namespace CAROTENE_NS

1059
3rdparty/carotene/src/gaussian_blur.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

195
3rdparty/carotene/src/in_range.cpp vendored Normal file
View File

@ -0,0 +1,195 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
template <typename T, int elsize> struct vtail
{
static inline void inRange(const T *, const T *, const T *,
u8 *, size_t &, size_t)
{
//do nothing since there couldn't be enough data
}
};
template <typename T> struct vtail<T, 2>
{
static inline void inRange(const T * src, const T * rng1, const T * rng2,
u8 * dst, size_t &x, size_t width)
{
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
//There no more than 15 elements in the tail, so we could handle 8 element vector only once
if( x + 8 < width)
{
vec128 vs = internal::vld1q( src + x);
vec128 vr1 = internal::vld1q(rng1 + x);
vec128 vr2 = internal::vld1q(rng2 + x);
uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
internal::vst1(dst + x, internal::vmovn(vd));
x+=8;
}
}
};
template <typename T> struct vtail<T, 1>
{
static inline void inRange(const T * src, const T * rng1, const T * rng2,
u8 * dst, size_t &x, size_t width)
{
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
typedef typename internal::VecTraits<T>::vec64 vec64;
typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
//There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
if( x + 16 < width)
{
vec128 vs = internal::vld1q( src + x);
vec128 vr1 = internal::vld1q(rng1 + x);
vec128 vr2 = internal::vld1q(rng2 + x);
uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
internal::vst1q(dst + x, vd);
x+=16;
}
if( x + 8 < width)
{
vec64 vs = internal::vld1( src + x);
vec64 vr1 = internal::vld1(rng1 + x);
vec64 vr2 = internal::vld1(rng2 + x);
uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
internal::vst1(dst + x, vd);
x+=8;
}
}
};
template <typename T>
inline void inRangeCheck(const Size2D &_size,
const T * srcBase, ptrdiff_t srcStride,
const T * rng1Base, ptrdiff_t rng1Stride,
const T * rng2Base, ptrdiff_t rng2Stride,
u8 * dstBase, ptrdiff_t dstStride)
{
typedef typename internal::VecTraits<T>::vec128 vec128;
typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
Size2D size(_size);
if (srcStride == dstStride &&
srcStride == rng1Stride &&
srcStride == rng2Stride &&
srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
const size_t width = size.width & ~( 32/sizeof(T) - 1 );
for(size_t j = 0; j < size.height; ++j)
{
const T * src = internal::getRowPtr( srcBase, srcStride, j);
const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
size_t i = 0;
for( ; i < width; i += 32/sizeof(T) )
{
internal::prefetch(src + i);
internal::prefetch(rng1 + i);
internal::prefetch(rng2 + i);
vec128 vs = internal::vld1q( src + i);
vec128 vr1 = internal::vld1q(rng1 + i);
vec128 vr2 = internal::vld1q(rng2 + i);
uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
vs = internal::vld1q( src + i + 16/sizeof(T));
vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
vnst(dst + i, vd1, vd2);
}
vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
for( ; i < size.width; i++ )
dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
}
}
}
#define INRANGEFUNC(T) \
void inRange(const Size2D &_size, \
const T * srcBase, ptrdiff_t srcStride, \
const T * rng1Base, ptrdiff_t rng1Stride, \
const T * rng2Base, ptrdiff_t rng2Stride, \
u8 * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
inRangeCheck(_size, srcBase, srcStride, \
rng1Base, rng1Stride, rng2Base, rng2Stride, \
dstBase, dstStride); \
}
#else
#define INRANGEFUNC(T) \
void inRange(const Size2D &, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
const T *, ptrdiff_t, \
u8 *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
INRANGEFUNC(u8)
INRANGEFUNC(s8)
INRANGEFUNC(u16)
INRANGEFUNC(s16)
INRANGEFUNC(s32)
INRANGEFUNC(f32)
} // namespace CAROTENE_NS

238
3rdparty/carotene/src/integral.cpp vendored Normal file
View File

@ -0,0 +1,238 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
namespace CAROTENE_NS {
void integral(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u32 * sumBase, ptrdiff_t sumStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
uint32x4_t v_zero = vmovq_n_u32(0u);
// the first iteration
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
u32 * sum = internal::getRowPtr(sumBase, sumStride, 0);
uint32x4_t prev = v_zero;
size_t j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sum + j);
internal::prefetch(src + j);
uint8x8_t el8shr0 = vld1_u8(src + j);
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8));
uint32x4_t vsumh = vaddw_u16(prev, el4h);
vst1q_u32(sum + j, vsuml);
vst1q_u32(sum + j + 4, vsumh);
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
}
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
sum[j] = (v += src[j]);
// the others
for (size_t i = 1; i < size.height ; ++i)
{
src = internal::getRowPtr(srcBase, srcStride, i);
u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1);
sum = internal::getRowPtr(sumBase, sumStride, i);
prev = v_zero;
j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sum + j);
internal::prefetch(src + j);
uint32x4_t vsuml = vld1q_u32(prevSum + j);
uint32x4_t vsumh = vld1q_u32(prevSum + j + 4);
uint8x8_t el8shr0 = vld1_u8(src + j);
uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8));
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16));
uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24));
vsuml = vaddq_u32(vsuml, prev);
vsumh = vaddq_u32(vsumh, prev);
uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2);
uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3);
uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03);
uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8));
vsuml = vaddw_u16(vsuml, vget_low_u16(el8));
vsumh = vaddw_u16(vsumh, el4h);
vst1q_u32(sum + j, vsuml);
vst1q_u32(sum + j + 4, vsumh);
prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3));
}
for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j)
sum[j] = (v += src[j]) + prevSum[j];
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)sumBase;
(void)sumStride;
#endif
}
void sqrIntegral(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
f64 * sqsumBase, ptrdiff_t sqsumStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
uint16x8_t v_zero8 = vmovq_n_u16(0u);
// the first iteration
const u8 * src = internal::getRowPtr(srcBase, srcStride, 0);
f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0);
double prev = 0.;
size_t j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sqsum + j);
internal::prefetch(src + j);
uint8x8_t vsrc = vld1_u8(src + j);
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
u32 buf[8];
vst1_u32(buf, vget_low_u32(el8shr01l));
vst1_u32(buf+2, el2l);
vst1_u32(buf+4, el2hl);
vst1_u32(buf+6, el2hh);
for(u32 k=0; k < 8; k++)
sqsum[j+k] = prev + buf[k];
prev += buf[7];
}
for (; j < size.width; ++j)
sqsum[j] = (prev += src[j]*src[j]);
// the others
for (size_t i = 1; i < size.height ; ++i)
{
src = internal::getRowPtr(srcBase, srcStride, i);
f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1);
sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i);
prev = 0.;
j = 0u;
for ( ; j + 7 < size.width; j += 8)
{
internal::prefetch(sqsum + j);
internal::prefetch(src + j);
uint8x8_t vsrc = vld1_u8(src + j);
uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc);
uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7);
uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1));
uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1));
uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h);
uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l));
uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l));
uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h));
u32 buf[8];
vst1_u32(buf, vget_low_u32(el8shr01l));
vst1_u32(buf+2, el2l);
vst1_u32(buf+4, el2hl);
vst1_u32(buf+6, el2hh);
for(u32 k=0; k < 8; k++)
sqsum[j+k] = prev + prevSqSum[j+k] + buf[k];
prev += buf[7];
}
for (; j < size.width; ++j)
sqsum[j] = (prev += src[j]*src[j]) + prevSqSum[j];
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)sqsumBase;
(void)sqsumStride;
#endif
}
} // namespace CAROTENE_NS

112
3rdparty/carotene/src/intrinsics.hpp vendored Normal file
View File

@ -0,0 +1,112 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_INTRINSICS_HPP
#define CAROTENE_INTRINSICS_HPP
#include <carotene/definitions.hpp>
#include <arm_neon.h>
namespace CAROTENE_NS { namespace internal {
/////////////// Custom NEON intrinsics ///////////////////
// calculate reciprocal value
inline float32x4_t vrecpq_f32(float32x4_t val)
{
float32x4_t reciprocal = vrecpeq_f32(val);
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
return reciprocal;
}
inline float32x2_t vrecp_f32(float32x2_t val)
{
float32x2_t reciprocal = vrecpe_f32(val);
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
return reciprocal;
}
// caclulate sqrt value
inline float32x4_t vrsqrtq_f32(float32x4_t val)
{
float32x4_t e = vrsqrteq_f32(val);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
return e;
}
inline float32x2_t vrsqrt_f32(float32x2_t val)
{
float32x2_t e = vrsqrte_f32(val);
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
return e;
}
inline float32x4_t vsqrtq_f32(float32x4_t val)
{
return vrecpq_f32(vrsqrtq_f32(val));
}
inline float32x2_t vsqrt_f32(float32x2_t val)
{
return vrecp_f32(vrsqrt_f32(val));
}
// table lookup with the table in a 128-bit register
inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
{
#ifdef __aarch64__
// AArch64 supports this natively
return ::vqtbl1_u8(a, b);
#else
union { uint8x16_t v; uint8x8x2_t w; } u = { a };
return vtbl2_u8(u.w, b);
#endif
}
} }
#endif

713
3rdparty/carotene/src/laplacian.cpp vendored Normal file
View File

@ -0,0 +1,713 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
namespace CAROTENE_NS {
bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
{
return isSupportedConfiguration() && size.width >= 8 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REPLICATE);
}
void Laplacian3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
#ifdef CAROTENE_NEON
const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
const uint16x8_t v_zero = vdupq_n_u16(0);
const uint8x8_t v_border = vdup_n_u8(borderValue);
uint8x8_t vsub;
uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
for (ptrdiff_t y = 0; y < height; ++y)
{
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
s16 prevx = 0, currx = 0, nextx = 0;
ptrdiff_t x = 0;
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
// perform vertical convolution
for ( ; x <= bwidth; x += 8)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
uint8x8_t x1 = vld1_u8(srow1 + x);
uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 8 >= bwidth)
{
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx = borderValue;
else
prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
}
// make shift
if (x)
{
tprev = tcurr;
tcurr = tnext;
}
// and calculate next value
tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr = v_border_x3;
else if (border == BORDER_MODE_REPLICATE)
tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
vsub = x1;
continue;
}
// combine 3 "shifted" vectors
t0 = vextq_u16(tprev, tcurr, 7);
t1 = tcurr;
t2 = vextq_u16(tcurr, tnext, 1);
// and add them
t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
uint8x8_t it0 = vqmovun_s16(tt0);
vst1_u8(drow + x - 8, it0);
vsub = x1;
}
x -= 8;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
nextx = borderValue * 3;
else if (border == BORDER_MODE_REPLICATE)
nextx = srow2[x] + srow1[x] + srow0[x];
}
else
{
nextx = (srow2 ? srow2[x + 1] : borderValue) +
srow1[x + 1] +
(srow0 ? srow0[x + 1] : borderValue);
}
s32 val = (prevx + currx + nextx) - 9 * srow1[x];
drow[x] = internal::saturate_cast<u8>((s32)val);
// make shift
prevx = currx;
currx = nextx;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
{
return isSupportedConfiguration() &&
size.width >= 8 && size.height >= 1 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REFLECT ||
border == BORDER_MODE_REFLECT101 ||
border == BORDER_MODE_REPLICATE);
}
void Laplacian1OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
const u8* v0 = 0;
const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v2 = 0;
// make border
if (border == BORDER_MODE_REFLECT101) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
} else {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
int16x8_t tcurr = vmovq_n_s16(0x0);
int16x8_t tnext = vmovq_n_s16(0x0);
int16x8_t t0, t2;
uint8x8_t xx0 = vmov_n_u8(0x0);
uint8x8_t xx1 = vmov_n_u8(0x0);
uint8x8_t xx2 = vmov_n_u8(0x0);
ptrdiff_t x = 0;
const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
if(x) {
xx0 = xx1;
xx1 = xx2;
} else {
xx1 = x1;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
}
else if (border == BORDER_MODE_CONSTANT)
{
xx1 = vset_lane_u8(borderValue, x1, 7);
}
else if (border == BORDER_MODE_REFLECT101)
{
xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
}
}
xx2 = x1;
if(x) {
tcurr = tnext;
}
tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
if(!x) {
tcurr = tnext;
continue;
}
t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
vst1q_s16(drow + x - 8, t0);
}
x -= 8;
if(x == cols){
x--;
}
for( ; x < cols; x++ )
{
s16 nextx;
s16 prevx;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
prevx = x == 0 ? v1[0] : v1[x-1];
nextx = x == cols-1 ? v1[x] : v1[x+1];
}
else if (border == BORDER_MODE_REFLECT101)
{
prevx = x == 0 ? v1[1] : v1[x-1];
nextx = x == cols-1 ? v1[x-1] : v1[x+1];
}
else //if (border == BORDER_MODE_CONSTANT)
{
prevx = x == 0 ? borderValue : v1[x-1];
nextx = x == cols-1 ? borderValue : v1[x+1];
}
*(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
void Laplacian3OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
const u8* v0 = 0;
const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v2 = 0;
// make border
if (border == BORDER_MODE_REFLECT101) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
} else {
v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
int16x8_t tprev = vmovq_n_s16(0x0);
int16x8_t tcurr = vmovq_n_s16(0x0);
int16x8_t tnext = vmovq_n_s16(0x0);
int16x8_t tc = vmovq_n_s16(0x0);
int16x8_t t0, t2, tcnext;
ptrdiff_t x = 0;
const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));
if(x) {
tprev = tcurr;
tcurr = tnext;
}
tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
if(!x) {
tcurr = tnext;
tc = tcnext;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
}
else if (border == BORDER_MODE_CONSTANT)
{
tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
}
else if (border == BORDER_MODE_REFLECT101)
{
tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
}
continue;
}
t0 = vextq_s16(tprev, tcurr, 7);
t2 = vextq_s16(tcurr, tnext, 1);
t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
tc = tcnext;
t0 = vshlq_n_s16(t0, 1);
vst1q_s16(drow + x - 8, t0);
}
x -= 8;
if(x == cols){
x--;
}
for( ; x < cols; x++ )
{
s16 nextx, nextx2;
s16 prevx, prevx2;
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
{
prevx = x == 0 ? v0[0] : v0[x-1];
prevx2 = x == 0 ? v2[0] : v2[x-1];
nextx = x == cols-1 ? v0[x] : v0[x+1];
nextx2 = x == cols-1 ? v2[x] : v2[x+1];
}
else if (border == BORDER_MODE_REFLECT101)
{
prevx = x == 0 ? v0[1] : v0[x-1];
prevx2 = x == 0 ? v2[1] : v2[x-1];
nextx = x == cols-1 ? v0[x-1] : v0[x+1];
nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
}
else //if (border == BORDER_MODE_CONSTANT)
{
prevx = x == 0 ? borderValue : v0[x-1];
prevx2 = x == 0 ? borderValue : v2[x-1];
nextx = x == cols-1 ? borderValue : v0[x+1];
nextx2 = x == cols-1 ? borderValue : v2[x+1];
}
s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
*(drow+x) = 2*res;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
void Laplacian5OpenCV(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
#ifdef CAROTENE_NEON
ptrdiff_t rows = size.height, cols = size.width;
std::vector<u8> _tmp;
u8 *tmp = 0;
if (border == BORDER_MODE_CONSTANT)
{
_tmp.assign(cols + 4,borderValue);
tmp = &_tmp[2];
}
for( ptrdiff_t y = 0; y < rows; y++ )
{
const u8* v0 = 0;
const u8* v1 = 0;
const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
const u8* v3 = 0;
const u8* v4 = 0;
// make border
if (border == BORDER_MODE_REPLICATE) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
} else if (border == BORDER_MODE_REFLECT) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
} else if (border == BORDER_MODE_REFLECT101) {
v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1)
} else if (border == BORDER_MODE_CONSTANT) {
v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
}
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
int16x8_t tnext, tc, t0;
int16x8_t tnext2, tnext3;
int16x8_t tnext1Old, tnext2Old, tnext3Old;
int16x8_t tnext4OldOldOld, tnext5OldOldOld;
int16x8_t tcurr1 = vmovq_n_s16(0x0);
int16x8_t tnext1 = vmovq_n_s16(0x0);
int16x8_t tprev1 = vmovq_n_s16(0x0);
int16x8_t tpprev1 = vmovq_n_s16(0x0);
int16x8_t tppprev1 = vmovq_n_s16(0x0);
int16x8_t tnext4Old = vmovq_n_s16(0x0);
int16x8_t tnext5Old = vmovq_n_s16(0x0);
int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
// do vertical convolution
ptrdiff_t x = 0;
const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
for( ; x <= bcols; x += 8 )
{
internal::prefetch(v0 + x);
internal::prefetch(v1 + x);
internal::prefetch(v2 + x);
internal::prefetch(v3 + x);
internal::prefetch(v4 + x);
uint8x8_t x0 = vld1_u8(v0 + x);
uint8x8_t x1 = vld1_u8(v1 + x);
uint8x8_t x2 = vld1_u8(v2 + x);
uint8x8_t x3 = vld1_u8(v3 + x);
uint8x8_t x4 = vld1_u8(v4 + x);
if(x) {
tcurr1 = tnext1;
}
tnext4OldOldOld = tnext4Old;
tnext5OldOldOld = tnext5Old;
tnext1Old = tnext1OldOld;
tnext2Old = tnext2OldOld;
tnext3Old = tnext3OldOld;
tnext4Old = tnext4OldOld;
tnext5Old = tnext5OldOld;
tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
tnext3 = vshlq_n_s16(tnext3, 1);
tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
tnext2 = vsubq_s16(tc, tnext);
tnext1 = vaddq_s16(tnext3, tnext2);
// tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4
tnext2 = vshlq_n_s16(tnext2, 1);
// tnext2 = 2*x4 - 4*x2 + 2*x0
tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
// tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4
tnext1OldOld = tnext1;
tnext2OldOld = tnext2;
tnext3OldOld = tnext3;
tnext4OldOld = tnext2;
tnext5OldOld = tnext1;
if(x) {
tnext1 = vextq_s16(tnext1Old, tnext1, 2);
tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
tprev1 = tnext3Old;
if(x!=8) {
tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
}
}
if(!x) {
// make border
if (border == BORDER_MODE_REPLICATE) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
} else if (border == BORDER_MODE_REFLECT101) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
} else if (border == BORDER_MODE_CONSTANT) {
tpprev1 = vextq_s16(tnext2, tnext2, 7);
tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
tprev1 = vextq_s16(tnext1, tnext1, 6);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
}
tppprev1 = tprev1;
continue;
}
t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
t0 = vaddq_s16(t0, t0);
vst1q_s16(drow + x - 8, t0);
}
x -= 8;
if(x >= cols - 1)
x = cols-2;
s16 pprevx = 0;
s16 prevx = 0;
s16 nextx = 0;
s16 nnextx = 0;
for( ; x < cols; x++ )
{
if (x == 0) {
// make border
if (border == BORDER_MODE_REPLICATE) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
pprevx = 8 * borderValue;
prevx = 0;
}
} else if (x == 1) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
} else if (border == BORDER_MODE_REFLECT101) {
pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
} else if (border == BORDER_MODE_CONSTANT) {
pprevx = 8 * borderValue;
}
prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
} else {
pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
}
s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
if (x == cols-1) {
// make border
if (border == BORDER_MODE_REPLICATE) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_REFLECT) {
nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
} else if (border == BORDER_MODE_REFLECT101) {
nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
} else if (border == BORDER_MODE_CONSTANT) {
nextx = 0;
nnextx = 8 * borderValue;
}
} else if (x == cols-2) {
// make border
if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
} else if (border == BORDER_MODE_REFLECT101) {
nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
} else if (border == BORDER_MODE_CONSTANT) {
nnextx = 8 * borderValue;
}
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
} else {
nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
}
s16 res = pprevx + prevx + currx + nextx + nnextx;
*(drow+x) = 2*res;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

160
3rdparty/carotene/src/magnitude.cpp vendored Normal file
View File

@ -0,0 +1,160 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
#include <cmath>
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
struct Magnitude
{
typedef s16 type;
void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
int16x8_t & v_dst) const
{
int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
v_src0_p = vget_high_s16(v_src0);
v_src1_p = vget_high_s16(v_src1);
float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));
v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
}
void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
int16x4_t & v_dst) const
{
float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
v_dst = vqmovn_s32(v_sqrt);
}
void operator() (const short * src0, const short * src1, short * dst) const
{
f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
}
};
struct MagnitudeF32
{
typedef f32 type;
void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
float32x4_t & v_dst) const
{
v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
}
void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
float32x2_t & v_dst) const
{
v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
}
void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
{
dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
}
};
} // namespace
#endif
void magnitude(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
Magnitude());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void magnitude(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
MagnitudeF32());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

163
3rdparty/carotene/src/meanstddev.cpp vendored Normal file
View File

@ -0,0 +1,163 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cmath>
namespace CAROTENE_NS {
void meanStdDev(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
f32 * pMean, f32 * pStdDev)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
f64 fsum = 0.0f, fsqsum = 0.0f;
sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1);
// calc mean and stddev
f64 itotal = 1.0 / size.total();
f64 mean = fsum * itotal;
f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
if (pMean)
*pMean = mean;
if (pStdDev)
*pStdDev = stddev;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)pMean;
(void)pStdDev;
#endif
}
void meanStdDev(const Size2D &size,
const u16 * srcBase, ptrdiff_t srcStride,
f32 * pMean, f32 * pStdDev)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
f64 fsum = 0.0f, fsqsum = 0.0f;
f32 arsum[8];
uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;
for (size_t i = 0; i < size.height; ++i)
{
const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0u;
while (j < roiw4)
{
size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
v_sum = v_zero;
v_sqsum = v_zero_f;
for ( ; j + 16 < blockSize ; j += 16)
{
internal::prefetch(src + j);
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);
// 0
uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
// 1
v_srclo = vmovl_u16(vget_low_u16(v_src1));
v_srchi = vmovl_u16(vget_high_u16(v_src1));
v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
v_srclo_f = vcvtq_f32_u32(v_srclo);
v_srchi_f = vcvtq_f32_u32(v_srchi);
v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
}
for ( ; j < blockSize; j += 4)
{
uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
float32x4_t v_src_f = vcvtq_f32_u32(v_src);
v_sum = vaddq_u32(v_sum, v_src);
v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
}
vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
vst1q_f32(arsum + 4, v_sqsum);
fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
}
// collect a few last elements in the current row
for ( ; j < size.width; ++j)
{
f32 srcval = src[j];
fsum += srcval;
fsqsum += srcval * srcval;
}
}
// calc mean and stddev
f64 itotal = 1.0 / size.total();
f64 mean = fsum * itotal;
f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));
if (pMean)
*pMean = mean;
if (pStdDev)
*pStdDev = stddev;
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)pMean;
(void)pStdDev;
#endif
}
} // namespace CAROTENE_NS

227
3rdparty/carotene/src/median_filter.cpp vendored Normal file
View File

@ -0,0 +1,227 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
/*
* The code here is based on the code in
* <http://ndevilla.free.fr/median/median/src/optmed.c>, which is in public domain.
* See also <http://ndevilla.free.fr/median/median/index.html>.
*/
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn)
{
u8 buf[16+8];
vst1q_u8(buf+cn, r);
for (u32 i = 0; i < cn; ++i) buf[i] = buf[cn+i];
return vld1q_u8(buf);
}
uint8x8_t getRightReplicate(uint8x8_t r, u32 cn)
{
u8 buf[8+8];
vst1_u8(buf, r);
for (u32 i = 0; i < cn; ++i) buf[8+i] = buf[8-cn+i];
return vld1_u8(buf+cn);
}
} // namespace
//o------^-------^-----------------------------o 0
// | |
//o--^---v---^---|-------^---------------------o 1
// | | | |
//o--v-------v---|-------|-^-------^-------^---o 2
// | | | | |
//o------^-------v-----^-|-|-------|-------|---o 3
// | | | | | |
//o--^---v---^-----^---|-v-|---^---v---^---v---o 4
// | | | | | | |
//o--v-------v---^-|---|---v---|-------|-------o 5
// | | | | |
//o------^-------|-|---v-------|-------v-------o 6
// | | | |
//o--^---v---^---|-v-----------v---------------o 7
// | | |
//o--v-------v---v-----------------------------o 8
#define ELT(num, level) v ## num ## _lv ## level
#define PIX_SORT(a, alvl, b, blvl, newlvl) \
PIX_MIN(a, alvl, b, blvl, newlvl); \
PIX_MAX(a, alvl, b, blvl, newlvl);
#define SORT9 \
PIX_SORT(1, 00, 2, 00, 01); \
PIX_SORT(4, 00, 5, 00, 02); \
PIX_SORT(7, 00, 8, 00, 03); \
PIX_SORT(0, 00, 1, 01, 04); \
PIX_SORT(3, 00, 4, 02, 05); \
PIX_SORT(6, 00, 7, 03, 06); \
PIX_SORT(1, 04, 2, 01, 07); \
PIX_SORT(4, 05, 5, 02, 08); \
PIX_SORT(7, 06, 8, 03, 09); \
PIX_MAX (0, 04, 3, 05, 10); \
PIX_MIN (5, 08, 8, 09, 11); \
PIX_SORT(4, 08, 7, 09, 12); \
PIX_MAX (3, 10, 6, 06, 13); \
PIX_MAX (1, 07, 4, 12, 14); \
PIX_MIN (2, 07, 5, 11, 15); \
PIX_MIN (4, 14, 7, 12, 16); \
PIX_SORT(4, 16, 2, 15, 17); \
PIX_MAX (6, 13, 4, 17, 18); \
PIX_MIN (4, 18, 2, 17, 19);
#endif
bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels)
{
return isSupportedConfiguration() && size.width >= 16 + numChannels && numChannels <= 8;
}
void medianFilter3x3(const Size2D &size, u32 numChannels,
const u8 *srcBase, ptrdiff_t srcStride,
const Margin &srcMargin,
u8 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels));
#ifdef CAROTENE_NEON
u32 cn = numChannels;
size_t colsn = size.width * cn;
for (size_t i = 0; i < size.height; ++i) {
const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i);
const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride;
const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride;
u8* pdst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
{
uint8x16_t v3_lv00 = vld1q_u8(psrc0);
uint8x16_t v4_lv00 = vld1q_u8(psrc1);
uint8x16_t v5_lv00 = vld1q_u8(psrc2);
uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn);
uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn);
uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn);
uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn);
uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn);
uint8x16_t v2_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn);
goto medianBlur3x3_mainBody;
for (; j < colsn - 16; j += 16) {
internal::prefetch(psrc0 + j);
internal::prefetch(psrc1 + j);
internal::prefetch(psrc2 + j);
v0_lv00 = vld1q_u8(psrc0 + j - cn);
v1_lv00 = vld1q_u8(psrc1 + j - cn);
v2_lv00 = vld1q_u8(psrc2 + j - cn);
v3_lv00 = vld1q_u8(psrc0 + j);
v4_lv00 = vld1q_u8(psrc1 + j);
v5_lv00 = vld1q_u8(psrc2 + j);
v6_lv00 = vld1q_u8(psrc0 + j + cn);
v7_lv00 = vld1q_u8(psrc1 + j + cn);
v8_lv00 = vld1q_u8(psrc2 + j + cn);
medianBlur3x3_mainBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1q_u8(pdst + j, v4_lv19);
}
}
{
size_t k = colsn - 8;
uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn);
uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn);
uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn);
uint8x8_t v3_lv00 = vld1_u8(psrc0 + k);
uint8x8_t v4_lv00 = vld1_u8(psrc1 + k);
uint8x8_t v5_lv00 = vld1_u8(psrc2 + k);
uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn);
uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn);
uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn);
goto medianBlur3x3_tailBody;
for (; k >= j - 8; k -= 8) {
v0_lv00 = vld1_u8(psrc0 + k - cn);
v1_lv00 = vld1_u8(psrc1 + k - cn);
v2_lv00 = vld1_u8(psrc2 + k - cn);
v3_lv00 = vld1_u8(psrc0 + k);
v4_lv00 = vld1_u8(psrc1 + k);
v5_lv00 = vld1_u8(psrc2 + k);
v6_lv00 = vld1_u8(psrc0 + k + cn);
v7_lv00 = vld1_u8(psrc1 + k + cn);
v8_lv00 = vld1_u8(psrc2 + k + cn);
medianBlur3x3_tailBody:
#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl))
#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl))
SORT9;
#undef PIX_MAX
#undef PIX_MIN
vst1_u8(pdst + k, v4_lv19);
}
}
}
#else
(void)size;
(void)numChannels;
(void)srcBase;
(void)srcStride;
(void)srcMargin;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

139
3rdparty/carotene/src/min_max.cpp vendored Normal file
View File

@ -0,0 +1,139 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <algorithm>
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T>
struct Min
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vminq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmin(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::min(src0[0], src1[0]);
}
};
template <typename T>
struct Max
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vmaxq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vmax(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = std::max(src0[0], src1[0]);
}
};
} // namespace
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &size, \
const type * src0Base, ptrdiff_t src0Stride, \
const type * src1Base, ptrdiff_t src1Stride, \
type * dstBase, ptrdiff_t dstStride) \
{ \
internal::assertSupportedConfiguration(); \
internal::vtransform(size, \
src0Base, src0Stride, \
src1Base, src1Stride, \
dstBase, dstStride, op<type>()); \
}
#else
#define IMPL_OP(fun, op, type) \
void fun(const Size2D &, \
const type *, ptrdiff_t, \
const type *, ptrdiff_t, \
type *, ptrdiff_t) \
{ \
internal::assertSupportedConfiguration(); \
}
#endif
#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
IMPL_MINMAX(u8)
IMPL_MINMAX(s8)
IMPL_MINMAX(u16)
IMPL_MINMAX(s16)
IMPL_MINMAX(u32)
IMPL_MINMAX(s32)
IMPL_MINMAX(f32)
} // namespace CAROTENE_NS

1340
3rdparty/carotene/src/minmaxloc.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

728
3rdparty/carotene/src/morph.cpp vendored Normal file
View File

@ -0,0 +1,728 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <algorithm>
#include <limits>
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border)
{
return isSupportedConfiguration() && size.width >= 16 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REPLICATE);
}
#ifdef CAROTENE_NEON
namespace {
struct ErodeVecOp
{
ErodeVecOp():borderValue(0){}
ErodeVecOp(BORDER_MODE border, u8 borderValue_) :
borderValue(borderValue_)
{
if (border == BORDER_MODE_REPLICATE)
borderValue = std::numeric_limits<u8>::max();
}
inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
{
return vminq_u8(a, b);
}
inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
{
return vmin_u8(a, b);
}
inline u8 operator()(u8 a, u8 b) const
{
return std::min(a, b);
}
u8 borderValue;
};
struct DilateVecOp
{
DilateVecOp():borderValue(0){}
DilateVecOp(BORDER_MODE border, u8 borderValue_) :
borderValue(borderValue_)
{
if (border == BORDER_MODE_REPLICATE)
borderValue = std::numeric_limits<u8>::min();
}
inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const
{
return vmaxq_u8(a, b);
}
inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const
{
return vmax_u8(a, b);
}
inline u8 operator()(u8 a, u8 b) const
{
return std::max(a, b);
}
u8 borderValue;
};
template <typename VecOp>
void morph3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, const VecOp & vop)
{
u8 borderValue = vop.borderValue;
ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
const uint8x16_t v_zero = vdupq_n_u8(0);
const uint8x16_t v_border = vdupq_n_u8(borderValue);
uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
for (ptrdiff_t y = 0; y < height; ++y)
{
const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
u8 prevx = 0, currx = 0, nextx = 0;
ptrdiff_t x = 0;
const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16);
// perform vertical convolution
for ( ; x <= bwidth; x += 16)
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x);
uint8x16_t x1 = vld1q_u8(srow1 + x);
uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x);
// calculate values for plain CPU part below if needed
if (x + 16 >= bwidth)
{
ptrdiff_t x3 = x == width ? width - 1 : x;
ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
if (border == BORDER_MODE_CONSTANT && x4 < 0)
prevx = borderValue;
else
prevx = vop(srow1[x4],
vop(srow2 ? srow2[x4] : borderValue,
srow0 ? srow0[x4] : borderValue));
currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? srow0[x3] : borderValue));
}
// make shift
if (x)
{
tprev = tcurr;
tcurr = tnext;
}
// and calculate next value
tnext = vop(vop(x0, x1), x2);
// make extrapolation for the first elements
if (!x)
{
// make border
if (border == BORDER_MODE_CONSTANT)
tcurr = v_border;
else if (border == BORDER_MODE_REPLICATE)
tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0));
continue;
}
// combine 3 "shifted" vectors
t0 = vextq_u8(tprev, tcurr, 15);
t1 = tcurr;
t2 = vextq_u8(tcurr, tnext, 1);
// and add them
t0 = vop(t0, vop(t1, t2));
vst1q_u8(drow + x - 16, t0);
}
x -= 16;
if (x == width)
--x;
for ( ; x < width; ++x)
{
// make extrapolation for the last elements
if (x + 1 >= width)
{
if (border == BORDER_MODE_CONSTANT)
nextx = borderValue;
else if (border == BORDER_MODE_REPLICATE)
nextx = vop(srow2[x], vop(srow1[x], srow0[x]));
}
else
nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue,
srow0 ? srow0[x + 1] : borderValue),
srow1[x + 1]);
drow[x] = vop(prevx, vop(currx, nextx));
// make shift
prevx = currx;
currx = nextx;
}
}
}
} // namespace
#endif
void erode3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
morph3x3(size,
srcBase, srcStride,
dstBase, dstStride,
border, ErodeVecOp(border, borderValue));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
void dilate3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isMorph3x3Supported(size, border));
#ifdef CAROTENE_NEON
morph3x3(size,
srcBase, srcStride,
dstBase, dstStride,
border, DilateVecOp(border, borderValue));
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)border;
(void)borderValue;
#endif
}
#ifdef CAROTENE_NEON
namespace {
template<class VecUpdate>
void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize)
{
size_t i, j, k;
size_t width16 = (width & -16) * cn;
size_t width8 = (width & -8) * cn;
width *= cn;
if (ksize == 1)
{
for (i = 0; i < width; i++)
dst[i] = src[i];
return;
}
ksize = ksize*cn;
VecUpdate updateOp;
switch(cn)
{
case 1:
for (i = 0; i < width16; i += 16)
{
const u8* sptr = src + i;
uint8x16_t s = vld1q_u8(sptr);
internal::prefetch(sptr);
for( k = 1; k < ksize; ++k)
s = updateOp(s, vld1q_u8(sptr + k));
vst1q_u8(dst + i, s);
}
for (; i < width8; i += 8)
{
const u8* sptr = src + i;
uint8x8_t s = vld1_u8(sptr);
internal::prefetch(sptr);
for( k = 1; k < ksize; ++k)
s = updateOp(s, vld1_u8(sptr + k));
vst1_u8(dst + i, s);
}
break;
default:
for (i = 0; i < width16; i += 16)
{
uint8x16_t s = vld1q_u8(src + i);
internal::prefetch(src + i);
for (k = cn; k < ksize; k += cn)
s = updateOp(s, vld1q_u8(src + i + k));
vst1q_u8(dst + i, s);
}
for (; i < width8; i += 8)
{
uint8x8_t s = vld1_u8(src + i);
internal::prefetch(src + i);
for (k = cn; k < ksize; k += cn)
s = updateOp(s, vld1_u8(src + i + k));
vst1_u8(dst + i, s);
}
break;
}
ptrdiff_t i0 = i;
for( k = 0; k < (size_t)cn; k++, src++, dst++ )
{
for( i = i0; i <= width - cn*2; i += cn*2 )
{
const u8* s = src + i;
u8 m = s[cn];
for( j = cn*2; j < ksize; j += cn )
m = updateOp(m, s[j]);
dst[i] = updateOp(m, s[0]);
dst[i+cn] = updateOp(m, s[j]);
}
for( ; i < width; i += cn )
{
const u8* s = src + i;
u8 m = s[0];
for( j = cn; j < ksize; j += cn )
m = updateOp(m, s[j]);
dst[i] = m;
}
}
}
template<class VecUpdate>
void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize)
{
size_t i, k;
size_t width32 = width & -32;
VecUpdate updateOp;
uint8x16_t x0,x1,s0,s1;
if (ksize == 3)
{
for (; count > 1; count -= 2, dst += dststep * 2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
sptr = src[2] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
sptr = src[3] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
}
else if (ksize > 1)
for (; count > 1; count -= 2, dst += dststep*2, src += 2)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[1] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 2; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
sptr = src[0] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst+i, updateOp(s0, x0));
vst1q_u8(dst+i+16, updateOp(s1, x1));
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
vst1q_u8(dst + dststep + i, updateOp(s0, x0));
vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1));
}
for(; i < width; i++ )
{
u8 s = src[1][i];
for( k = 2; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = updateOp(s, src[0][i]);
dst[i+dststep] = updateOp(s, src[k][i]);
}
}
for (; count > 0; count--, dst += dststep, src++)
{
for (i = 0; i < width32; i += 32)
{
const u8* sptr = src[0] + i;
s0 = vld1q_u8(sptr);
s1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
for (k = 1; k < ksize; k++)
{
sptr = src[k] + i;
x0 = vld1q_u8(sptr);
x1 = vld1q_u8(sptr + 16);
internal::prefetch(sptr);
s0 = updateOp(s0, x0);
s1 = updateOp(s1, x1);
}
vst1q_u8(dst + i, s0);
vst1q_u8(dst + i + 16, s1);
}
for(; i < width; i++ )
{
u8 s = src[0][i];
for( k = 1; k < ksize; k++ )
s = updateOp(s, src[k][i]);
dst[i] = s;
}
}
}
template <class Op>
inline void morphology(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
//Temporary buffers common for all iterations
std::vector<u8> _srcRow(cn*(ssize.width + ksize.width - 1));
u8* srcRow = &_srcRow[0];
size_t bufRows = std::max<size_t>(ksize.height + 3, std::max<size_t>(anchorY, ksize.height-anchorY-1)*2+1);
std::vector<u8*> _rows(bufRows);
u8** rows = &_rows[0];
// adjust swidthcn so that the used part of buffers stays compact in memory
ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size)
std::vector<u8> _ringBuf(swidthcn*bufRows+16);
u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16);
size_t borderLength = std::max<size_t>(ksize.width - 1, 1) * cn;
std::vector<ptrdiff_t> _borderTab(borderLength);
ptrdiff_t * borderTab = &_borderTab[0];
std::vector<u8> _constBorderValue;
std::vector<u8> _constBorderRow;
u8 * constBorderValue = NULL;
u8 * constBorderRow = NULL;
if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT )
{
_constBorderValue.resize(borderLength);
constBorderValue = &_constBorderValue[0];
size_t i;
for(i = 0; i < cn; i++)
constBorderValue[i] = borderValues[i];
for(; i < borderLength; i++)
constBorderValue[i] = constBorderValue[i-cn];
if( columnBorderType == BORDER_MODE_CONSTANT )
{
_constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16));
constBorderRow = internal::alignPtr(&_constBorderRow[0], 16);
size_t N = (ssize.width + ksize.width - 1)*cn;
for( i = 0; i < N; i += borderLength )
{
size_t n = std::min( borderLength, N - i );
for(size_t j = 0; j < n; j++)
srcRow[i+j] = constBorderValue[j];
}
MorphRow<Op>(srcRow, constBorderRow, ssize.width, cn, ksize.width);
}
}
Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right,
ssize.height + borderMargin.top + borderMargin.bottom);
ptrdiff_t dx1 = std::max<ptrdiff_t>(anchorX - (ptrdiff_t)borderMargin.left, 0);
ptrdiff_t dx2 = std::max<ptrdiff_t>((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0);
// recompute border tables
if( dx1 > 0 || dx2 > 0 )
{
if( rowBorderType == BORDER_MODE_CONSTANT )
{
memcpy( srcRow, &constBorderValue[0], dx1*cn );
memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn );
}
else
{
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX) - borderMargin.left;
ptrdiff_t wholeWidth = wholeSize.width;
ptrdiff_t i, j;
for( i = 0; i < dx1; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[i*cn + j] = p0 + j;
}
for( i = 0; i < dx2; i++ )
{
ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn;
for( j = 0; j < (ptrdiff_t)cn; j++ )
borderTab[(i + dx1)*cn + j] = p0 + j;
}
}
}
ptrdiff_t startY, startY0, endY, rowCount;
startY = startY0 = std::max<ptrdiff_t>(borderMargin.top - anchorY, 0);
endY = std::min<ptrdiff_t>(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height);
const u8* src = srcBase + (startY - borderMargin.top)*srcStride;
u8* dst = dstBase;
ptrdiff_t width = ssize.width, kwidth = ksize.width;
ptrdiff_t kheight = ksize.height, ay = anchorY;
ptrdiff_t width1 = ssize.width + kwidth - 1;
ptrdiff_t xofs1 = std::min<ptrdiff_t>(borderMargin.left, anchorX);
bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT;
ptrdiff_t dy = 0, i = 0;
src -= xofs1*cn;
ptrdiff_t count = endY - startY;
rowCount = 0;
for(;; dst += dstStride*i, dy += i)
{
ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top;
dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
dcount = std::min(dcount, count);
count -= dcount;
for( ; dcount-- > 0; src += srcStride )
{
ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows;
u8* brow = ringBuf + bi*swidthcn;
if( (size_t)(++rowCount) > bufRows )
{
--rowCount;
++startY;
}
memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn );
if( makeBorder )
{
for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ )
srcRow[i] = src[borderTab[i]];
for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ )
srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]];
}
MorphRow<Op>(srcRow, brow, width, cn, ksize.width);
}
ptrdiff_t max_i = std::min<ptrdiff_t>(bufRows, ssize.height - dy + (kheight - 1));
for( i = 0; i < max_i; i++ )
{
ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay,
wholeSize.height, columnBorderType);
if( srcY < 0 ) // can happen only with constant border type
rows[i] = constBorderRow;
else
{
if( srcY >= startY + rowCount )
break;
ptrdiff_t bi = (srcY - startY0) % bufRows;
rows[i] = ringBuf + bi*swidthcn;
}
}
if( i < kheight )
break;
i -= kheight - 1;
MorphColumn<Op>((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height);
}
}
} // namespace
#endif // CAROTENE_NEON
void erode(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
morphology<ErodeVecOp>(ssize, cn, srcBase, srcStride, dstBase, dstStride,
ksize, anchorX, anchorY, rowBorderType, columnBorderType,
borderValues, borderMargin);
#else
(void)cn;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)rowBorderType;
(void)columnBorderType;
(void)borderValues;
(void)borderMargin;
#endif
}
void dilate(const Size2D &ssize, u32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase, ptrdiff_t dstStride,
const Size2D &ksize,
size_t anchorX, size_t anchorY,
BORDER_MODE rowBorderType, BORDER_MODE columnBorderType,
const u8 * borderValues, Margin borderMargin)
{
internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 &&
anchorX < ksize.width && anchorY < ksize.height);
#ifdef CAROTENE_NEON
morphology<DilateVecOp>(ssize, cn, srcBase, srcStride, dstBase, dstStride,
ksize, anchorX, anchorY, rowBorderType, columnBorderType,
borderValues, borderMargin);
#else
(void)cn;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)rowBorderType;
(void)columnBorderType;
(void)borderValues;
(void)borderMargin;
#endif
}
} // namespace CAROTENE_NS

1572
3rdparty/carotene/src/mul.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

1310
3rdparty/carotene/src/norm.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

539
3rdparty/carotene/src/opticalflow.cpp vendored Normal file
View File

@ -0,0 +1,539 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <float.h> // For FLT_EPSILON
namespace CAROTENE_NS {
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
/*
* Pyramidal Lucas-Kanade Optical Flow level processing
*/
void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
const u8 *prevData, ptrdiff_t prevStride,
const s16 *prevDerivData, ptrdiff_t prevDerivStride,
const u8 *nextData, ptrdiff_t nextStride,
u32 ptCount,
const f32 *prevPts, f32 *nextPts,
u8 *status, f32 *err,
const Size2D &winSize,
u32 terminationCount, f64 terminationEpsilon,
u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
f32 minEigThreshold)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
s32 cn2 = cn*2;
std::vector<s16> _buf(winSize.total()*(cn + cn2));
s16* IWinBuf = &_buf[0];
s32 IWinBufStride = winSize.width*cn;
s16* derivIWinBuf = &_buf[winSize.total()*cn];
s32 derivIWinBufStride = winSize.width*cn2;
for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
{
f32 levscale = (1./(1 << level));
u32 ptref = ptidx << 1;
f32 prevPtX = prevPts[ptref+0]*levscale;
f32 prevPtY = prevPts[ptref+1]*levscale;
f32 nextPtX;
f32 nextPtY;
if( level == maxLevel )
{
if( useInitialFlow )
{
nextPtX = nextPts[ptref+0]*levscale;
nextPtY = nextPts[ptref+1]*levscale;
}
else
{
nextPtX = prevPtX;
nextPtY = prevPtY;
}
}
else
{
nextPtX = nextPts[ptref+0]*2.f;
nextPtY = nextPts[ptref+1]*2.f;
}
nextPts[ptref+0] = nextPtX;
nextPts[ptref+1] = nextPtY;
s32 iprevPtX, iprevPtY;
s32 inextPtX, inextPtY;
prevPtX -= halfWinX;
prevPtY -= halfWinY;
iprevPtX = floor(prevPtX);
iprevPtY = floor(prevPtY);
if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
{
if( level == 0 )
{
if( status )
status[ptidx] = false;
if( err )
err[ptidx] = 0;
}
continue;
}
f32 a = prevPtX - iprevPtX;
f32 b = prevPtY - iprevPtY;
const s32 W_BITS = 14, W_BITS1 = 14;
const f32 FLT_SCALE = 1.f/(1 << 20);
s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
s32 dstep = prevDerivStride/sizeof(s16);
f32 A11 = 0, A12 = 0, A22 = 0;
int16x4_t viw00 = vmov_n_s16((s16)iw00);
int16x4_t viw01 = vmov_n_s16((s16)iw01);
int16x4_t viw10 = vmov_n_s16((s16)iw10);
int16x4_t viw11 = vmov_n_s16((s16)iw11);
float32x4_t vA11 = vmovq_n_f32(0);
float32x4_t vA12 = vmovq_n_f32(0);
float32x4_t vA22 = vmovq_n_f32(0);
s32 wwcn = winSize.width*cn;
// extract the patch from the first image, compute covariation matrix of derivatives
s32 x = 0;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;
s16* Iptr = IWinBuf + y*IWinBufStride;
s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
internal::prefetch(src + x + prevStride * 2, 0);
for(x = 0; x <= wwcn - 8; x += 8)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
}
for(; x <= wwcn - 4; x += 4)
{
uint8x8_t vsrc00 = vld1_u8(src + x);
uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
uint8x8_t vsrc01 = vld1_u8(src + x + cn);
uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);
int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));
int32x4_t vsuml1 = vmull_s16(vs00, viw00);
int32x4_t vsuml2 = vmull_s16(vs01, viw01);
vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
vst1_s16(Iptr + x, vsumnl);
}
internal::prefetch(dsrc + dstep * 2, 0);
for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
{
#if __GNUC_MINOR__ < 0
__asm__ (
"vld2.16 {d0-d1}, [%[dsrc00]] \n\t"
"vld2.16 {d2-d3}, [%[dsrc10]] \n\t"
"vld2.16 {d4-d5}, [%[dsrc01]] \n\t"
"vld2.16 {d6-d7}, [%[dsrc11]] \n\t"
"vmull.s16 q4, d3, %P[viw10] \n\t"
"vmull.s16 q5, d0, %P[viw00] \n\t"
"vmlal.s16 q4, d7, %P[viw11] \n\t"
"vmlal.s16 q5, d4, %P[viw01] \n\t"
"vmlal.s16 q4, d1, %P[viw00] \n\t"
"vmlal.s16 q5, d2, %P[viw10] \n\t"
"vmlal.s16 q4, d5, %P[viw01] \n\t"
"vmlal.s16 q5, d6, %P[viw11] \n\t"
"vrshrn.s32 d13, q4, %[W_BITS1] \n\t"
"vrshrn.s32 d12, q5, %[W_BITS1] \n\t"
"vmull.s16 q3, d13, d13 \n\t"
"vmull.s16 q4, d12, d12 \n\t"
"vmull.s16 q5, d13, d12 \n\t"
"vcvt.f32.s32 q3, q3 \n\t"
"vcvt.f32.s32 q4, q4 \n\t"
"vcvt.f32.s32 q5, q5 \n\t"
"vadd.f32 %q[vA22], q3 \n\t"
"vadd.f32 %q[vA11], q4 \n\t"
"vadd.f32 %q[vA12], q5 \n\t"
"vst2.16 {d12-d13}, [%[out]] \n\t"
: [vA22] "=w" (vA22),
[vA11] "=w" (vA11),
[vA12] "=w" (vA12)
: "0" (vA22),
"1" (vA11),
"2" (vA12),
[out] "r" (dIptr),
[dsrc00] "r" (dsrc),
[dsrc10] "r" (dsrc + dstep),
[dsrc01] "r" (dsrc + cn2),
[dsrc11] "r" (dsrc + dstep + cn2),
[viw00] "w" (viw00),
[viw10] "w" (viw10),
[viw01] "w" (viw01),
[viw11] "w" (viw11),
[W_BITS1] "I" (W_BITS1)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
);
#else
int16x4x2_t vdsrc00 = vld2_s16(dsrc);
int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);
int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);
vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);
vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);
vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);
int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);
int32x4_t va22i = vmull_s16(vsumny, vsumny);
int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
int32x4_t va12i = vmull_s16(vsumnx, vsumny);
float32x4_t va22f = vcvtq_f32_s32(va22i);
float32x4_t va11f = vcvtq_f32_s32(va11i);
float32x4_t va12f = vcvtq_f32_s32(va12i);
vA22 = vaddq_f32(vA22, va22f);
vA11 = vaddq_f32(vA11, va11f);
vA12 = vaddq_f32(vA12, va12f);
int16x4x2_t vsum;
vsum.val[0] = vsumnx;
vsum.val[1] = vsumny;
vst2_s16(dIptr, vsum);
#endif
}
for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
{
s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
dsrc[dstep+cn2+1]*iw11, W_BITS1);
Iptr[x] = (s16)ival;
dIptr[0] = (s16)ixval;
dIptr[1] = (s16)iyval;
A11 += (f32)(ixval*ixval);
A12 += (f32)(ixval*iyval);
A22 += (f32)(iyval*iyval);
}
}
f32 A11buf[2], A12buf[2], A22buf[2];
vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
A11 += A11buf[0] + A11buf[1];
A12 += A12buf[0] + A12buf[1];
A22 += A22buf[0] + A22buf[1];
A11 *= FLT_SCALE;
A12 *= FLT_SCALE;
A22 *= FLT_SCALE;
f32 D = A11*A22 - A12*A12;
f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
4.f*A12*A12))/(2*winSize.width*winSize.height);
if( err && getMinEigenVals )
err[ptidx] = (f32)minEig;
if( minEig < minEigThreshold || D < FLT_EPSILON )
{
if( level == 0 && status )
status[ptidx] = false;
continue;
}
D = 1.f/D;
nextPtX -= halfWinX;
nextPtY -= halfWinY;
f32 prevDeltaX = 0;
f32 prevDeltaY = 0;
for(u32 j = 0; j < terminationCount; j++ )
{
inextPtX = floor(nextPtX);
inextPtY = floor(nextPtY);
if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
{
if( level == 0 && status )
status[ptidx] = false;
break;
}
a = nextPtX - inextPtX;
b = nextPtY - inextPtY;
iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
iw01 = round(a*(1.f - b)*(1 << W_BITS));
iw10 = round((1.f - a)*b*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
f32 b1 = 0, b2 = 0;
viw00 = vmov_n_s16((s16)iw00);
viw01 = vmov_n_s16((s16)iw01);
viw10 = vmov_n_s16((s16)iw10);
viw11 = vmov_n_s16((s16)iw11);
float32x4_t vb1 = vmovq_n_f32(0);
float32x4_t vb2 = vmovq_n_f32(0);
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;
x = 0;
internal::prefetch(Jptr, nextStride * 2);
internal::prefetch(Iptr, IWinBufStride/2);
internal::prefetch(dIptr, derivIWinBufStride/2);
for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
{
uint8x8_t vj00 = vld1_u8(Jptr + x);
uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);
int16x8_t vI = vld1q_s16(Iptr + x);
int16x8x2_t vDerivI = vld2q_s16(dIptr);
int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));
int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);
vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);
int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);
int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);
int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));
float32x4_t vb1f = vcvtq_f32_s32(vb1i);
float32x4_t vb2f = vcvtq_f32_s32(vb2i);
vb1 = vaddq_f32(vb1, vb1f);
vb2 = vaddq_f32(vb2, vb2f);
}
for( ; x < wwcn; x++, dIptr += 2 )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
b1 += (f32)(diff*dIptr[0]);
b2 += (f32)(diff*dIptr[1]);
}
}
f32 bbuf[2];
float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
vst1_f32(bbuf, vb);
b1 += bbuf[0];
b2 += bbuf[1];
b1 *= FLT_SCALE;
b2 *= FLT_SCALE;
f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
f32 deltaY = (f32)((A12*b1 - A11*b2) * D);
nextPtX += deltaX;
nextPtY += deltaY;
nextPts[ptref+0] = nextPtX + halfWinX;
nextPts[ptref+1] = nextPtY + halfWinY;
if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
break;
if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
std::abs(deltaY + prevDeltaY) < 0.01 )
{
nextPts[ptref+0] -= deltaX*0.5f;
nextPts[ptref+1] -= deltaY*0.5f;
break;
}
prevDeltaX = deltaX;
prevDeltaY = deltaY;
}
if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
{
f32 nextPointX = nextPts[ptref+0] - halfWinX;
f32 nextPointY = nextPts[ptref+1] - halfWinY;
s32 inextPointX = floor(nextPointX);
s32 inextPointY = floor(nextPointY);
if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
{
if( status )
status[ptidx] = false;
continue;
}
f32 aa = nextPointX - inextPointX;
f32 bb = nextPointY - inextPointY;
iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
iw10 = round((1.f - aa)*bb*(1 << W_BITS));
iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
f32 errval = 0.f;
for(s32 y = 0; y < (s32)winSize.height; y++ )
{
const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
const s16* Iptr = IWinBuf + y*IWinBufStride;
for( x = 0; x < wwcn; x++ )
{
s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
W_BITS1-5) - Iptr[x];
errval += std::abs((f32)diff);
}
}
err[ptidx] = errval / (32*wwcn*winSize.height);
}
}
#else
(void)size;
(void)cn;
(void)prevData;
(void)prevStride;
(void)prevDerivData;
(void)prevDerivStride;
(void)nextData;
(void)nextStride;
(void)prevPts;
(void)nextPts;
(void)status;
(void)err;
(void)winSize;
(void)terminationCount;
(void)terminationEpsilon;
(void)level;
(void)maxLevel;
(void)useInitialFlow;
(void)getMinEigenVals;
(void)minEigThreshold;
(void)ptCount;
#endif
}
}//CAROTENE_NS

274
3rdparty/carotene/src/phase.cpp vendored Normal file
View File

@ -0,0 +1,274 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <cfloat>
#include <cmath>
#include "common.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
#define FASTATAN2CONST(scale) \
f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \
P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \
P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \
P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \
A_90((f32)(90.f * scale)), \
A_180((f32)(180.f * scale)), \
A_360((f32)(360.f * scale)); \
float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \
_90(vdupq_n_f32(A_90)), \
_180(vdupq_n_f32(A_180)), \
_360(vdupq_n_f32(A_360)), \
z(vdupq_n_f32(0.0f)), \
p1(vdupq_n_f32(P1)), \
p3(vdupq_n_f32(P3)), \
p5(vdupq_n_f32(P5)), \
p7(vdupq_n_f32(P7));
#define FASTATAN2SCALAR(y, x, a) \
{ \
f32 ax = std::abs(x), ay = std::abs(y); \
f32 c, c2; \
if (ax >= ay) \
{ \
c = ay / (ax + (float)DBL_EPSILON); \
c2 = c * c; \
a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
else \
{ \
c = ax / (ay + (float)DBL_EPSILON); \
c2 = c * c; \
a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \
} \
if (x < 0) \
a = A_180 - a; \
if (y < 0) \
a = A_360 - a; \
}
#define FASTATAN2VECTOR(v_y, v_x, a) \
{ \
float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \
float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \
float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \
float32x4_t c2 = vmulq_f32(c, c); \
a = vmulq_f32(c2, p7); \
\
a = vmulq_f32(vaddq_f32(a, p5), c2); \
a = vmulq_f32(vaddq_f32(a, p3), c2); \
a = vmulq_f32(vaddq_f32(a, p1), c); \
\
a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \
a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \
a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \
\
}
} // namespace
#endif
void phase(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
u8 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
FASTATAN2CONST(256.0f / 360.0f)
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
float32x4_t v_05 = vdupq_n_f32(0.5f);
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
// 0
float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4_t v_dst32f0;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
float32x4_t v_dst32f1;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
// 1
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
vmovn_u16(v_dst16s1)));
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vld1q_s16(src0 + j);
int16x8_t v_src1 = vld1q_s16(src1 + j);
float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
float32x4_t v_dst32f0;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
float32x4_t v_dst32f1;
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
vst1_u8(dst + j, vmovn_u16(v_dst));
}
for (; j < size.width; j++)
{
f32 x = src0[j], y = src1[j];
f32 a;
FASTATAN2SCALAR(y, x, a)
dst[j] = (u8)(s32)floor(a + 0.5f);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void phase(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 * dstBase, ptrdiff_t dstStride,
f32 scale)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
FASTATAN2CONST(scale)
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw8; j += 8)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4);
float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4);
float32x4_t v_dst32f;
// 0
FASTATAN2VECTOR(v_src10, v_src00, v_dst32f)
vst1q_f32(dst + j, v_dst32f);
// 1
FASTATAN2VECTOR(v_src11, v_src01, v_dst32f)
vst1q_f32(dst + j + 4, v_dst32f);
}
if(j + 4 <= size.width)
{
float32x4_t v_src0 = vld1q_f32(src0 + j);
float32x4_t v_src1 = vld1q_f32(src1 + j);
float32x4_t v_dst32f;
FASTATAN2VECTOR(v_src1, v_src0, v_dst32f)
vst1q_f32(dst + j, v_dst32f);
j += 4;
}
for (; j < size.width; j++)
{
f32 a;
FASTATAN2SCALAR(src1[j], src0[j], a)
dst[j] = a;
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)scale;
#endif
}
} // namespace CAROTENE_NS

1414
3rdparty/carotene/src/pyramid.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

460
3rdparty/carotene/src/reduce.cpp vendored Normal file
View File

@ -0,0 +1,460 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <cstring>
namespace CAROTENE_NS {
void reduceColSum(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memset(dstBase, 0, size.width*sizeof(s32));
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const u8* src_address = srcBase + i;
int32x4_t sll = vmovq_n_s32(0);
int32x4_t slh = vmovq_n_s32(0);
int32x4_t shl = vmovq_n_s32(0);
int32x4_t shh = vmovq_n_s32(0);
for (size_t h = 0; h < size.height; h += 256)
{
size_t lim = std::min(h + 256, size.height);
uint16x8_t sl = vmovq_n_u16(0);
uint16x8_t sh = vmovq_n_u16(0);
for (size_t k = h; k < lim; ++k, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
uint8x16_t v = vld1q_u8(src_address);
sl = vaddw_u8(sl, vget_low_u8(v));
sh = vaddw_u8(sh, vget_high_u8(v));
}
int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl)));
int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl)));
int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh)));
int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh)));
sll = vqaddq_s32(sll, vsll);
slh = vqaddq_s32(slh, vslh);
shl = vqaddq_s32(shl, vshl);
shh = vqaddq_s32(shh, vshh);
}
vst1q_s32(dstBase + i + 0, sll);
vst1q_s32(dstBase + i + 4, slh);
vst1q_s32(dstBase + i + 8, shl);
vst1q_s32(dstBase + i + 12, shh);
}
for(size_t h = 0; h < size.height; ++h)
{
for(size_t j = i ; j < size.width; j++ )
{
if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu)
dstBase[j] = 0x7fFFffFF;
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMax(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width);
size_t i = 0;
for (; i + 16*4 <= size.width; i += 16*4)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address + 0);
uint8x16_t s2 = vld1q_u8(src_address + 16);
uint8x16_t s3 = vld1q_u8(src_address + 32);
uint8x16_t s4 = vld1q_u8(src_address + 48);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
internal::prefetch(src_address + srcStride, 32);
uint8x16_t v1 = vld1q_u8(src_address + 0);
uint8x16_t v2 = vld1q_u8(src_address + 16);
uint8x16_t v3 = vld1q_u8(src_address + 32);
uint8x16_t v4 = vld1q_u8(src_address + 48);
s1 = vmaxq_u8(s1, v1);
s2 = vmaxq_u8(s2, v2);
s3 = vmaxq_u8(s3, v3);
s4 = vmaxq_u8(s4, v4);
}
vst1q_u8(dstBase + i + 0, s1);
vst1q_u8(dstBase + i + 16, s2);
vst1q_u8(dstBase + i + 32, s3);
vst1q_u8(dstBase + i + 48, s4);
}
for (; i + 16 <= size.width; i += 16)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
uint8x16_t v1 = vld1q_u8(src_address);
s1 = vmaxq_u8(s1, v1);
}
vst1q_u8(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMin(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
u8 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width);
size_t i = 0;
for (; i + 16*4 <= size.width; i += 16*4)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address + 0);
uint8x16_t s2 = vld1q_u8(src_address + 16);
uint8x16_t s3 = vld1q_u8(src_address + 32);
uint8x16_t s4 = vld1q_u8(src_address + 48);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
internal::prefetch(src_address + srcStride, 32);
uint8x16_t v1 = vld1q_u8(src_address + 0);
uint8x16_t v2 = vld1q_u8(src_address + 16);
uint8x16_t v3 = vld1q_u8(src_address + 32);
uint8x16_t v4 = vld1q_u8(src_address + 48);
s1 = vminq_u8(s1, v1);
s2 = vminq_u8(s2, v2);
s3 = vminq_u8(s3, v3);
s4 = vminq_u8(s4, v4);
}
vst1q_u8(dstBase + i + 0, s1);
vst1q_u8(dstBase + i + 16, s2);
vst1q_u8(dstBase + i + 32, s3);
vst1q_u8(dstBase + i + 48, s4);
}
for (; i + 16 <= size.width; i += 16)
{
const u8* src_address = srcBase + i;
uint8x16_t s1 = vld1q_u8(src_address);
src_address += srcStride;
for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
{
internal::prefetch(src_address + srcStride, 0);
uint8x16_t v1 = vld1q_u8(src_address);
s1 = vminq_u8(s1, v1);
}
vst1q_u8(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColSum(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width*sizeof(f32));
size_t srcstep = srcStride/sizeof(f32);
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address + 0);
float32x4_t s2 = vld1q_f32(src_address + 4);
float32x4_t s3 = vld1q_f32(src_address + 8);
float32x4_t s4 = vld1q_f32(src_address + 12);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
internal::prefetch(src_address + srcstep, 32);
float32x4_t v1 = vld1q_f32(src_address + 0);
float32x4_t v2 = vld1q_f32(src_address + 4);
float32x4_t v3 = vld1q_f32(src_address + 8);
float32x4_t v4 = vld1q_f32(src_address + 12);
s1 = vaddq_f32(s1, v1);
s2 = vaddq_f32(s2, v2);
s3 = vaddq_f32(s3, v3);
s4 = vaddq_f32(s4, v4);
}
vst1q_f32(dstBase + i + 0, s1);
vst1q_f32(dstBase + i + 4, s2);
vst1q_f32(dstBase + i + 8, s3);
vst1q_f32(dstBase + i + 12, s4);
}
for (; i + 4 <= size.width; i += 4)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
float32x4_t v1 = vld1q_f32(src_address);
s1 = vaddq_f32(s1, v1);
}
vst1q_f32(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
{
for(size_t j = i ; j < size.width; j++ )
{
dstBase[j] += srcBase[j + srcstep * h];
}
}
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMax(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width*sizeof(f32));
size_t srcstep = srcStride/sizeof(f32);
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address + 0);
float32x4_t s2 = vld1q_f32(src_address + 4);
float32x4_t s3 = vld1q_f32(src_address + 8);
float32x4_t s4 = vld1q_f32(src_address + 12);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
internal::prefetch(src_address + srcstep, 32);
float32x4_t v1 = vld1q_f32(src_address + 0);
float32x4_t v2 = vld1q_f32(src_address + 4);
float32x4_t v3 = vld1q_f32(src_address + 8);
float32x4_t v4 = vld1q_f32(src_address + 12);
s1 = vmaxq_f32(s1, v1);
s2 = vmaxq_f32(s2, v2);
s3 = vmaxq_f32(s3, v3);
s4 = vmaxq_f32(s4, v4);
}
vst1q_f32(dstBase + i + 0, s1);
vst1q_f32(dstBase + i + 4, s2);
vst1q_f32(dstBase + i + 8, s3);
vst1q_f32(dstBase + i + 12, s4);
}
for (; i + 4 <= size.width; i += 4)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
float32x4_t v1 = vld1q_f32(src_address);
s1 = vmaxq_f32(s1, v1);
}
vst1q_f32(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
void reduceColMin(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
memcpy(dstBase, srcBase, size.width*sizeof(f32));
size_t srcstep = srcStride/sizeof(f32);
size_t i = 0;
for (; i + 16 <= size.width; i += 16)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address + 0);
float32x4_t s2 = vld1q_f32(src_address + 4);
float32x4_t s3 = vld1q_f32(src_address + 8);
float32x4_t s4 = vld1q_f32(src_address + 12);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
internal::prefetch(src_address + srcstep, 32);
float32x4_t v1 = vld1q_f32(src_address + 0);
float32x4_t v2 = vld1q_f32(src_address + 4);
float32x4_t v3 = vld1q_f32(src_address + 8);
float32x4_t v4 = vld1q_f32(src_address + 12);
s1 = vminq_f32(s1, v1);
s2 = vminq_f32(s2, v2);
s3 = vminq_f32(s3, v3);
s4 = vminq_f32(s4, v4);
}
vst1q_f32(dstBase + i + 0, s1);
vst1q_f32(dstBase + i + 4, s2);
vst1q_f32(dstBase + i + 8, s3);
vst1q_f32(dstBase + i + 12, s4);
}
for (; i + 4 <= size.width; i += 4)
{
const f32* src_address = srcBase + i;
float32x4_t s1 = vld1q_f32(src_address);
src_address += srcstep;
for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
{
internal::prefetch(src_address + srcstep, 0);
float32x4_t v1 = vld1q_f32(src_address);
s1 = vminq_f32(s1, v1);
}
vst1q_f32(dstBase + i, s1);
}
if (i < size.width)
for(size_t h = 1; h < size.height; ++h)
for(size_t j = i ; j < size.width; j++ )
dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]);
#else
(void)size;
(void)srcBase;
(void)srcStride;
(void)dstBase;
#endif
}
} // namespace CAROTENE_NS

694
3rdparty/carotene/src/remap.cpp vendored Normal file
View File

@ -0,0 +1,694 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace internal {
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
dst_row[x] = srcBase[map_row[x]];
}
}
}
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
for (size_t x = 0; x < size.width; ++x)
{
s32 src_idx = map_row[x];
dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue;
}
}
}
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for ( ; x + 8 < size.width; x += 8)
{
int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6);
v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7);
int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6);
v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7);
int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 14]], v_src10, 3);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6);
v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7);
int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6);
v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7);
// first part
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
for ( ; x < size.width; ++x)
{
s32 src00_index = map_row[(x << 2)];
s32 src10_index = map_row[(x << 2) + 2];
f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] +
srcBase[src00_index];
f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] +
srcBase[src10_index];
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue)
{
int16x8_t v_zero16 = vdupq_n_s16(0);
for (size_t y = 0; y < size.height; ++y)
{
const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y);
const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y);
u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for ( ; x + 8 < size.width; x += 8)
{
int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6);
v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7);
int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6);
v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7);
int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6);
v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7);
int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6);
v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7);
// first part
float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1));
float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01),
vget_low_s16(v_src00))), v_coeff.val[0]);
float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11),
vget_low_s16(v_src10))), v_coeff.val[0]);
float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst));
// second part
v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
v_coeff = vld2q_f32(coeff_row + (x << 1) + 8);
v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01),
vget_high_s16(v_src00))), v_coeff.val[0]);
v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11),
vget_high_s16(v_src10))), v_coeff.val[0]);
v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]);
uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst));
// store
vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1)));
}
for ( ; x < size.width; ++x)
{
s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue;
s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue;
s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue;
s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue;
f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00;
f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10;
dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0);
}
}
}
} // namespace internal
#endif // CAROTENE_NEON
bool isRemapNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isRemapLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
int32x2_t v_step2 = vdup_n_s32(srcStride);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
int32x2_t v_zero2 = vdup_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0])));
v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1])));
v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0])));
int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1])));
int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4);
vst1q_s32(map_row + x, v_dst_index);
}
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0])));
int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1])));
int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2);
vst1_s32(map_row + x, v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
s32 src_x = std::max(0, std::min<s32>(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0])));
s32 src_y = std::max(0, std::min<s32>(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1])));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
int32x4_t v_m1_4 = vdupq_n_s32(-1);
int32x2_t v_m1_2 = vdup_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
float32x2_t v_zero2 = vdup_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0;
for ( ; x + 8 <= blockWidth; x += 8)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)),
v_table1 = vld2q_f32(table_row + (x << 1) + 8);
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
v_dst_x = vcvtq_s32_f32(v_table1.val[0]);
v_dst_y = vcvtq_s32_f32(v_table1.val[1]);
v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x + 4, v_dst_index);
}
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1));
int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]);
int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)),
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4)));
int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_dst_index);
}
for ( ; x + 2 <= blockWidth; x += 2)
{
float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1));
int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]);
int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]);
uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)),
vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2)));
int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2);
vst1_s32(map_row + x, v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
s32 src_x = (s32)floorf(table_row[(x << 1) + 0]);
s32 src_y = (s32)floorf(table_row[(x << 1) + 1]);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
void remapLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * tableBase, ptrdiff_t tableStride,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isRemapLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[x << 1] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1);
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0;
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4x2_t v_table = vld2q_f32(table_row + (x << 1));
int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
}
for ( ; x < blockWidth; ++x)
{
f32 src_x_f = table_row[(x << 1) + 0];
f32 src_y_f = table_row[(x << 1) + 1];
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1)] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)tableBase;
(void)tableStride;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

85
3rdparty/carotene/src/remap.hpp vendored Normal file
View File

@ -0,0 +1,85 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_REMAP_HPP
#define CAROTENE_SRC_REMAP_HPP
#include "common.hpp"
#include <cmath>
#ifdef CAROTENE_NEON
namespace CAROTENE_NS { namespace internal {
enum
{
BLOCK_SIZE = 32
};
void remapNearestNeighborReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride);
void remapNearestNeighborConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
void remapLinearReplicate(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride);
void remapLinearConst(const Size2D size,
const u8 * srcBase,
const s32 * map,
const f32 * coeffs,
u8 * dstBase, ptrdiff_t dstStride,
u8 borderValue);
} }
#endif // CAROTENE_NEON
#endif // CAROTENE_SRC_REMAP_HPP

2191
3rdparty/carotene/src/resize.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

199
3rdparty/carotene/src/saturate_cast.hpp vendored Normal file
View File

@ -0,0 +1,199 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SATURATE_CAST_HPP
#define CAROTENE_SATURATE_CAST_HPP
#include <algorithm>
#include <climits>
#include <cmath>
#if defined _MSC_VER && defined _M_ARM
# include <intrin.h>
#endif
#include <carotene/definitions.hpp>
#include <carotene/types.hpp>
namespace CAROTENE_NS { namespace internal {
#if defined _MSC_VER && defined _M_ARM
__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
{
(void)d;
__emit(0xEEBD); // vcvtr.s32.f64 s0, d0
__emit(0x0B40);
__emit(0xEE10); // vmov r0, s0
__emit(0x0A10);
__emit(0x4770); // bx lr
}
# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)x);
# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
#elif defined CV_ICC || defined __GNUC__
# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
# define CAROTENE_ROUND_FLT(value) { \
register union { f32 f; s32 i; } result; \
asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
return result.i; }
# define CAROTENE_ROUND_DBL(value) { \
register union {f32 f; s32 i;} __tegra_result; \
asm ( \
"ftosid %0, %P1\n" \
: "=w" (__tegra_result.f) \
: "w" (value) \
); \
return __tegra_result.i; \
}
# else
# define CAROTENE_ROUND_FLT(x) return (s32)lrintf(value);
# define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
# endif
#endif
inline s32 round(f32 value)
{
#ifdef CAROTENE_ROUND_FLT
CAROTENE_ROUND_FLT(value)
#else
s32 intpart = (s32)(value);
f32 fractpart = value - intpart;
if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
return (s32)(value + (value >= 0 ? 0.5 : -0.5));
else
return intpart;
#endif
}
inline s32 round(f64 value)
{
#ifdef CAROTENE_ROUND_DBL
CAROTENE_ROUND_DBL(value)
#else
s32 intpart = (s32)(value);
f64 fractpart = value - intpart;
if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
return (s32)(value + (value >= 0 ? 0.5 : -0.5));
else
return intpart;
#endif
}
/////////////// saturate_cast (used in image & signal processing) ///////////////////
template<typename _Tp> inline _Tp saturate_cast(u8 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s8 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }
template<> inline u8 saturate_cast<u8>(s8 v) { return (u8)std::max((s32)v, 0); }
template<> inline u8 saturate_cast<u8>(u16 v) { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(s32 v) { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline u8 saturate_cast<u8>(s16 v) { return saturate_cast<u8>((s32)v); }
template<> inline u8 saturate_cast<u8>(u32 v) { return (u8)std::min(v, (u32)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(s64 v) { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
template<> inline u8 saturate_cast<u8>(u64 v) { return (u8)std::min(v, (u64)UCHAR_MAX); }
template<> inline u8 saturate_cast<u8>(f32 v) { return saturate_cast<u8>(round(v)); }
template<> inline u8 saturate_cast<u8>(f64 v) { return saturate_cast<u8>(round(v)); }
template<> inline s8 saturate_cast<s8>(u8 v) { return (s8)std::min((s32)v, SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)((u32)(v-SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)((u64)(v-SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }
template<> inline u16 saturate_cast<u16>(s8 v) { return (u16)std::max((s32)v, 0); }
template<> inline u16 saturate_cast<u16>(s16 v) { return (u16)std::max((s32)v, 0); }
template<> inline u16 saturate_cast<u16>(s32 v) { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline u16 saturate_cast<u16>(u32 v) { return (u16)std::min(v, (u32)USHRT_MAX); }
template<> inline u16 saturate_cast<u16>(s64 v) { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
template<> inline u16 saturate_cast<u16>(u64 v) { return (u16)std::min(v, (u64)USHRT_MAX); }
template<> inline u16 saturate_cast<u16>(f32 v) { return saturate_cast<u16>(round(v)); }
template<> inline u16 saturate_cast<u16>(f64 v) { return saturate_cast<u16>(round(v)); }
template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)((u32)(v - SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)((u64)(v - SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }
template<> inline u32 saturate_cast<u32>(s8 v) { return (u32)std::max(v, (s8)0); }
template<> inline u32 saturate_cast<u32>(s16 v) { return (u32)std::max(v, (s16)0); }
template<> inline u32 saturate_cast<u32>(s32 v) { return (u32)std::max(v, (s32)0); }
template<> inline u32 saturate_cast<u32>(s64 v) { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
template<> inline u32 saturate_cast<u32>(u64 v) { return (u32)std::min(v, (u64)UINT_MAX); }
//OpenCV like f32/f64 -> u32 conversion
//we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
template<> inline u32 saturate_cast<u32>(f32 v) { return round(v); }
template<> inline u32 saturate_cast<u32>(f64 v) { return round(v); }
//Negative clipping implementation
//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }
template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)((u64)(v - INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }
template<> inline u64 saturate_cast<u64>(s8 v) { return (u64)std::max(v, (s8)0); }
template<> inline u64 saturate_cast<u64>(s16 v) { return (u64)std::max(v, (s16)0); }
template<> inline u64 saturate_cast<u64>(s32 v) { return (u64)std::max(v, (s32)0); }
template<> inline u64 saturate_cast<u64>(s64 v) { return (u64)std::max(v, (s64)0); }
template<> inline s64 saturate_cast<s64>(u64 v) { return (s64)std::min(v, (u64)LLONG_MAX); }
} }
#endif

219
3rdparty/carotene/src/scharr.cpp vendored Normal file
View File

@ -0,0 +1,219 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
return (dx == 0 && dy == 1 &&
isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
(dx == 1 && dy == 0 &&
isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
}
void Scharr3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
s32 dx, s32 dy,
BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
static s16 dw[] = {3, 10, 3};
if (dy == 1)
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
3, 1, dw, 0,
border, borderValue, borderMargin);
else
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
1, 3, 0, dw,
border, borderValue, borderMargin);
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)borderValue;
#endif
}
void ScharrDeriv(const Size2D &size, s32 cn,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t colsn = size.width*cn;
size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
std::vector<s16> _tempBuf((delta << 1) + 64);
s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
int16x8_t vc3 = vmovq_n_s16(3);
int16x8_t vc10 = vmovq_n_s16(10);
uint8x8_t v8c10 = vmov_n_u8(10);
for(size_t y = 0; y < size.height; y++ )
{
const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
s16* drow = internal::getRowPtr(dstBase, dstStride, y);
// do vertical convolution
size_t x = 0;
for( ; x < roiw8; x += 8 )
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
#if __GNUC_MINOR__ < 7
__asm__ (
"vld1.8 {d0}, [%[src0]] \n\t"
"vld1.8 {d2}, [%[src2]] \n\t"
"vld1.8 {d1}, [%[src1]] \n\t"
"vaddl.u8 q2, d2, d0 \n\t"
"vmull.u8 q3, d1, %[vc10] \n\t"
"vsubl.u8 q4, d2, d0 \n\t"
"vmla.s16 q3, q2, %q[vc3] \n\t"
"vst1.16 {d8-d9}, [%[out1],:128] \n\t"
"vst1.16 {d6-d7}, [%[out0],:128] \n\t"
:
: [out0] "r" (trow0 + x),
[out1] "r" (trow1 + x),
[src0] "r" (srow0 + x),
[src1] "r" (srow1 + x),
[src2] "r" (srow2 + x),
[vc10] "w" (v8c10), [vc3] "w" (vc3)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
);
#else
uint8x8_t s0 = vld1_u8(srow0 + x);
uint8x8_t s1 = vld1_u8(srow1 + x);
uint8x8_t s2 = vld1_u8(srow2 + x);
int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
vst1q_s16(trow1 + x, t1);
vst1q_s16(trow0 + x, t0);
#endif
}
for( ; x < colsn; x++ )
{
trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
trow1[x] = (s16)(srow2[x] - srow0[x]);
}
// make border
size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
for( s32 k = 0; k < cn; k++ )
{
trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
}
// do horizontal convolution, interleave the results and store them to dst
x = 0;
for( ; x < roiw8; x += 8 )
{
#if __GNUC_MINOR__ < 6
__asm__ (
"vld1.16 {d4-d5}, [%[s2ptr]] \n\t"
"vld1.16 {d8-d9}, [%[s4ptr]] \n\t"
"vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t"
"vld1.16 {d0-d1}, [%[s0ptr]] \n\t"
"vld1.16 {d2-d3}, [%[s1ptr]] \n\t"
"vadd.i16 q7, q2, q4 \n\t"
"vmul.s16 q6, q3, %q[vc10] \n\t"
"vsub.s16 q5, q1, q0 \n\t"
"vmla.s16 q6, q7, %q[vc3] \n\t"
"vst2.16 {d10-d13}, [%[out]] \n\t"
:
: [out] "r" (drow + x * 2),
[s0ptr] "r" (trow0 + x - cn),
[s1ptr] "r" (trow0 + x + cn),
[s2ptr] "r" (trow1 + x - cn),
[s3ptr] "r" (trow1 + x),
[s4ptr] "r" (trow1 + x + cn),
[vc10] "w" (vc10), [vc3] "w" (vc3)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
);
#else
int16x8_t s0 = vld1q_s16(trow0 + x - cn);
int16x8_t s1 = vld1q_s16(trow0 + x + cn);
int16x8_t s2 = vld1q_s16(trow1 + x - cn);
int16x8_t s3 = vld1q_s16(trow1 + x);
int16x8_t s4 = vld1q_s16(trow1 + x + cn);
int16x8_t s3x10 = vmulq_s16(s3, vc10);
int16x8_t s24 = vaddq_s16(s2, s4);
int16x8x2_t vr;
vr.val[0] = vsubq_s16(s1, s0);
vr.val[1] = vmlaq_s16(s3x10, s24, vc3);
vst2q_s16(drow + x*2, vr);
#endif //__GNUC_MINOR__ < 6
}
for( ; x < colsn; x++ )
{
drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
}
}
#else
(void)size;
(void)cn;
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,109 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "separable_filter.hpp"
namespace CAROTENE_NS {
bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
{
return isSupportedConfiguration() &&
size.width >= 9 && size.height >= 1 &&
(size.height + borderMargin.top + borderMargin.bottom) >= 2 &&
(dx >= 0) && (dx < 4) && (dy >= 0) && (dy < 4) &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REFLECT ||
border == BORDER_MODE_REFLECT101 ||
border == BORDER_MODE_REPLICATE );
}
void SeparableFilter3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
#ifdef CAROTENE_NEON
if(!((xw || rowFilter < 3) && (yw || colFilter < 3)))
std::abort();//Couldn't call generic filter without provided weights
typedef void (*sepFilter3x3_8u16s_func)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
const s16*, const s16*, BORDER_MODE, u8, Margin);
static sepFilter3x3_8u16s_func quickFilters[4][4]=
{
/*d0y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_121>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_121>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_121>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process},
/*dy */{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_m101>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_m101>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_m101>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process},
/*d2y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16_1m21>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16_1m21>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16_1m21>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process},
/*dNy*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3S16Generic>::process,
/*dx*/ internal::sepFilter3x3<internal::RowFilter3x3S16_m101, internal::ColFilter3x3S16Generic>::process,
/*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21, internal::ColFilter3x3S16Generic>::process,
/*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process}
};
quickFilters[colFilter][rowFilter](size, srcBase, srcStride, dstBase, dstStride,
xw, yw, border, borderValue, borderMargin);
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)xw;
(void)yw;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

1161
3rdparty/carotene/src/separable_filter.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

317
3rdparty/carotene/src/sobel.cpp vendored Normal file
View File

@ -0,0 +1,317 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include <vector>
#include "common.hpp"
namespace CAROTENE_NS {
bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
s32 dx, s32 dy, Margin borderMargin)
{
return dx < 3 && dx >= 0 &&
dy < 3 && dy >= 0 &&
(dx + dy) > 0 &&
isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
}
void Sobel3x3(const Size2D &size,
const u8 * srcBase, ptrdiff_t srcStride,
s16 * dstBase, ptrdiff_t dstStride,
s32 dx, s32 dy,
BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin));
#ifdef CAROTENE_NEON
SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
dx, dy, 0, 0,
borderType, borderValue, borderMargin);
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)borderValue;
#endif
}
bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
s32 dx, s32 dy)
{
return isSupportedConfiguration() &&
dx < 3 && dx >= 0 &&
dy < 3 && dy >= 0 &&
(dx + dy) > 0 &&
size.width >= 4 && size.height >= 2 &&
(border == BORDER_MODE_CONSTANT ||
border == BORDER_MODE_REFLECT ||
border == BORDER_MODE_REFLECT101 ||
border == BORDER_MODE_REPLICATE );
}
void Sobel3x3(const Size2D &size,
const f32 * srcBase, ptrdiff_t srcStride,
f32 * dstBase, ptrdiff_t dstStride,
s32 dx, s32 dy,
BORDER_MODE borderType, f32 borderValue)
{
internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
#ifdef CAROTENE_NEON
std::vector<f32> _tmp;
f32 *tmp = 0;
if (borderType == BORDER_MODE_CONSTANT)
{
_tmp.assign(size.width + 2, borderValue);
tmp = &_tmp[1];
}
ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
std::vector<f32> _tempBuf((delta << 1) + 64);
f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);
for( size_t y = 0; y < size.height; y++ )
{
const f32* srow0;
const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
const f32* srow2;
f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);
f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);
if (borderType == BORDER_MODE_REFLECT101) {
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
} else if (borderType == BORDER_MODE_CONSTANT) {
srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
} else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
}
float32x4_t tprev = vmovq_n_f32(0.f);
float32x4_t tcurr = vmovq_n_f32(0.f);
float32x4_t tnext = vmovq_n_f32(0.f);
float32x4_t t0, t1, t2;
// do vertical convolution
size_t x = 0, bcolsn = y + 2 < size.height ? size.width : (size.width - 4);
for( ; x <= bcolsn; x += 4 )
{
internal::prefetch(srow0 + x);
internal::prefetch(srow1 + x);
internal::prefetch(srow2 + x);
float32x4_t x0 = vld1q_f32(srow0 + x);
float32x4_t x1 = vld1q_f32(srow1 + x);
float32x4_t x2 = vld1q_f32(srow2 + x);
tprev = tcurr;
tcurr = tnext;
if(!dy)
{
tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0);
}
else if(dy == 2)
{
tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0));
}
else
{
tnext = vsubq_f32(x2, x0);
}
if(!x) {
tcurr = tnext;
// make border
if (borderType == BORDER_MODE_CONSTANT)
{
tcurr = vsetq_lane_f32(borderValue,tcurr, 3);
}
else if (borderType == BORDER_MODE_REFLECT101)
{
tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3);
}
else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
{
tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3);
}
continue;
}
internal::prefetch(trow0 + x);
internal::prefetch(trow1 + x);
t0 = vextq_f32(tprev, tcurr, 3);
t1 = tcurr;
t2 = vextq_f32(tcurr, tnext, 1);
if(!dx)
{
t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2));
}
else if(dx == 2)
{
t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0));
}
else
{
t0 = vsubq_f32(t2, t0);
}
if(!(y%2))
{
vst1q_f32(trow0 + x - 4, t0);
}
else
{
vst1q_f32(trow1 + x - 4, t0);
}
}
x -= 4;
if(x == size.width){
x--;
}
f32 prevx = 0, rowx = 0, nextx = 0;
if(!dy)
{
prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] :
(borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] :
(borderType == BORDER_MODE_CONSTANT ? 4*borderValue :
srow2[0] + 2*srow1[0] + srow0[0]) );
rowx = srow2[x] + 2*srow1[x] + srow0[x];
}
else if(dy == 2)
{
prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] :
(borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] :
(borderType == BORDER_MODE_CONSTANT ? 0.f :
srow2[0] - 2*srow1[0] + srow0[0]) );
rowx = srow2[x] - 2*srow1[x] + srow0[x];
}
else
{
prevx = x > 0 ? srow2[x-1] - srow0[x-1] :
(borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] :
(borderType == BORDER_MODE_CONSTANT ? 0.f :
srow2[0] - srow0[0]) );
rowx = srow2[x] - srow0[x];
}
for( ; x < size.width; x++ )
{
if(x+1 == size.width) {
// make border
if (borderType == BORDER_MODE_CONSTANT)
{
if(!dy) {
nextx = 4*borderValue;
} else {
nextx = 0.f;
}
} else if (borderType == BORDER_MODE_REFLECT101)
{
if(!dy) {
nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1];
} else if(dy == 2) {
nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1];
} else {
nextx = srow2[x-1] - srow0[x-1];
}
} else {
if(!dy) {
nextx = srow2[x] + 2*srow1[x] + srow0[x];
} else if(dy == 2) {
nextx = srow2[x] - 2*srow1[x] + srow0[x];
} else {
nextx = srow2[x] - srow0[x];
}
}
} else {
if(!dy) {
nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1];
} else if(dy == 2) {
nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1];
} else {
nextx = srow2[x+1] - srow0[x+1];
}
}
f32 res;
if(dx==1) {
res = nextx - prevx;
} else if(!dx) {
res = prevx + 2*rowx + nextx;
} else {
res = prevx - 2*rowx + nextx;
}
if(!(y%2)) {
*(trow0+x) = res;
} else {
*(trow1+x) = res;
}
prevx = rowx;
rowx = nextx;
}
if(y>0) {
for(size_t x1 = 0; x1 < size.width; x1++ )
{
if(y%2)
*(drow + x1) = trow0[x1];
else
*(drow + x1) = trow1[x1];
}
}
if(y == size.height-1) {
for(size_t x1 = 0; x1 < size.width; x1++ )
{
if(!(y%2))
*(drow1 + x1) = trow0[x1];
else
*(drow1 + x1) = trow1[x1];
}
}
}
#else
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

621
3rdparty/carotene/src/sub.cpp vendored Normal file
View File

@ -0,0 +1,621 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
#ifdef CAROTENE_NEON
namespace {
template <typename T, typename WT>
struct SubWrap
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = (T)((WT)src0[0] - (WT)src1[0]);
}
};
template <typename T, typename WT>
struct SubSaturate
{
typedef T type;
void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
const typename internal::VecTraits<T>::vec128 & v_src1,
typename internal::VecTraits<T>::vec128 & v_dst) const
{
v_dst = internal::vqsubq(v_src0, v_src1);
}
void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
const typename internal::VecTraits<T>::vec64 & v_src1,
typename internal::VecTraits<T>::vec64 & v_dst) const
{
v_dst = internal::vqsub(v_src0, v_src1);
}
void operator() (const T * src0, const T * src1, T * dst) const
{
dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]);
}
};
} // namespace
#endif
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
u8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<u8, s16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<u8, s16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
u16 * dstu16 = internal::getRowPtr((u16 *)dstBase, dstStride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
vst1q_u16(dstu16 + j, vsubl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10)));
vst1q_u16(dstu16 + j + 8, vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
vst1q_u16(dstu16 + j + 16, vsubl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11)));
vst1q_u16(dstu16 + j + 24, vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src0 = vld1_u8(src0 + j);
uint8x8_t v_src1 = vld1_u8(src1 + j);
vst1q_u16(dstu16 + j, vsubl_u8(v_src0, v_src1));
}
for (; j < size.width; j++)
dst[j] = (s16)src0[j] - (s16)src1[j];
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
f32 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
for (; j < roiw32; j += 32)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16);
uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16);
int16x8_t vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src00), vget_low_u8(v_src10)));
int16x8_t vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10)));
vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) )));
vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) )));
vst1q_f32(dst + j + 8, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) )));
vst1q_f32(dst + j + 12, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) )));
vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src01), vget_low_u8(v_src11)));
vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11)));
vst1q_f32(dst + j + 16, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) )));
vst1q_f32(dst + j + 20, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) )));
vst1q_f32(dst + j + 24, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) )));
vst1q_f32(dst + j + 28, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) )));
}
for (; j < roiw8; j += 8)
{
uint8x8_t v_src0 = vld1_u8(src0 + j);
uint8x8_t v_src1 = vld1_u8(src1 + j);
int16x8_t vs = vreinterpretq_s16_u16(vsubl_u8(v_src0, v_src1));
vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vs) )));
vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vs) )));
}
for(; j < size.width; j++)
dst[j] = (f32)src0[j] - (f32)src1[j];
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
void sub(const Size2D &size,
const u8 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
}
else
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
uint8x16_t v_src0 = vld1q_u8(src0 + j);
int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
int16x8_t v_src1 = vld1q_s16(src1 + j);
int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const u8 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
for (size_t i = 0; i < size.height; ++i)
{
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
size_t j = 0;
if (policy == CONVERT_POLICY_SATURATE)
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
uint8x16_t v_src1 = vld1q_u8(src1 + j);
int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vld1q_s16(src0 + j);
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
}
else
{
for (; j < roiw16; j += 16)
{
internal::prefetch(src0 + j);
internal::prefetch(src1 + j);
int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
uint8x16_t v_src1 = vld1q_u8(src1 + j);
int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
vst1q_s16(dst + j, v_dst0);
vst1q_s16(dst + j + 8, v_dst1);
}
for (; j < roiw8; j += 8)
{
int16x8_t v_src0 = vld1q_s16(src0 + j);
int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
vst1q_s16(dst + j, v_dst);
}
for (; j < size.width; j++)
dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
}
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s8 * src0Base, ptrdiff_t src0Stride,
const s8 * src1Base, ptrdiff_t src1Stride,
s8 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<s8, s16>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<s8, s16>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s16 * src0Base, ptrdiff_t src0Stride,
const s16 * src1Base, ptrdiff_t src1Stride,
s16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<s16, s32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<s16, s32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const u16 * src0Base, ptrdiff_t src0Stride,
const u16 * src1Base, ptrdiff_t src1Stride,
u16 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<u16, s32>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<u16, s32>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const s32 * src0Base, ptrdiff_t src0Stride,
const s32 * src1Base, ptrdiff_t src1Stride,
s32 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<s32, s64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<s32, s64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const u32 * src0Base, ptrdiff_t src0Stride,
const u32 * src1Base, ptrdiff_t src1Stride,
u32 *dstBase, ptrdiff_t dstStride,
CONVERT_POLICY policy)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
if (policy == CONVERT_POLICY_SATURATE)
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubSaturate<u32, s64>());
}
else
{
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<u32, s64>());
}
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
(void)policy;
#endif
}
void sub(const Size2D &size,
const f32 * src0Base, ptrdiff_t src0Stride,
const f32 * src1Base, ptrdiff_t src1Stride,
f32 *dstBase, ptrdiff_t dstStride)
{
internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
internal::vtransform(size,
src0Base, src0Stride,
src1Base, src1Stride,
dstBase, dstStride,
SubWrap<f32, f32>());
#else
(void)size;
(void)src0Base;
(void)src0Stride;
(void)src1Base;
(void)src1Stride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS

385
3rdparty/carotene/src/sum.cpp vendored Normal file
View File

@ -0,0 +1,385 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include "vtransform.hpp"
namespace CAROTENE_NS {
bool isSumSupported(u32 channels)
{
return (channels && channels < 5);
}
void sum(const Size2D &_size,
const u8 * srcBase, ptrdiff_t srcStride,
u32 * sumdst, u32 channels)
{
internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
const ptrdiff_t width = size.width * channels;
for(size_t k = 0; k < size.height; ++k)
{
const u8* src = internal::getRowPtr( srcBase, srcStride, k);
ptrdiff_t i = 0;
if (channels == 3)
{
uint32x4_t vs1231 = vdupq_n_u32(0);
uint32x4_t vs3123 = vdupq_n_u32(0);
uint32x4_t vs2312 = vdupq_n_u32(0);
for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));
for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3)
{
internal::prefetch(src + j + 24);
s1 = vaddw_u8(s1, vld1_u8(src + j + 0));
s2 = vaddw_u8(s2, vld1_u8(src + j + 8));
s3 = vaddw_u8(s3, vld1_u8(src + j + 16));
}
vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
}
if (i <= width - 8*3)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0));
uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8));
uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16));
for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3)
{
internal::prefetch(src + 24);
s1 = vaddw_u8(s1, vld1_u8(src + 0));
s2 = vaddw_u8(s2, vld1_u8(src + 8));
s3 = vaddw_u8(s3, vld1_u8(src + 16));
}
vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2)));
vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3)));
vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1)));
}
u32 sum[12];
vst1q_u32(sum+0, vs1231);
vst1q_u32(sum+4, vs2312);
vst1q_u32(sum+8, vs3123);
for (; i < width; i += 3, src += 3)
{
sumdst[0] += src[0];
sumdst[1] += src[1];
sumdst[2] += src[2];
}
sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9];
sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10];
sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11];
}
else
{
uint32x4_t vs = vdupq_n_u32(0);
for (; i <= width - 257*8; i += 257*8, src += 257 * 8)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src));
for (int j = 8; j < 257 * 8; j += 8)
{
internal::prefetch(src + j);
s1 = vaddw_u8(s1, vld1_u8(src + j));
}
vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
}
if (i < width - 7)
{
uint16x8_t s1 = vmovl_u8(vld1_u8(src));
for(i+=8,src+=8; i < width-7; i+=8,src+=8)
{
internal::prefetch(src);
s1 = vaddw_u8(s1, vld1_u8(src));
}
vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1)));
}
if (channels == 1)
{
uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2));
u32 s0 = vget_lane_u32(vs1, 0);
for(; i < width; ++i,++src)
s0 += src[0];
sumdst[0] += s0;
}
else if (channels == 4)
{
vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst)));
for(; i < width; i+=4,src+=4)
{
sumdst[0] += src[0];
sumdst[1] += src[1];
sumdst[2] += src[2];
sumdst[3] += src[3];
}
}
else//if (channels == 2)
{
uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs));
vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst)));
for(; i < width; i+=2,src+=2)
{
sumdst[0] += src[0];
sumdst[1] += src[1];
}
}
}//channels != 3
}
#else
(void)_size;
(void)srcBase;
(void)srcStride;
(void)sumdst;
(void)channels;
#endif
}
void sum(const Size2D &_size,
const f32 * srcBase, ptrdiff_t srcStride,
f64 * sumdst, u32 channels)
{
internal::assertSupportedConfiguration(isSumSupported(channels));
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width))
{
size.width *= size.height;
size.height = 1;
}
const ptrdiff_t width = size.width * channels;
for(size_t k = 0; k < size.height; ++k)
{
const f32* src = internal::getRowPtr( srcBase, srcStride, k);
ptrdiff_t i = 0;
if (channels == 3)
{
float32x4_t vs1231 = vdupq_n_f32(0);
float32x4_t vs2312 = vdupq_n_f32(0);
float32x4_t vs3123 = vdupq_n_f32(0);
for(; i <= width-12; i += 12)
{
internal::prefetch(src + i + 12);
vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0));
vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4));
vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8));
}
f32 s[12];
vst1q_f32(s + 0, vs1231);
vst1q_f32(s + 4, vs2312);
vst1q_f32(s + 8, vs3123);
sumdst[0] += s[0] + s[3] + s[6] + s[9];
sumdst[1] += s[1] + s[4] + s[7] + s[10];
sumdst[2] += s[2] + s[5] + s[8] + s[11];
for( ; i < width; i+=3)
{
sumdst[0] += src[i];
sumdst[1] += src[i+1];
sumdst[2] += src[i+2];
}
}
else
{
float32x4_t vs = vdupq_n_f32(0);
for(; i <= width-4; i += 4)
{
internal::prefetch(src + i);
vs = vaddq_f32(vs, vld1q_f32(src+i));
}
if (channels == 1)
{
float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs));
f32 s[2];
vst1_f32(s, vs2);
sumdst[0] += s[0] + s[1];
for( ; i < width; i++)
sumdst[0] += src[i];
}
else if (channels == 4)
{
f32 s[4];
vst1q_f32(s, vs);
sumdst[0] += s[0];
sumdst[1] += s[1];
sumdst[2] += s[2];
sumdst[3] += s[3];
}
else//if (channels == 2)
{
float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs));
f32 s[2];
vst1_f32(s, vs2);
sumdst[0] += s[0];
sumdst[1] += s[1];
if(i < width)
{
sumdst[0] += src[i];
sumdst[1] += src[i+1];
}
}
}//channels != 3
}
#else
(void)_size;
(void)srcBase;
(void)srcStride;
(void)sumdst;
(void)channels;
#endif
}
bool isSqsumSupported(u32 channels)
{
return (channels && ((4/channels)*channels == 4));
}
void sqsum(const Size2D &_size,
const u8 * srcBase, ptrdiff_t srcStride,
f64 * sumdst, f64 * sqsumdst, u32 channels)
{
internal::assertSupportedConfiguration(isSqsumSupported(channels));
#ifdef CAROTENE_NEON
Size2D size(_size);
if (srcStride == (ptrdiff_t)(size.width*channels))
{
size.width *= size.height;
size.height = 1;
}
const size_t width = size.width * channels;
size_t blockSize0 = 1 << 23;
size_t roiw8 = width & ~7;
uint32x4_t v_zero = vdupq_n_u32(0u);
for (size_t i = 0; i < size.height; ++i)
{
const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
size_t j = 0u;
while (j < roiw8)
{
size_t blockSize = std::min(roiw8 - j, blockSize0) + j;
uint32x4_t v_sum = v_zero;
uint32x4_t v_sqsum = v_zero;
for ( ; j < blockSize ; j += 8, src += 8)
{
internal::prefetch(src);
uint8x8_t v_src0 = vld1_u8(src);
uint16x8_t v_src = vmovl_u8(v_src0);
uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src);
v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi));
v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo);
v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi);
}
u32 arsum[8];
vst1q_u32(arsum, v_sum);
vst1q_u32(arsum + 4, v_sqsum);
sumdst[0] += (f64)arsum[0];
sumdst[1 % channels] += (f64)arsum[1];
sumdst[2 % channels] += (f64)arsum[2];
sumdst[3 % channels] += (f64)arsum[3];
sqsumdst[0] += (f64)arsum[4];
sqsumdst[1 % channels] += (f64)arsum[5];
sqsumdst[2 % channels] += (f64)arsum[6];
sqsumdst[3 % channels] += (f64)arsum[7];
}
// collect a few last elements in the current row
// it's ok to process channels elements per step
// since we could handle 1,2 or 4 channels
// we always have channels-fold amount of elements remaining
for ( ; j < width; j+=channels, src+=channels)
{
for (u32 kk = 0; kk < channels; kk++)
{
u32 srcval = src[kk];
sumdst[kk] += srcval;
sqsumdst[kk] += srcval * srcval;
}
}
}
#else
(void)_size;
(void)srcBase;
(void)srcStride;
(void)sumdst;
(void)sqsumdst;
(void)channels;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,241 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <vector>
#include <cstring>
namespace CAROTENE_NS {
#define ENABLE4LINESMATCHING false //Disabled since overall time for simultaneous 4 lines matching is greater than
//time for simultaneous 2 lines matching for the same amount of data
bool isMatchTemplateSupported(const Size2D &tmplSize)
{
return isSupportedConfiguration() &&
tmplSize.width >= 8 && // Actually the function could process even shorter templates
// but there will be no NEON optimization in this case
(tmplSize.width * tmplSize.height) <= 256;
}
void matchTemplate(const Size2D &srcSize,
const u8 * srcBase, ptrdiff_t srcStride,
const Size2D &tmplSize,
const u8 * tmplBase, ptrdiff_t tmplStride,
f32 * dstBase, ptrdiff_t dstStride,
bool normalize)
{
internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
#ifdef CAROTENE_NEON
const size_t tmplW = tmplSize.width;
const size_t tmplH = tmplSize.height;
const size_t dstW = srcSize.width - tmplSize.width + 1;
const size_t dstH = srcSize.height - tmplSize.height + 1;
//template correlation part
{
#if ENABLE4LINESMATCHING
const size_t dstroiw4 = dstW & ~3u;
#endif
const size_t dstroiw2 = dstW & ~1u;
const size_t tmplroiw = tmplW & ~7u;
const size_t dstride = dstStride >> 2;
f32 *corr = dstBase;
const u8 *imgrrow = srcBase;
for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
{
size_t c = 0;
#if ENABLE4LINESMATCHING
for(; c < dstroiw4; c+=4)
{
u32 dot[4] = {0, 0, 0, 0};
uint32x4_t vdot0 = vmovq_n_u32(0);
uint32x4_t vdot1 = vmovq_n_u32(0);
uint32x4_t vdot2 = vmovq_n_u32(0);
uint32x4_t vdot3 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
vdot2 = vpadalq_u16(vdot2, vd2);
vdot3 = vpadalq_u16(vdot3, vd3);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
dot[2] += tmpl[j] * img[j + c + 2];
dot[3] += tmpl[j] * img[j + c + 3];
}
}
uint32x4_t vdotx = vld1q_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
}
#endif
for(; c < dstroiw2; c+=2)
{
u32 dot[2] = {0, 0};
uint32x4_t vdot0 = vmovq_n_u32(0);
uint32x4_t vdot1 = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
vdot0 = vpadalq_u16(vdot0, vd0);
vdot1 = vpadalq_u16(vdot1, vd1);
}
for(; j < tmplW; ++j)
{
dot[0] += tmpl[j] * img[j + c + 0];
dot[1] += tmpl[j] * img[j + c + 1];
}
}
uint32x2_t vdotx = vld1_u32(dot);
uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
}
for(; c < dstW; ++c)
{
u32 dot = 0;
uint32x4_t vdot = vmovq_n_u32(0);
const u8 *img = imgrrow;
const u8 *tmpl = tmplBase;
for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
{
size_t j = 0;
for(; j < tmplroiw; j+=8)
{
uint8x8_t vtmpl = vld1_u8(tmpl + j);
uint8x8_t vimg = vld1_u8(img + j + c);
uint16x8_t vd = vmull_u8(vtmpl, vimg);
vdot = vpadalq_u16(vdot, vd);
}
for(; j < tmplW; ++j)
dot += tmpl[j] * img[j + c];
}
u32 wdot[2];
vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
dot += wdot[0] + wdot[1];
corr[c] = (f32)dot;
}
}
}
if(normalize)
{
f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
size_t iw = srcSize.width+1;
size_t ih = srcSize.height+1;
std::vector<f64> _sqsum(iw*ih);
f64 *sqsum = &_sqsum[0];
memset(sqsum, 0, iw*sizeof(f64));
for(size_t i = 1; i < ih; ++i)
sqsum[iw*i] = 0.;
sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
for(size_t i = 0; i < dstH; ++i)
{
f32 *result = internal::getRowPtr(dstBase, dstStride, i);
for(size_t j = 0; j < dstW; ++j)
{
double s2 = sqsum[iw*i + j] +
sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
sqsum[iw*(i + tmplSize.height) + j] -
sqsum[iw*i + j + tmplSize.width];
result[j] /= tn * std::sqrt(s2);
}
}
}
#else
(void)srcSize;
(void)srcBase;
(void)srcStride;
(void)tmplBase;
(void)tmplStride;
(void)dstBase;
(void)dstStride;
(void)normalize;
#endif
}
} // namespace CAROTENE_NS

1627
3rdparty/carotene/src/threshold.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

689
3rdparty/carotene/src/vtransform.hpp vendored Normal file
View File

@ -0,0 +1,689 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_VTRANSFORM_HPP
#define CAROTENE_SRC_VTRANSFORM_HPP
#include "common.hpp"
#include <carotene/types.hpp>
#ifdef CAROTENE_NEON
namespace CAROTENE_NS { namespace internal {
////////////////////////////// Type Traits ///////////////////////
template <typename T, int cn = 1>
struct VecTraits;
template <> struct VecTraits< u8, 1> { typedef uint8x16_t vec128; typedef uint8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits< s8, 1> { typedef int8x16_t vec128; typedef int8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
template <> struct VecTraits<u16, 1> { typedef uint16x8_t vec128; typedef uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s16, 1> { typedef int16x8_t vec128; typedef int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s32, 1> { typedef int32x4_t vec128; typedef int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<u32, 1> { typedef uint32x4_t vec128; typedef uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<s64, 1> { typedef int64x2_t vec128; typedef int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<u64, 1> { typedef uint64x2_t vec128; typedef uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t vec128; typedef uint8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits< s8, 2> { typedef int8x16x2_t vec128; typedef int8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t vec128; typedef uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s16, 2> { typedef int16x8x2_t vec128; typedef int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s32, 2> { typedef int32x4x2_t vec128; typedef int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t vec128; typedef uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<s64, 2> { typedef int64x2x2_t vec128; typedef int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t vec128; typedef uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t vec128; typedef uint8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits< s8, 3> { typedef int8x16x3_t vec128; typedef int8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t vec128; typedef uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s16, 3> { typedef int16x8x3_t vec128; typedef int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s32, 3> { typedef int32x4x3_t vec128; typedef int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t vec128; typedef uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<s64, 3> { typedef int64x2x3_t vec128; typedef int64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t vec128; typedef uint64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t vec128; typedef uint8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits< s8, 4> { typedef int8x16x4_t vec128; typedef int8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t vec128; typedef uint16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s16, 4> { typedef int16x8x4_t vec128; typedef int16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s32, 4> { typedef int32x4x4_t vec128; typedef int32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t vec128; typedef uint32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<s64, 4> { typedef int64x2x4_t vec128; typedef int64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t vec128; typedef uint64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
////////////////////////////// vld1q ///////////////////////
inline uint8x16_t vld1q(const u8 * ptr) { return vld1q_u8(ptr); }
inline int8x16_t vld1q(const s8 * ptr) { return vld1q_s8(ptr); }
inline uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
inline int16x8_t vld1q(const s16 * ptr) { return vld1q_s16(ptr); }
inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }
////////////////////////////// vld1 ///////////////////////
inline uint8x8_t vld1(const u8 * ptr) { return vld1_u8(ptr); }
inline int8x8_t vld1(const s8 * ptr) { return vld1_s8(ptr); }
inline uint16x4_t vld1(const u16 * ptr) { return vld1_u16(ptr); }
inline int16x4_t vld1(const s16 * ptr) { return vld1_s16(ptr); }
inline uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
inline int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
inline float32x2_t vld1(const f32 * ptr) { return vld1_f32(ptr); }
////////////////////////////// vld2q ///////////////////////
inline uint8x16x2_t vld2q(const u8 * ptr) { return vld2q_u8(ptr); }
inline int8x16x2_t vld2q(const s8 * ptr) { return vld2q_s8(ptr); }
inline uint16x8x2_t vld2q(const u16 * ptr) { return vld2q_u16(ptr); }
inline int16x8x2_t vld2q(const s16 * ptr) { return vld2q_s16(ptr); }
inline uint32x4x2_t vld2q(const u32 * ptr) { return vld2q_u32(ptr); }
inline int32x4x2_t vld2q(const s32 * ptr) { return vld2q_s32(ptr); }
inline float32x4x2_t vld2q(const f32 * ptr) { return vld2q_f32(ptr); }
////////////////////////////// vld2 ///////////////////////
inline uint8x8x2_t vld2(const u8 * ptr) { return vld2_u8(ptr); }
inline int8x8x2_t vld2(const s8 * ptr) { return vld2_s8(ptr); }
inline uint16x4x2_t vld2(const u16 * ptr) { return vld2_u16(ptr); }
inline int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
inline uint32x2x2_t vld2(const u32 * ptr) { return vld2_u32(ptr); }
inline int32x2x2_t vld2(const s32 * ptr) { return vld2_s32(ptr); }
inline float32x2x2_t vld2(const f32 * ptr) { return vld2_f32(ptr); }
////////////////////////////// vld3q ///////////////////////
inline uint8x16x3_t vld3q(const u8 * ptr) { return vld3q_u8(ptr); }
inline int8x16x3_t vld3q(const s8 * ptr) { return vld3q_s8(ptr); }
inline uint16x8x3_t vld3q(const u16 * ptr) { return vld3q_u16(ptr); }
inline int16x8x3_t vld3q(const s16 * ptr) { return vld3q_s16(ptr); }
inline uint32x4x3_t vld3q(const u32 * ptr) { return vld3q_u32(ptr); }
inline int32x4x3_t vld3q(const s32 * ptr) { return vld3q_s32(ptr); }
inline float32x4x3_t vld3q(const f32 * ptr) { return vld3q_f32(ptr); }
////////////////////////////// vld3 ///////////////////////
inline uint8x8x3_t vld3(const u8 * ptr) { return vld3_u8(ptr); }
inline int8x8x3_t vld3(const s8 * ptr) { return vld3_s8(ptr); }
inline uint16x4x3_t vld3(const u16 * ptr) { return vld3_u16(ptr); }
inline int16x4x3_t vld3(const s16 * ptr) { return vld3_s16(ptr); }
inline uint32x2x3_t vld3(const u32 * ptr) { return vld3_u32(ptr); }
inline int32x2x3_t vld3(const s32 * ptr) { return vld3_s32(ptr); }
inline float32x2x3_t vld3(const f32 * ptr) { return vld3_f32(ptr); }
////////////////////////////// vld4q ///////////////////////
inline uint8x16x4_t vld4q(const u8 * ptr) { return vld4q_u8(ptr); }
inline int8x16x4_t vld4q(const s8 * ptr) { return vld4q_s8(ptr); }
inline uint16x8x4_t vld4q(const u16 * ptr) { return vld4q_u16(ptr); }
inline int16x8x4_t vld4q(const s16 * ptr) { return vld4q_s16(ptr); }
inline uint32x4x4_t vld4q(const u32 * ptr) { return vld4q_u32(ptr); }
inline int32x4x4_t vld4q(const s32 * ptr) { return vld4q_s32(ptr); }
inline float32x4x4_t vld4q(const f32 * ptr) { return vld4q_f32(ptr); }
////////////////////////////// vld4 ///////////////////////
inline uint8x8x4_t vld4(const u8 * ptr) { return vld4_u8(ptr); }
inline int8x8x4_t vld4(const s8 * ptr) { return vld4_s8(ptr); }
inline uint16x4x4_t vld4(const u16 * ptr) { return vld4_u16(ptr); }
inline int16x4x4_t vld4(const s16 * ptr) { return vld4_s16(ptr); }
inline uint32x2x4_t vld4(const u32 * ptr) { return vld4_u32(ptr); }
inline int32x2x4_t vld4(const s32 * ptr) { return vld4_s32(ptr); }
inline float32x2x4_t vld4(const f32 * ptr) { return vld4_f32(ptr); }
////////////////////////////// vst1q ///////////////////////
inline void vst1q(u8 * ptr, const uint8x16_t & v) { return vst1q_u8(ptr, v); }
inline void vst1q(s8 * ptr, const int8x16_t & v) { return vst1q_s8(ptr, v); }
inline void vst1q(u16 * ptr, const uint16x8_t & v) { return vst1q_u16(ptr, v); }
inline void vst1q(s16 * ptr, const int16x8_t & v) { return vst1q_s16(ptr, v); }
inline void vst1q(u32 * ptr, const uint32x4_t & v) { return vst1q_u32(ptr, v); }
inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); }
inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }
////////////////////////////// vst1 ///////////////////////
inline void vst1(u8 * ptr, const uint8x8_t & v) { return vst1_u8(ptr, v); }
inline void vst1(s8 * ptr, const int8x8_t & v) { return vst1_s8(ptr, v); }
inline void vst1(u16 * ptr, const uint16x4_t & v) { return vst1_u16(ptr, v); }
inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); }
inline void vst1(u32 * ptr, const uint32x2_t & v) { return vst1_u32(ptr, v); }
inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); }
inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); }
////////////////////////////// vst2q ///////////////////////
inline void vst2q(u8 * ptr, const uint8x16x2_t & v) { return vst2q_u8(ptr, v); }
inline void vst2q(s8 * ptr, const int8x16x2_t & v) { return vst2q_s8(ptr, v); }
inline void vst2q(u16 * ptr, const uint16x8x2_t & v) { return vst2q_u16(ptr, v); }
inline void vst2q(s16 * ptr, const int16x8x2_t & v) { return vst2q_s16(ptr, v); }
inline void vst2q(u32 * ptr, const uint32x4x2_t & v) { return vst2q_u32(ptr, v); }
inline void vst2q(s32 * ptr, const int32x4x2_t & v) { return vst2q_s32(ptr, v); }
inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); }
////////////////////////////// vst2 ///////////////////////
inline void vst2(u8 * ptr, const uint8x8x2_t & v) { return vst2_u8(ptr, v); }
inline void vst2(s8 * ptr, const int8x8x2_t & v) { return vst2_s8(ptr, v); }
inline void vst2(u16 * ptr, const uint16x4x2_t & v) { return vst2_u16(ptr, v); }
inline void vst2(s16 * ptr, const int16x4x2_t & v) { return vst2_s16(ptr, v); }
inline void vst2(u32 * ptr, const uint32x2x2_t & v) { return vst2_u32(ptr, v); }
inline void vst2(s32 * ptr, const int32x2x2_t & v) { return vst2_s32(ptr, v); }
inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); }
////////////////////////////// vst3q ///////////////////////
inline void vst3q(u8 * ptr, const uint8x16x3_t & v) { return vst3q_u8(ptr, v); }
inline void vst3q(s8 * ptr, const int8x16x3_t & v) { return vst3q_s8(ptr, v); }
inline void vst3q(u16 * ptr, const uint16x8x3_t & v) { return vst3q_u16(ptr, v); }
inline void vst3q(s16 * ptr, const int16x8x3_t & v) { return vst3q_s16(ptr, v); }
inline void vst3q(u32 * ptr, const uint32x4x3_t & v) { return vst3q_u32(ptr, v); }
inline void vst3q(s32 * ptr, const int32x4x3_t & v) { return vst3q_s32(ptr, v); }
inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); }
////////////////////////////// vst3 ///////////////////////
inline void vst3(u8 * ptr, const uint8x8x3_t & v) { return vst3_u8(ptr, v); }
inline void vst3(s8 * ptr, const int8x8x3_t & v) { return vst3_s8(ptr, v); }
inline void vst3(u16 * ptr, const uint16x4x3_t & v) { return vst3_u16(ptr, v); }
inline void vst3(s16 * ptr, const int16x4x3_t & v) { return vst3_s16(ptr, v); }
inline void vst3(u32 * ptr, const uint32x2x3_t & v) { return vst3_u32(ptr, v); }
inline void vst3(s32 * ptr, const int32x2x3_t & v) { return vst3_s32(ptr, v); }
inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); }
////////////////////////////// vst4q ///////////////////////
inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); }
inline void vst4q(s8 * ptr, const int8x16x4_t & v) { return vst4q_s8(ptr, v); }
inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); }
inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); }
inline void vst4q(u32 * ptr, const uint32x4x4_t & v) { return vst4q_u32(ptr, v); }
inline void vst4q(s32 * ptr, const int32x4x4_t & v) { return vst4q_s32(ptr, v); }
inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); }
////////////////////////////// vst4 ///////////////////////
inline void vst4(u8 * ptr, const uint8x8x4_t & v) { return vst4_u8(ptr, v); }
inline void vst4(s8 * ptr, const int8x8x4_t & v) { return vst4_s8(ptr, v); }
inline void vst4(u16 * ptr, const uint16x4x4_t & v) { return vst4_u16(ptr, v); }
inline void vst4(s16 * ptr, const int16x4x4_t & v) { return vst4_s16(ptr, v); }
inline void vst4(u32 * ptr, const uint32x2x4_t & v) { return vst4_u32(ptr, v); }
inline void vst4(s32 * ptr, const int32x2x4_t & v) { return vst4_s32(ptr, v); }
inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); }
////////////////////////////// vabdq ///////////////////////
inline uint8x16_t vabdq(const uint8x16_t & v0, const uint8x16_t & v1) { return vabdq_u8 (v0, v1); }
inline int8x16_t vabdq(const int8x16_t & v0, const int8x16_t & v1) { return vabdq_s8 (v0, v1); }
inline uint16x8_t vabdq(const uint16x8_t & v0, const uint16x8_t & v1) { return vabdq_u16(v0, v1); }
inline int16x8_t vabdq(const int16x8_t & v0, const int16x8_t & v1) { return vabdq_s16(v0, v1); }
inline uint32x4_t vabdq(const uint32x4_t & v0, const uint32x4_t & v1) { return vabdq_u32(v0, v1); }
inline int32x4_t vabdq(const int32x4_t & v0, const int32x4_t & v1) { return vabdq_s32(v0, v1); }
inline float32x4_t vabdq(const float32x4_t & v0, const float32x4_t & v1) { return vabdq_f32(v0, v1); }
////////////////////////////// vabd ///////////////////////
inline uint8x8_t vabd(const uint8x8_t & v0, const uint8x8_t & v1) { return vabd_u8 (v0, v1); }
inline int8x8_t vabd(const int8x8_t & v0, const int8x8_t & v1) { return vabd_s8 (v0, v1); }
inline uint16x4_t vabd(const uint16x4_t & v0, const uint16x4_t & v1) { return vabd_u16(v0, v1); }
inline int16x4_t vabd(const int16x4_t & v0, const int16x4_t & v1) { return vabd_s16(v0, v1); }
inline uint32x2_t vabd(const uint32x2_t & v0, const uint32x2_t & v1) { return vabd_u32(v0, v1); }
inline int32x2_t vabd(const int32x2_t & v0, const int32x2_t & v1) { return vabd_s32(v0, v1); }
inline float32x2_t vabd(const float32x2_t & v0, const float32x2_t & v1) { return vabd_f32(v0, v1); }
////////////////////////////// vminq ///////////////////////
inline uint8x16_t vminq(const uint8x16_t & v0, const uint8x16_t & v1) { return vminq_u8 (v0, v1); }
inline int8x16_t vminq(const int8x16_t & v0, const int8x16_t & v1) { return vminq_s8 (v0, v1); }
inline uint16x8_t vminq(const uint16x8_t & v0, const uint16x8_t & v1) { return vminq_u16(v0, v1); }
inline int16x8_t vminq(const int16x8_t & v0, const int16x8_t & v1) { return vminq_s16(v0, v1); }
inline uint32x4_t vminq(const uint32x4_t & v0, const uint32x4_t & v1) { return vminq_u32(v0, v1); }
inline int32x4_t vminq(const int32x4_t & v0, const int32x4_t & v1) { return vminq_s32(v0, v1); }
inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); }
////////////////////////////// vmin ///////////////////////
inline uint8x8_t vmin(const uint8x8_t & v0, const uint8x8_t & v1) { return vmin_u8 (v0, v1); }
inline int8x8_t vmin(const int8x8_t & v0, const int8x8_t & v1) { return vmin_s8 (v0, v1); }
inline uint16x4_t vmin(const uint16x4_t & v0, const uint16x4_t & v1) { return vmin_u16(v0, v1); }
inline int16x4_t vmin(const int16x4_t & v0, const int16x4_t & v1) { return vmin_s16(v0, v1); }
inline uint32x2_t vmin(const uint32x2_t & v0, const uint32x2_t & v1) { return vmin_u32(v0, v1); }
inline int32x2_t vmin(const int32x2_t & v0, const int32x2_t & v1) { return vmin_s32(v0, v1); }
inline float32x2_t vmin(const float32x2_t & v0, const float32x2_t & v1) { return vmin_f32(v0, v1); }
////////////////////////////// vmaxq ///////////////////////
inline uint8x16_t vmaxq(const uint8x16_t & v0, const uint8x16_t & v1) { return vmaxq_u8 (v0, v1); }
inline int8x16_t vmaxq(const int8x16_t & v0, const int8x16_t & v1) { return vmaxq_s8 (v0, v1); }
inline uint16x8_t vmaxq(const uint16x8_t & v0, const uint16x8_t & v1) { return vmaxq_u16(v0, v1); }
inline int16x8_t vmaxq(const int16x8_t & v0, const int16x8_t & v1) { return vmaxq_s16(v0, v1); }
inline uint32x4_t vmaxq(const uint32x4_t & v0, const uint32x4_t & v1) { return vmaxq_u32(v0, v1); }
inline int32x4_t vmaxq(const int32x4_t & v0, const int32x4_t & v1) { return vmaxq_s32(v0, v1); }
inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); }
////////////////////////////// vmax ///////////////////////
inline uint8x8_t vmax(const uint8x8_t & v0, const uint8x8_t & v1) { return vmax_u8 (v0, v1); }
inline int8x8_t vmax(const int8x8_t & v0, const int8x8_t & v1) { return vmax_s8 (v0, v1); }
inline uint16x4_t vmax(const uint16x4_t & v0, const uint16x4_t & v1) { return vmax_u16(v0, v1); }
inline int16x4_t vmax(const int16x4_t & v0, const int16x4_t & v1) { return vmax_s16(v0, v1); }
inline uint32x2_t vmax(const uint32x2_t & v0, const uint32x2_t & v1) { return vmax_u32(v0, v1); }
inline int32x2_t vmax(const int32x2_t & v0, const int32x2_t & v1) { return vmax_s32(v0, v1); }
inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); }
////////////////////////////// vdupq_n ///////////////////////
inline uint8x16_t vdupq_n(const u8 & val) { return vdupq_n_u8(val); }
inline int8x16_t vdupq_n(const s8 & val) { return vdupq_n_s8(val); }
inline uint16x8_t vdupq_n(const u16 & val) { return vdupq_n_u16(val); }
inline int16x8_t vdupq_n(const s16 & val) { return vdupq_n_s16(val); }
inline uint32x4_t vdupq_n(const u32 & val) { return vdupq_n_u32(val); }
inline int32x4_t vdupq_n(const s32 & val) { return vdupq_n_s32(val); }
inline uint64x2_t vdupq_n(const u64 & val) { return vdupq_n_u64(val); }
inline int64x2_t vdupq_n(const s64 & val) { return vdupq_n_s64(val); }
inline float32x4_t vdupq_n(const f32 & val) { return vdupq_n_f32(val); }
////////////////////////////// vdup_n ///////////////////////
inline uint8x8_t vdup_n(const u8 & val) { return vdup_n_u8(val); }
inline int8x8_t vdup_n(const s8 & val) { return vdup_n_s8(val); }
inline uint16x4_t vdup_n(const u16 & val) { return vdup_n_u16(val); }
inline int16x4_t vdup_n(const s16 & val) { return vdup_n_s16(val); }
inline uint32x2_t vdup_n(const u32 & val) { return vdup_n_u32(val); }
inline int32x2_t vdup_n(const s32 & val) { return vdup_n_s32(val); }
inline uint64x1_t vdup_n(const u64 & val) { return vdup_n_u64(val); }
inline int64x1_t vdup_n(const s64 & val) { return vdup_n_s64(val); }
inline float32x2_t vdup_n(const f32 & val) { return vdup_n_f32(val); }
////////////////////////////// vget_low ///////////////////////
inline uint8x8_t vget_low(const uint8x16_t & v) { return vget_low_u8 (v); }
inline int8x8_t vget_low(const int8x16_t & v) { return vget_low_s8 (v); }
inline uint16x4_t vget_low(const uint16x8_t & v) { return vget_low_u16(v); }
inline int16x4_t vget_low(const int16x8_t & v) { return vget_low_s16(v); }
inline uint32x2_t vget_low(const uint32x4_t & v) { return vget_low_u32(v); }
inline int32x2_t vget_low(const int32x4_t & v) { return vget_low_s32(v); }
inline float32x2_t vget_low(const float32x4_t & v) { return vget_low_f32(v); }
////////////////////////////// vget_high ///////////////////////
inline uint8x8_t vget_high(const uint8x16_t & v) { return vget_high_u8 (v); }
inline int8x8_t vget_high(const int8x16_t & v) { return vget_high_s8 (v); }
inline uint16x4_t vget_high(const uint16x8_t & v) { return vget_high_u16(v); }
inline int16x4_t vget_high(const int16x8_t & v) { return vget_high_s16(v); }
inline uint32x2_t vget_high(const uint32x4_t & v) { return vget_high_u32(v); }
inline int32x2_t vget_high(const int32x4_t & v) { return vget_high_s32(v); }
inline float32x2_t vget_high(const float32x4_t & v) { return vget_high_f32(v); }
////////////////////////////// vcombine ///////////////////////
inline uint8x16_t vcombine(const uint8x8_t & v0, const uint8x8_t & v1) { return vcombine_u8 (v0, v1); }
inline int8x16_t vcombine(const int8x8_t & v0, const int8x8_t & v1) { return vcombine_s8 (v0, v1); }
inline uint16x8_t vcombine(const uint16x4_t & v0, const uint16x4_t & v1) { return vcombine_u16(v0, v1); }
inline int16x8_t vcombine(const int16x4_t & v0, const int16x4_t & v1) { return vcombine_s16(v0, v1); }
inline uint32x4_t vcombine(const uint32x2_t & v0, const uint32x2_t & v1) { return vcombine_u32(v0, v1); }
inline int32x4_t vcombine(const int32x2_t & v0, const int32x2_t & v1) { return vcombine_s32(v0, v1); }
inline float32x4_t vcombine(const float32x2_t & v0, const float32x2_t & v1) { return vcombine_f32(v0, v1); }
////////////////////////////// vaddq ///////////////////////
inline uint8x16_t vaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vaddq_u8 (v0, v1); }
inline int8x16_t vaddq(const int8x16_t & v0, const int8x16_t & v1) { return vaddq_s8 (v0, v1); }
inline uint16x8_t vaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vaddq_u16(v0, v1); }
inline int16x8_t vaddq(const int16x8_t & v0, const int16x8_t & v1) { return vaddq_s16(v0, v1); }
inline uint32x4_t vaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vaddq_u32(v0, v1); }
inline int32x4_t vaddq(const int32x4_t & v0, const int32x4_t & v1) { return vaddq_s32(v0, v1); }
inline float32x4_t vaddq(const float32x4_t & v0, const float32x4_t & v1) { return vaddq_f32(v0, v1); }
////////////////////////////// vadd ///////////////////////
inline uint8x8_t vadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vadd_u8 (v0, v1); }
inline int8x8_t vadd(const int8x8_t & v0, const int8x8_t & v1) { return vadd_s8 (v0, v1); }
inline uint16x4_t vadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vadd_u16(v0, v1); }
inline int16x4_t vadd(const int16x4_t & v0, const int16x4_t & v1) { return vadd_s16(v0, v1); }
inline uint32x2_t vadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vadd_u32(v0, v1); }
inline int32x2_t vadd(const int32x2_t & v0, const int32x2_t & v1) { return vadd_s32(v0, v1); }
inline float32x2_t vadd(const float32x2_t & v0, const float32x2_t & v1) { return vadd_f32(v0, v1); }
////////////////////////////// vqaddq ///////////////////////
inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); }
inline int8x16_t vqaddq(const int8x16_t & v0, const int8x16_t & v1) { return vqaddq_s8 (v0, v1); }
inline uint16x8_t vqaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqaddq_u16(v0, v1); }
inline int16x8_t vqaddq(const int16x8_t & v0, const int16x8_t & v1) { return vqaddq_s16(v0, v1); }
inline uint32x4_t vqaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqaddq_u32(v0, v1); }
inline int32x4_t vqaddq(const int32x4_t & v0, const int32x4_t & v1) { return vqaddq_s32(v0, v1); }
////////////////////////////// vqadd ///////////////////////
inline uint8x8_t vqadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vqadd_u8 (v0, v1); }
inline int8x8_t vqadd(const int8x8_t & v0, const int8x8_t & v1) { return vqadd_s8 (v0, v1); }
inline uint16x4_t vqadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vqadd_u16(v0, v1); }
inline int16x4_t vqadd(const int16x4_t & v0, const int16x4_t & v1) { return vqadd_s16(v0, v1); }
inline uint32x2_t vqadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vqadd_u32(v0, v1); }
inline int32x2_t vqadd(const int32x2_t & v0, const int32x2_t & v1) { return vqadd_s32(v0, v1); }
////////////////////////////// vsubq ///////////////////////
inline uint8x16_t vsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vsubq_u8 (v0, v1); }
inline int8x16_t vsubq(const int8x16_t & v0, const int8x16_t & v1) { return vsubq_s8 (v0, v1); }
inline uint16x8_t vsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vsubq_u16(v0, v1); }
inline int16x8_t vsubq(const int16x8_t & v0, const int16x8_t & v1) { return vsubq_s16(v0, v1); }
inline uint32x4_t vsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vsubq_u32(v0, v1); }
inline int32x4_t vsubq(const int32x4_t & v0, const int32x4_t & v1) { return vsubq_s32(v0, v1); }
inline float32x4_t vsubq(const float32x4_t & v0, const float32x4_t & v1) { return vsubq_f32(v0, v1); }
////////////////////////////// vsub ///////////////////////
inline uint8x8_t vsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vsub_u8 (v0, v1); }
inline int8x8_t vsub(const int8x8_t & v0, const int8x8_t & v1) { return vsub_s8 (v0, v1); }
inline uint16x4_t vsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vsub_u16(v0, v1); }
inline int16x4_t vsub(const int16x4_t & v0, const int16x4_t & v1) { return vsub_s16(v0, v1); }
inline uint32x2_t vsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vsub_u32(v0, v1); }
inline int32x2_t vsub(const int32x2_t & v0, const int32x2_t & v1) { return vsub_s32(v0, v1); }
inline float32x2_t vsub(const float32x2_t & v0, const float32x2_t & v1) { return vsub_f32(v0, v1); }
////////////////////////////// vqsubq ///////////////////////
inline uint8x16_t vqsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqsubq_u8 (v0, v1); }
inline int8x16_t vqsubq(const int8x16_t & v0, const int8x16_t & v1) { return vqsubq_s8 (v0, v1); }
inline uint16x8_t vqsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqsubq_u16(v0, v1); }
inline int16x8_t vqsubq(const int16x8_t & v0, const int16x8_t & v1) { return vqsubq_s16(v0, v1); }
inline uint32x4_t vqsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqsubq_u32(v0, v1); }
inline int32x4_t vqsubq(const int32x4_t & v0, const int32x4_t & v1) { return vqsubq_s32(v0, v1); }
inline uint64x2_t vqsubq(const uint64x2_t & v0, const uint64x2_t & v1) { return vqsubq_u64(v0, v1); }
inline int64x2_t vqsubq(const int64x2_t & v0, const int64x2_t & v1) { return vqsubq_s64(v0, v1); }
////////////////////////////// vqsub ///////////////////////
inline uint8x8_t vqsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vqsub_u8 (v0, v1); }
inline int8x8_t vqsub(const int8x8_t & v0, const int8x8_t & v1) { return vqsub_s8 (v0, v1); }
inline uint16x4_t vqsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vqsub_u16(v0, v1); }
inline int16x4_t vqsub(const int16x4_t & v0, const int16x4_t & v1) { return vqsub_s16(v0, v1); }
inline uint32x2_t vqsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vqsub_u32(v0, v1); }
inline int32x2_t vqsub(const int32x2_t & v0, const int32x2_t & v1) { return vqsub_s32(v0, v1); }
inline uint64x1_t vqsub(const uint64x1_t & v0, const uint64x1_t & v1) { return vqsub_u64(v0, v1); }
inline int64x1_t vqsub(const int64x1_t & v0, const int64x1_t & v1) { return vqsub_s64(v0, v1); }
////////////////////////////// vmull ///////////////////////
inline uint16x8_t vmull(const uint8x8_t & v0, const uint8x8_t & v1) { return vmull_u8 (v0, v1); }
inline int16x8_t vmull(const int8x8_t & v0, const int8x8_t & v1) { return vmull_s8 (v0, v1); }
inline uint32x4_t vmull(const uint16x4_t & v0, const uint16x4_t & v1) { return vmull_u16(v0, v1); }
inline int32x4_t vmull(const int16x4_t & v0, const int16x4_t & v1) { return vmull_s16(v0, v1); }
inline uint64x2_t vmull(const uint32x2_t & v0, const uint32x2_t & v1) { return vmull_u32(v0, v1); }
inline int64x2_t vmull(const int32x2_t & v0, const int32x2_t & v1) { return vmull_s32(v0, v1); }
////////////////////////////// vrev64q ///////////////////////
inline uint8x16_t vrev64q(const uint8x16_t & v) { return vrev64q_u8 (v); }
inline int8x16_t vrev64q(const int8x16_t & v) { return vrev64q_s8 (v); }
inline uint16x8_t vrev64q(const uint16x8_t & v) { return vrev64q_u16(v); }
inline int16x8_t vrev64q(const int16x8_t & v) { return vrev64q_s16(v); }
inline uint32x4_t vrev64q(const uint32x4_t & v) { return vrev64q_u32(v); }
inline int32x4_t vrev64q(const int32x4_t & v) { return vrev64q_s32(v); }
inline float32x4_t vrev64q(const float32x4_t & v) { return vrev64q_f32(v); }
////////////////////////////// vrev64 ///////////////////////
inline uint8x8_t vrev64(const uint8x8_t & v) { return vrev64_u8 (v); }
inline int8x8_t vrev64(const int8x8_t & v) { return vrev64_s8 (v); }
inline uint16x4_t vrev64(const uint16x4_t & v) { return vrev64_u16(v); }
inline int16x4_t vrev64(const int16x4_t & v) { return vrev64_s16(v); }
inline uint32x2_t vrev64(const uint32x2_t & v) { return vrev64_u32(v); }
inline int32x2_t vrev64(const int32x2_t & v) { return vrev64_s32(v); }
inline float32x2_t vrev64(const float32x2_t & v) { return vrev64_f32(v); }
////////////////////////////// vceqq ///////////////////////
inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1) { return vceqq_u8 (v0, v1); }
inline uint8x16_t vceqq(const int8x16_t & v0, const int8x16_t & v1) { return vceqq_s8 (v0, v1); }
inline uint16x8_t vceqq(const uint16x8_t & v0, const uint16x8_t & v1) { return vceqq_u16(v0, v1); }
inline uint16x8_t vceqq(const int16x8_t & v0, const int16x8_t & v1) { return vceqq_s16(v0, v1); }
inline uint32x4_t vceqq(const uint32x4_t & v0, const uint32x4_t & v1) { return vceqq_u32(v0, v1); }
inline uint32x4_t vceqq(const int32x4_t & v0, const int32x4_t & v1) { return vceqq_s32(v0, v1); }
inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); }
////////////////////////////// vceq ///////////////////////
inline uint8x8_t vceq(const uint8x8_t & v0, const uint8x8_t & v1) { return vceq_u8 (v0, v1); }
inline uint8x8_t vceq(const int8x8_t & v0, const int8x8_t & v1) { return vceq_s8 (v0, v1); }
inline uint16x4_t vceq(const uint16x4_t & v0, const uint16x4_t & v1) { return vceq_u16(v0, v1); }
inline uint16x4_t vceq(const int16x4_t & v0, const int16x4_t & v1) { return vceq_s16(v0, v1); }
inline uint32x2_t vceq(const uint32x2_t & v0, const uint32x2_t & v1) { return vceq_u32(v0, v1); }
inline uint32x2_t vceq(const int32x2_t & v0, const int32x2_t & v1) { return vceq_s32(v0, v1); }
inline uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); }
////////////////////////////// vcgtq ///////////////////////
inline uint8x16_t vcgtq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgtq_u8 (v0, v1); }
inline uint8x16_t vcgtq(const int8x16_t & v0, const int8x16_t & v1) { return vcgtq_s8 (v0, v1); }
inline uint16x8_t vcgtq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgtq_u16(v0, v1); }
inline uint16x8_t vcgtq(const int16x8_t & v0, const int16x8_t & v1) { return vcgtq_s16(v0, v1); }
inline uint32x4_t vcgtq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgtq_u32(v0, v1); }
inline uint32x4_t vcgtq(const int32x4_t & v0, const int32x4_t & v1) { return vcgtq_s32(v0, v1); }
inline uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); }
////////////////////////////// vcgt ///////////////////////
inline uint8x8_t vcgt(const uint8x8_t & v0, const uint8x8_t & v1) { return vcgt_u8 (v0, v1); }
inline uint8x8_t vcgt(const int8x8_t & v0, const int8x8_t & v1) { return vcgt_s8 (v0, v1); }
inline uint16x4_t vcgt(const uint16x4_t & v0, const uint16x4_t & v1) { return vcgt_u16(v0, v1); }
inline uint16x4_t vcgt(const int16x4_t & v0, const int16x4_t & v1) { return vcgt_s16(v0, v1); }
inline uint32x2_t vcgt(const uint32x2_t & v0, const uint32x2_t & v1) { return vcgt_u32(v0, v1); }
inline uint32x2_t vcgt(const int32x2_t & v0, const int32x2_t & v1) { return vcgt_s32(v0, v1); }
inline uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); }
////////////////////////////// vcgeq ///////////////////////
inline uint8x16_t vcgeq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgeq_u8 (v0, v1); }
inline uint8x16_t vcgeq(const int8x16_t & v0, const int8x16_t & v1) { return vcgeq_s8 (v0, v1); }
inline uint16x8_t vcgeq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgeq_u16(v0, v1); }
inline uint16x8_t vcgeq(const int16x8_t & v0, const int16x8_t & v1) { return vcgeq_s16(v0, v1); }
inline uint32x4_t vcgeq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgeq_u32(v0, v1); }
inline uint32x4_t vcgeq(const int32x4_t & v0, const int32x4_t & v1) { return vcgeq_s32(v0, v1); }
inline uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); }
////////////////////////////// vcge ///////////////////////
inline uint8x8_t vcge(const uint8x8_t & v0, const uint8x8_t & v1) { return vcge_u8 (v0, v1); }
inline uint8x8_t vcge(const int8x8_t & v0, const int8x8_t & v1) { return vcge_s8 (v0, v1); }
inline uint16x4_t vcge(const uint16x4_t & v0, const uint16x4_t & v1) { return vcge_u16(v0, v1); }
inline uint16x4_t vcge(const int16x4_t & v0, const int16x4_t & v1) { return vcge_s16(v0, v1); }
inline uint32x2_t vcge(const uint32x2_t & v0, const uint32x2_t & v1) { return vcge_u32(v0, v1); }
inline uint32x2_t vcge(const int32x2_t & v0, const int32x2_t & v1) { return vcge_s32(v0, v1); }
inline uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); }
////////////////////////////// vandq ///////////////////////
inline uint8x16_t vandq(const uint8x16_t & v0, const uint8x16_t & v1) { return vandq_u8 (v0, v1); }
inline int8x16_t vandq(const int8x16_t & v0, const int8x16_t & v1) { return vandq_s8 (v0, v1); }
inline uint16x8_t vandq(const uint16x8_t & v0, const uint16x8_t & v1) { return vandq_u16(v0, v1); }
inline int16x8_t vandq(const int16x8_t & v0, const int16x8_t & v1) { return vandq_s16(v0, v1); }
inline uint32x4_t vandq(const uint32x4_t & v0, const uint32x4_t & v1) { return vandq_u32(v0, v1); }
inline int32x4_t vandq(const int32x4_t & v0, const int32x4_t & v1) { return vandq_s32(v0, v1); }
////////////////////////////// vand ///////////////////////
inline uint8x8_t vand(const uint8x8_t & v0, const uint8x8_t & v1) { return vand_u8 (v0, v1); }
inline int8x8_t vand(const int8x8_t & v0, const int8x8_t & v1) { return vand_s8 (v0, v1); }
inline uint16x4_t vand(const uint16x4_t & v0, const uint16x4_t & v1) { return vand_u16(v0, v1); }
inline int16x4_t vand(const int16x4_t & v0, const int16x4_t & v1) { return vand_s16(v0, v1); }
inline uint32x2_t vand(const uint32x2_t & v0, const uint32x2_t & v1) { return vand_u32(v0, v1); }
inline int32x2_t vand(const int32x2_t & v0, const int32x2_t & v1) { return vand_s32(v0, v1); }
////////////////////////////// vmovn ///////////////////////
inline uint8x8_t vmovn(const uint16x8_t & v) { return vmovn_u16(v); }
inline int8x8_t vmovn(const int16x8_t & v) { return vmovn_s16(v); }
inline uint16x4_t vmovn(const uint32x4_t & v) { return vmovn_u32(v); }
inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); }
inline uint32x2_t vmovn(const uint64x2_t & v) { return vmovn_u64(v); }
inline int32x2_t vmovn(const int64x2_t & v) { return vmovn_s64(v); }
////////////////////////////// vqmovn ///////////////////////
inline uint8x8_t vqmovn(const uint16x8_t & v) { return vqmovn_u16(v); }
inline int8x8_t vqmovn(const int16x8_t & v) { return vqmovn_s16(v); }
inline uint16x4_t vqmovn(const uint32x4_t & v) { return vqmovn_u32(v); }
inline int16x4_t vqmovn(const int32x4_t & v) { return vqmovn_s32(v); }
inline uint32x2_t vqmovn(const uint64x2_t & v) { return vqmovn_u64(v); }
inline int32x2_t vqmovn(const int64x2_t & v) { return vqmovn_s64(v); }
////////////////////////////// vmovl ///////////////////////
inline uint16x8_t vmovl(const uint8x8_t & v) { return vmovl_u8(v); }
inline int16x8_t vmovl(const int8x8_t & v) { return vmovl_s8(v); }
inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); }
inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); }
////////////////////////////// vmvnq ///////////////////////
inline uint8x16_t vmvnq(const uint8x16_t & v) { return vmvnq_u8 (v); }
inline int8x16_t vmvnq(const int8x16_t & v) { return vmvnq_s8 (v); }
inline uint16x8_t vmvnq(const uint16x8_t & v) { return vmvnq_u16(v); }
inline int16x8_t vmvnq(const int16x8_t & v) { return vmvnq_s16(v); }
inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); }
inline int32x4_t vmvnq(const int32x4_t & v) { return vmvnq_s32(v); }
////////////////////////////// vmvn ///////////////////////
inline uint8x8_t vmvn(const uint8x8_t & v) { return vmvn_u8 (v); }
inline int8x8_t vmvn(const int8x8_t & v) { return vmvn_s8 (v); }
inline uint16x4_t vmvn(const uint16x4_t & v) { return vmvn_u16(v); }
inline int16x4_t vmvn(const int16x4_t & v) { return vmvn_s16(v); }
inline uint32x2_t vmvn(const uint32x2_t & v) { return vmvn_u32(v); }
inline int32x2_t vmvn(const int32x2_t & v) { return vmvn_s32(v); }
////////////////////////////// vbicq ///////////////////////
inline uint8x16_t vbicq(const uint8x16_t & v0, const uint8x16_t & v1) { return vbicq_u8 (v0, v1); }
inline int8x16_t vbicq(const int8x16_t & v0, const int8x16_t & v1) { return vbicq_s8 (v0, v1); }
inline uint16x8_t vbicq(const uint16x8_t & v0, const uint16x8_t & v1) { return vbicq_u16(v0, v1); }
inline int16x8_t vbicq(const int16x8_t & v0, const int16x8_t & v1) { return vbicq_s16(v0, v1); }
inline uint32x4_t vbicq(const uint32x4_t & v0, const uint32x4_t & v1) { return vbicq_u32(v0, v1); }
inline int32x4_t vbicq(const int32x4_t & v0, const int32x4_t & v1) { return vbicq_s32(v0, v1); }
inline uint64x2_t vbicq(const uint64x2_t & v0, const uint64x2_t & v1) { return vbicq_u64(v0, v1); }
inline int64x2_t vbicq(const int64x2_t & v0, const int64x2_t & v1) { return vbicq_s64(v0, v1); }
////////////////////////////// vbic ///////////////////////
inline uint8x8_t vbic(const uint8x8_t & v0, const uint8x8_t & v1) { return vbic_u8 (v0, v1); }
inline int8x8_t vbic(const int8x8_t & v0, const int8x8_t & v1) { return vbic_s8 (v0, v1); }
inline uint16x4_t vbic(const uint16x4_t & v0, const uint16x4_t & v1) { return vbic_u16(v0, v1); }
inline int16x4_t vbic(const int16x4_t & v0, const int16x4_t & v1) { return vbic_s16(v0, v1); }
inline uint32x2_t vbic(const uint32x2_t & v0, const uint32x2_t & v1) { return vbic_u32(v0, v1); }
inline int32x2_t vbic(const int32x2_t & v0, const int32x2_t & v1) { return vbic_s32(v0, v1); }
inline uint64x1_t vbic(const uint64x1_t & v0, const uint64x1_t & v1) { return vbic_u64(v0, v1); }
inline int64x1_t vbic(const int64x1_t & v0, const int64x1_t & v1) { return vbic_s64(v0, v1); }
////////////////////////////// vtransform ///////////////////////
template <typename Op>
void vtransform(Size2D size,
const typename Op::type * src0Base, ptrdiff_t src0Stride,
const typename Op::type * src1Base, ptrdiff_t src1Stride,
typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
{
typedef typename Op::type type;
typedef typename VecTraits<type>::vec128 vec128;
typedef typename VecTraits<type>::vec64 vec64;
if (src0Stride == src1Stride && src0Stride == dstStride &&
src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
{
size.width *= size.height;
size.height = 1;
}
const size_t step_base = 32 / sizeof(type);
size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
const size_t step_tail = 8 / sizeof(type);
size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
for (size_t y = 0; y < size.height; ++y)
{
const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
typename Op::type * dst = internal::getRowPtr(dstBase, dstStride, y);
size_t x = 0;
for( ; x < roiw_base; x += step_base )
{
internal::prefetch(src0 + x);
internal::prefetch(src1 + x);
vec128 v_src00 = vld1q(src0 + x), v_src01 = vld1q(src0 + x + 16 / sizeof(type));
vec128 v_src10 = vld1q(src1 + x), v_src11 = vld1q(src1 + x + 16 / sizeof(type));
vec128 v_dst;
op(v_src00, v_src10, v_dst);
vst1q(dst + x, v_dst);
op(v_src01, v_src11, v_dst);
vst1q(dst + x + 16 / sizeof(type), v_dst);
}
for( ; x < roiw_tail; x += step_tail )
{
vec64 v_src0 = vld1(src0 + x);
vec64 v_src1 = vld1(src1 + x);
vec64 v_dst;
op(v_src0, v_src1, v_dst);
vst1(dst + x, v_dst);
}
for (; x < size.width; ++x)
{
op(src0 + x, src1 + x, dst + x);
}
}
}
} }
#endif // CAROTENE_NEON
#endif

434
3rdparty/carotene/src/warp_affine.cpp vendored Normal file
View File

@ -0,0 +1,434 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
bool isWarpAffineNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isWarpAffineLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
int32x4_t v_m1_4 = vdupq_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
void warpAffineLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f);
float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 src_x_f = m[0] * x_ + yx;
f32 src_y_f = m[1] * x_ + yy;
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -0,0 +1,464 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "remap.hpp"
namespace CAROTENE_NS {
bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
bool isWarpPerspectiveLinearSupported(const Size2D &ssize)
{
#if SIZE_MAX > UINT32_MAX
return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation
// is performed with u32
isSupportedConfiguration();
#else
(void)ssize;
return isSupportedConfiguration();
#endif
}
void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16];
s32 * map = alignPtr(_map, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride);
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
float32x4_t v_m6 = vdupq_n_f32(m[6]);
float32x4_t v_m7 = vdupq_n_f32(m[7]);
float32x4_t v_m8 = vdupq_n_f32(m[8]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf)));
int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf)));
int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
src_x = std::max(0, std::min<s32>(ssize.width - 1, src_x));
src_y = std::max(0, std::min<s32>(ssize.height - 1, src_y));
map_row[x] = src_y * srcStride + src_x;
}
}
// make remap
remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
int32x4_t v_m1_4 = vdupq_n_s32(-1);
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)),
vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4)));
int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4);
vst1q_s32(map_row + x, v_src_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f);
map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) &&
(src_y >= 0) && (src_y < (s32)ssize.height) ? src_y * srcStride + src_x : -1;
}
}
// make remap
remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize,
const u8 * srcBase, ptrdiff_t srcStride,
const f32 * m,
u8 * dstBase, ptrdiff_t dstStride,
BORDER_MODE borderMode, u8 borderValue)
{
internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize));
#ifdef CAROTENE_NEON
using namespace internal;
s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16];
f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16];
s32 * map = alignPtr(_map, 16);
f32 * coeffs = alignPtr(_coeffs, 16);
int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1);
int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1);
float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f);
float32x4_t v_4 = vdupq_n_f32(4.0f);
float32x4_t v_m0 = vdupq_n_f32(m[0]);
float32x4_t v_m1 = vdupq_n_f32(m[1]);
float32x4_t v_m2 = vdupq_n_f32(m[2]);
float32x4_t v_m3 = vdupq_n_f32(m[3]);
float32x4_t v_m4 = vdupq_n_f32(m[4]);
float32x4_t v_m5 = vdupq_n_f32(m[5]);
float32x4_t v_m6 = vdupq_n_f32(m[6]);
float32x4_t v_m7 = vdupq_n_f32(m[7]);
float32x4_t v_m8 = vdupq_n_f32(m[8]);
if (borderMode == BORDER_MODE_REPLICATE)
{
int32x4_t v_zero4 = vdupq_n_s32(0);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x);
v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y);
int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x));
int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y));
int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x)));
int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y)));
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src0_x = (s32)floorf(src_x_f);
s32 src0_y = (s32)floorf(src_y_f);
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
s32 src1_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y + 1));
src0_y = std::max(0, std::min<s32>(ssize.height - 1, src0_y));
s32 src1_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x + 1));
src0_x = std::max(0, std::min<s32>(ssize.width - 1, src0_x));
map_row[(x << 2) + 0] = src0_y * srcStride + src0_x;
map_row[(x << 2) + 1] = src0_y * srcStride + src1_x;
map_row[(x << 2) + 2] = src1_y * srcStride + src0_x;
map_row[(x << 2) + 3] = src1_y * srcStride + src1_x;
}
}
remapLinearReplicate(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride);
}
}
}
else if (borderMode == BORDER_MODE_CONSTANT)
{
float32x4_t v_zero4 = vdupq_n_f32(0.0f);
int32x4_t v_m1_4 = vdupq_n_s32(-1);
for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE)
{
size_t blockHeight = std::min<size_t>(BLOCK_SIZE, dsize.height - i);
for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE)
{
size_t blockWidth = std::min<size_t>(BLOCK_SIZE, dsize.width - j);
// compute table
for (size_t y = 0; y < blockHeight; ++y)
{
s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y);
f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y);
size_t x = 0, y_ = y + i;
f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f };
float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_);
float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y),
v_yw = vmlaq_f32(v_m8, v_m5, v_y);
for ( ; x + 4 <= blockWidth; x += 4)
{
float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x);
float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x);
float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x));
v_src_xf = vmulq_f32(v_wf, v_src_xf);
v_src_yf = vmulq_f32(v_wf, v_src_yf);
int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf);
int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf);
float32x4x2_t v_coeff;
v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0));
v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0));
uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f);
uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f);
v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]);
v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]);
v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0);
v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0);
int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1);
int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1);
int32x4x4_t v_dst_index;
v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4);
v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4);
v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4);
v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4);
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4));
uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4));
uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4));
uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4));
v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4);
v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4);
v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4);
v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4);
vst2q_f32(coeff_row + (x << 1), v_coeff);
vst4q_s32(map_row + (x << 2), v_dst_index);
v_x = vaddq_f32(v_x, v_4);
}
f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8];
for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_)
{
f32 w_f = 1.0f / (m[2] * x_ + yw);
f32 src_x_f = (m[0] * x_ + yx) * w_f;
f32 src_y_f = (m[1] * x_ + yy) * w_f;
s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1;
s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1;
coeff_row[(x << 1) + 0] = src_x_f - src0_x;
coeff_row[(x << 1) + 1] = src_y_f - src0_y;
map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1;
map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1;
map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1;
map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) &&
(src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1;
}
}
remapLinearConst(Size2D(blockWidth, blockHeight),
srcBase, &map[0], &coeffs[0],
getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue);
}
}
}
#else
(void)ssize;
(void)dsize;
(void)srcBase;
(void)srcStride;
(void)m;
(void)dstBase;
(void)dstStride;
(void)borderMode;
(void)borderValue;
#endif
}
} // namespace CAROTENE_NS

View File

@ -5,9 +5,9 @@ if (WIN32 AND NOT ARM)
message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!")
endif()
set(tbb_ver "tbb43_20141204oss")
set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb43_20141204oss_src.tgz")
set(tbb_md5 "e903dd92d9433701f097fa7ca29a3c1f")
set(tbb_ver "tbb44_20160128oss")
set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb44_20160128oss_src_0.tgz")
set(tbb_md5 "9d8a4cdf43496f1b3f7c473a5248e5cc")
set(tbb_version_file "version_string.ver")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4702)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)

View File

@ -82,7 +82,7 @@ if(UNIX)
endif()
endif()
ocv_warnings_disable(CMAKE_C_FLAGS -Wshorten-64-to-32 -Wattributes -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations)
ocv_warnings_disable(CMAKE_C_FLAGS -Wshorten-64-to-32 -Wattributes -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wshift-negative-value)
set_target_properties(${ZLIB_LIBRARY} PROPERTIES
OUTPUT_NAME ${ZLIB_LIBRARY}

View File

@ -68,6 +68,10 @@ if(WINRT)
endif()
endif()
if(POLICY CMP0020)
cmake_policy(SET CMP0020 OLD)
endif()
if(POLICY CMP0022)
cmake_policy(SET CMP0022 OLD)
endif()
@ -77,13 +81,14 @@ if(POLICY CMP0026)
cmake_policy(SET CMP0026 OLD)
endif()
if (POLICY CMP0042)
# silence cmake 3.0+ warnings about MACOSX_RPATH
cmake_policy(SET CMP0042 OLD)
if(POLICY CMP0042)
cmake_policy(SET CMP0042 NEW)
endif()
include(cmake/OpenCVUtils.cmake)
# must go before the project command
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE)
ocv_update(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE)
if(DEFINED CMAKE_BUILD_TYPE)
set_property( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES} )
endif()
@ -96,7 +101,7 @@ if(MSVC)
set(CMAKE_USE_RELATIVE_PATHS ON CACHE INTERNAL "" FORCE)
endif()
include(cmake/OpenCVUtils.cmake)
ocv_cmake_eval(DEBUG_PRE ONCE)
ocv_clear_vars(OpenCVModules_TARGETS)
@ -164,6 +169,7 @@ endif()
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS)
OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE )
OCV_OPTION(WITH_CAROTENE "Use NVidia carotene acceleration library for ARM platform" ON IF (ARM OR AARCH64) AND NOT IOS AND NOT (CMAKE_VERSION VERSION_LESS "2.8.11"))
OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) )
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (NOT IOS AND NOT WINRT) )
@ -185,8 +191,8 @@ OCV_OPTION(WITH_OPENGL "Include OpenGL support" OFF
OCV_OPTION(WITH_OPENNI "Include OpenNI support" OFF IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_OPENNI2 "Include OpenNI2 support" OFF IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_PNG "Include PNG support" ON)
OCV_OPTION(WITH_PVAPI "Include Prosilica GigE support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_GIGEAPI "Include Smartek GigE support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_PVAPI "Include Prosilica GigE support" OFF IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_GIGEAPI "Include Smartek GigE support" OFF IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_QT "Build with Qt Backend support" OFF IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_WIN32UI "Build with Win32 UI Backend support" ON IF WIN32 AND NOT WINRT)
OCV_OPTION(WITH_QUICKTIME "Use QuickTime for Video I/O insted of QTKit" OFF IF APPLE )
@ -215,6 +221,7 @@ OCV_OPTION(WITH_VA "Include VA support" OFF
OCV_OPTION(WITH_VA_INTEL "Include Intel VA-API/OpenCL support" OFF IF (UNIX AND NOT ANDROID) )
OCV_OPTION(WITH_GDAL "Include GDAL Support" OFF IF (NOT ANDROID AND NOT IOS AND NOT WINRT) )
OCV_OPTION(WITH_GPHOTO2 "Include gPhoto2 library support" ON IF (UNIX AND NOT ANDROID) )
OCV_OPTION(WITH_LAPACK "Include Lapack library support" ON IF (UNIX AND NOT ANDROID) )
# OpenCV build components
# ===================================================
@ -253,7 +260,7 @@ OCV_OPTION(INSTALL_TESTS "Install accuracy and performance test binar
# OpenCV build options
# ===================================================
OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) )
OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS AND NOT CMAKE_CROSSCOMPILING) )
OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )
OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX )
OCV_OPTION(ENABLE_COVERAGE "Enable coverage collection with GCov" OFF IF CMAKE_COMPILER_IS_GNUCXX )
@ -297,50 +304,50 @@ include(cmake/OpenCVVersion.cmake)
# ----------------------------------------------------------------------------
# Save libs and executables in the same place
set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" )
set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications")
if (ANDROID)
if (ANDROID_ABI MATCHES "NEON")
if(ANDROID)
if(ANDROID_ABI MATCHES "NEON")
set(ENABLE_NEON ON)
endif()
if (ANDROID_ABI MATCHES "VFPV3")
if(ANDROID_ABI MATCHES "VFPV3")
set(ENABLE_VFPV3 ON)
endif()
endif()
if(ANDROID OR WIN32)
set(OPENCV_DOC_INSTALL_PATH doc)
ocv_update(OPENCV_DOC_INSTALL_PATH doc)
else()
set(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc)
ocv_update(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc)
endif()
if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
if(DEFINED OpenCV_RUNTIME AND DEFINED OpenCV_ARCH)
set(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/")
ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/")
else()
message(STATUS "Can't detect runtime and/or arch")
set(OpenCV_INSTALL_BINARIES_PREFIX "")
ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "")
endif()
elseif(ANDROID)
set(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/")
ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/")
else()
set(OpenCV_INSTALL_BINARIES_PREFIX "")
ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "")
endif()
if(ANDROID)
set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}")
ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}")
else()
set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples")
ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples")
endif()
if(ANDROID)
set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}")
ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}")
else()
set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin")
ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin")
endif()
if(NOT OPENCV_TEST_INSTALL_PATH)
set(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}")
ocv_update(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}")
endif()
if (OPENCV_TEST_DATA_PATH)
@ -349,66 +356,74 @@ endif()
if(OPENCV_TEST_DATA_PATH AND NOT OPENCV_TEST_DATA_INSTALL_PATH)
if(ANDROID)
set(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata")
ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata")
elseif(WIN32)
set(OPENCV_TEST_DATA_INSTALL_PATH "testdata")
ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "testdata")
else()
set(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata")
ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata")
endif()
endif()
if(ANDROID)
set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}")
set(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}")
set(OPENCV_LIB_INSTALL_PATH sdk/native/libs/${ANDROID_NDK_ABI_NAME})
set(OPENCV_3P_LIB_INSTALL_PATH sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME})
set(OPENCV_CONFIG_INSTALL_PATH sdk/native/jni)
set(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include)
set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native)
set(OPENCV_OTHER_INSTALL_PATH sdk/etc)
set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}")
ocv_update(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}")
ocv_update(OPENCV_LIB_INSTALL_PATH sdk/native/libs/${ANDROID_NDK_ABI_NAME})
ocv_update(OPENCV_3P_LIB_INSTALL_PATH sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME})
ocv_update(OPENCV_CONFIG_INSTALL_PATH sdk/native/jni)
ocv_update(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include)
ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native)
ocv_update(OPENCV_OTHER_INSTALL_PATH sdk/etc)
else()
set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib")
set(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}")
set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib")
ocv_update(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}")
if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
if(OpenCV_STATIC)
set(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
ocv_update(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
else()
set(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}")
ocv_update(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}")
endif()
set(OPENCV_3P_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native)
set(OPENCV_JAR_INSTALL_PATH java)
set(OPENCV_OTHER_INSTALL_PATH etc)
ocv_update(OPENCV_3P_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native)
ocv_update(OPENCV_JAR_INSTALL_PATH java)
ocv_update(OPENCV_OTHER_INSTALL_PATH etc)
ocv_update(OPENCV_CONFIG_INSTALL_PATH ".")
else()
set(OPENCV_LIB_INSTALL_PATH lib${LIB_SUFFIX})
set(OPENCV_3P_LIB_INSTALL_PATH share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH})
set(OPENCV_SAMPLES_SRC_INSTALL_PATH share/OpenCV/samples)
set(OPENCV_JAR_INSTALL_PATH share/OpenCV/java)
set(OPENCV_OTHER_INSTALL_PATH share/OpenCV)
endif()
set(OPENCV_INCLUDE_INSTALL_PATH "include")
ocv_update(OPENCV_LIB_INSTALL_PATH lib${LIB_SUFFIX})
ocv_update(OPENCV_3P_LIB_INSTALL_PATH share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH})
ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH share/OpenCV/samples)
ocv_update(OPENCV_JAR_INSTALL_PATH share/OpenCV/java)
ocv_update(OPENCV_OTHER_INSTALL_PATH share/OpenCV)
math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}")
if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX)
set(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv)
else()
set(OPENCV_CONFIG_INSTALL_PATH share/OpenCV)
if(NOT DEFINED OPENCV_CONFIG_INSTALL_PATH)
math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}")
if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX)
ocv_update(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv)
else()
ocv_update(OPENCV_CONFIG_INSTALL_PATH share/OpenCV)
endif()
endif()
endif()
ocv_update(OPENCV_INCLUDE_INSTALL_PATH "include")
endif()
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}")
ocv_update(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}")
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
if(INSTALL_TO_MANGLED_PATHS)
set(OPENCV_INCLUDE_INSTALL_PATH ${OPENCV_INCLUDE_INSTALL_PATH}/opencv-${OPENCV_VERSION})
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_3P_LIB_INSTALL_PATH "${OPENCV_3P_LIB_INSTALL_PATH}")
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_SAMPLES_SRC_INSTALL_PATH "${OPENCV_SAMPLES_SRC_INSTALL_PATH}")
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_CONFIG_INSTALL_PATH "${OPENCV_CONFIG_INSTALL_PATH}")
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_DOC_INSTALL_PATH "${OPENCV_DOC_INSTALL_PATH}")
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_JAR_INSTALL_PATH "${OPENCV_JAR_INSTALL_PATH}")
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_TEST_DATA_INSTALL_PATH "${OPENCV_TEST_DATA_INSTALL_PATH}")
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_OTHER_INSTALL_PATH "${OPENCV_OTHER_INSTALL_PATH}")
foreach(v
OPENCV_3P_LIB_INSTALL_PATH
OPENCV_SAMPLES_SRC_INSTALL_PATH
OPENCV_CONFIG_INSTALL_PATH
OPENCV_DOC_INSTALL_PATH
OPENCV_JAR_INSTALL_PATH
OPENCV_TEST_DATA_INSTALL_PATH
OPENCV_OTHER_INSTALL_PATH
)
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" ${v} "${${v}}")
string(REPLACE "opencv" "opencv-${OPENCV_VERSION}" ${v} "${${v}}")
endforeach()
endif()
@ -433,7 +448,7 @@ endif()
# ----------------------------------------------------------------------------
# Path for build/platform -specific headers
# ----------------------------------------------------------------------------
set(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h")
ocv_update(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h")
ocv_include_directories(${OPENCV_CONFIG_FILE_INCLUDE_DIR})
# ----------------------------------------------------------------------------
@ -446,7 +461,7 @@ set(OPENCV_EXTRA_MODULES_PATH "" CACHE PATH "Where to look for additional OpenCV
# ----------------------------------------------------------------------------
find_host_package(Git QUIET)
if(GIT_FOUND)
if(NOT DEFINED OPENCV_VCSVERSION AND GIT_FOUND)
execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*"
WORKING_DIRECTORY "${OpenCV_SOURCE_DIR}"
OUTPUT_VARIABLE OPENCV_VCSVERSION
@ -457,7 +472,7 @@ if(GIT_FOUND)
if(NOT GIT_RESULT EQUAL 0)
set(OPENCV_VCSVERSION "unknown")
endif()
else()
elseif(NOT DEFINED OPENCV_VCSVERSION)
# We don't have git:
set(OPENCV_VCSVERSION "unknown")
endif()
@ -596,30 +611,56 @@ endif()
include(cmake/OpenCVDetectVTK.cmake)
# -- Custom HAL replacement --
set(_includes "")
# assuming OPENCV_HAL_HEADERS and OPENCV_HAL_LIBS are lists of files:
# option example: -DOPENCV_HAL_HEADERS="<some-path>/header1.h;<some-path>/header2.h"
if (OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
foreach (h ${OPENCV_HAL_HEADERS})
get_filename_component(h "${h}" ABSOLUTE)
set(_includes "${_includes}\n#include \"${h}\"")
# ----------------------------------------------------------------------------
# OpenCV HAL
# ----------------------------------------------------------------------------
set(_hal_includes "")
macro(ocv_hal_register HAL_LIBRARIES_VAR HAL_HEADERS_VAR HAL_INCLUDE_DIRS_VAR)
# 1. libraries
foreach (l ${${HAL_LIBRARIES_VAR}})
if(NOT TARGET ${l})
get_filename_component(l "${l}" ABSOLUTE)
endif()
list(APPEND OPENCV_HAL_LINKER_LIBS ${l})
endforeach()
foreach (l ${OPENCV_HAL_LIBS})
get_filename_component(l "${l}" ABSOLUTE)
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${l})
# TODO: install?
# ocv_install_target(${l} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
# 2. headers
foreach (h ${${HAL_HEADERS_VAR}})
set(_hal_includes "${_hal_includes}\n#include \"${h}\"")
endforeach()
else()
set(_includes "// using default HAL")
unset(OPENCV_HAL_HEADERS CACHE)
unset(OPENCV_HAL_LIBS CACHE)
# 3. include paths
ocv_include_directories(${${HAL_INCLUDE_DIRS_VAR}})
endmacro()
if(NOT DEFINED OpenCV_HAL)
set(OpenCV_HAL "OpenCV_HAL")
endif()
set(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" CACHE STRING "Headers with custom HAL implementation")
set(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" CACHE STRING "Libraries with custom HAL implementation")
if(WITH_CAROTENE)
ocv_debug_message(STATUS "Enable carotene acceleration")
if(NOT ";${OpenCV_HAL};" MATCHES ";carotene;")
set(OpenCV_HAL "carotene;${OpenCV_HAL}")
endif()
endif()
foreach(hal ${OpenCV_HAL})
if(hal STREQUAL "carotene")
add_subdirectory(3rdparty/carotene/hal)
ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)
list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})")
else()
ocv_debug_message(STATUS "OpenCV HAL: ${hal} ...")
ocv_clear_vars(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS)
find_package(${hal} NO_MODULE QUIET)
if(${hal}_FOUND)
ocv_hal_register(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS)
list(APPEND OpenCV_USED_HAL "${hal} (ver ${${hal}_VERSION})")
endif()
endif()
endforeach()
configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY)
unset(_includes)
unset(_hal_includes)
# ----------------------------------------------------------------------------
# Add CUDA libraries (needed for apps/tools, samples)
@ -633,7 +674,7 @@ if(HAVE_CUDA)
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY})
endif()
foreach(p ${CUDA_LIBS_PATH})
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} -L${p})
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CMAKE_LIBRARY_PATH_FLAG}${p})
endforeach()
endif()
# ----------------------------------------------------------------------------
@ -691,7 +732,7 @@ include(cmake/OpenCVGenPkgconfig.cmake)
# Generate OpenCV.mk for ndk-build (Android build tool)
include(cmake/OpenCVGenAndroidMK.cmake)
# Generate OpenCVСonfig.cmake and OpenCVConfig-version.cmake for cmake projects
# Generate OpenCVConfig.cmake and OpenCVConfig-version.cmake for cmake projects
include(cmake/OpenCVGenConfig.cmake)
# Generate Info.plist for the IOS framework
@ -709,7 +750,7 @@ if(INSTALL_TESTS AND OPENCV_TEST_DATA_PATH)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_run_all_tests_android.sh.in"
"${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh" @ONLY)
install(PROGRAMS "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh"
DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT tests)
DESTINATION ./ COMPONENT tests)
elseif(WIN32)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_run_all_tests_windows.cmd.in"
"${CMAKE_BINARY_DIR}/win-install/opencv_run_all_tests.cmd" @ONLY)
@ -737,11 +778,11 @@ endif()
if(ANDROID OR NOT UNIX)
install(FILES ${OPENCV_LICENSE_FILE}
PERMISSIONS OWNER_READ GROUP_READ WORLD_READ
DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT libs)
DESTINATION ./ COMPONENT libs)
if(OPENCV_README_FILE)
install(FILES ${OPENCV_README_FILE}
PERMISSIONS OWNER_READ GROUP_READ WORLD_READ
DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT libs)
DESTINATION ./ COMPONENT libs)
endif()
endif()
@ -753,10 +794,46 @@ status("General configuration for OpenCV ${OPENCV_VERSION} =====================
if(OPENCV_VCSVERSION)
status(" Version control:" ${OPENCV_VCSVERSION})
endif()
if(OPENCV_EXTRA_MODULES_PATH AND NOT BUILD_INFO_SKIP_EXTRA_MODULES)
set(__dump_extra_header OFF)
foreach(p ${OPENCV_EXTRA_MODULES_PATH})
if(EXISTS ${p})
if(NOT __dump_extra_header)
set(__dump_extra_header ON)
status("")
status(" Extra modules:")
else()
status("")
endif()
set(EXTRA_MODULES_VCSVERSION "unknown")
if(GIT_FOUND)
execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*"
WORKING_DIRECTORY "${p}"
OUTPUT_VARIABLE EXTRA_MODULES_VCSVERSION
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(NOT GIT_RESULT EQUAL 0)
set(EXTRA_MODULES_VCSVERSION "unknown")
endif()
endif()
status(" Location (extra):" ${p})
status(" Version control (extra):" ${EXTRA_MODULES_VCSVERSION})
endif()
endforeach()
unset(__dump_extra_header)
endif()
# ========================== build platform ==========================
status("")
status(" Platform:")
if(NOT CMAKE_VERSION VERSION_LESS 2.8.11 AND NOT BUILD_INFO_SKIP_TIMESTAMP)
string(TIMESTAMP TIMESTAMP "" UTC)
if(TIMESTAMP)
status(" Timestamp:" ${TIMESTAMP})
endif()
endif()
status(" Host:" ${CMAKE_HOST_SYSTEM_NAME} ${CMAKE_HOST_SYSTEM_VERSION} ${CMAKE_HOST_SYSTEM_PROCESSOR})
if(CMAKE_CROSSCOMPILING)
status(" Target:" ${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_VERSION} ${CMAKE_SYSTEM_PROCESSOR})
@ -1119,10 +1196,14 @@ if(DEFINED WITH_VA_INTEL)
status(" Use Intel VA-API/OpenCL:" HAVE_VA_INTEL THEN "YES (MSDK: ${VA_INTEL_MSDK_ROOT} OpenCL: ${VA_INTEL_IOCL_ROOT})" ELSE NO)
endif(DEFINED WITH_VA_INTEL)
if(DEFINED WITH_LAPACK)
status(" Use Lapack:" HAVE_LAPACK THEN "YES" ELSE NO)
endif(DEFINED WITH_LAPACK)
status(" Use Eigen:" HAVE_EIGEN THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
status(" Use Cuda:" HAVE_CUDA THEN "YES (ver ${CUDA_VERSION_STRING})" ELSE NO)
status(" Use OpenCL:" HAVE_OPENCL THEN YES ELSE NO)
status(" Use custom HAL:" OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS THEN "YES (${OPENCV_HAL_HEADERS}; ${OPENCV_HAL_LIBS})" ELSE "NO")
status(" Use custom HAL:" OpenCV_USED_HAL THEN "YES (${OpenCV_USED_HAL})" ELSE "NO")
if(HAVE_CUDA)
status("")
@ -1138,14 +1219,13 @@ endif()
if(HAVE_OPENCL)
status("")
status(" OpenCL:")
if(HAVE_OPENCL_STATIC)
set(__opencl_ver "static")
set(__opencl_type "<Link with OpenCL library>")
else()
set(__opencl_ver "dynamic")
set(__opencl_type "<Dynamic loading of OpenCL library>")
endif()
status(" Version:" ${__opencl_ver})
if(OPENCL_INCLUDE_DIR)
status(" OpenCL:" ${__opencl_type})
if(OPENCL_INCLUDE_DIRS)
status(" Include path:" ${OPENCL_INCLUDE_DIRS})
endif()
if(OPENCL_LIBRARIES)
@ -1162,7 +1242,7 @@ if(HAVE_OPENCL)
list(APPEND __libs "${l}")
endif()
endforeach()
status(" libraries:" ${__libs})
status(" Link libraries:" ${__libs})
endif()
status(" Use AMDFFT:" HAVE_CLAMDFFT THEN YES ELSE NO)
status(" Use AMDBLAS:" HAVE_CLAMDBLAS THEN YES ELSE NO)
@ -1255,3 +1335,7 @@ endif()
# ----------------------------------------------------------------------------
include(cmake/OpenCVPackaging.cmake)
# This should be the last command
ocv_cmake_dump_vars("" TOFILE "CMakeVars.txt")
ocv_cmake_eval(DEBUG_POST ONCE)

View File

@ -7,12 +7,12 @@ copy or use the software.
For Open Source Computer Vision Library
(3-clause BSD License)
Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
Copyright (C) 2000-2016, Intel Corporation, all rights reserved.
Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
Copyright (C) 2009-2015, NVIDIA Corporation, all rights reserved.
Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
Copyright (C) 2015, OpenCV Foundation, all rights reserved.
Copyright (C) 2015, Itseez Inc., all rights reserved.
Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved.
Copyright (C) 2015-2016, Itseez Inc., all rights reserved.
Third party copyrights are property of their respective owners.
Redistribution and use in source and binary forms, with or without modification,

View File

@ -1,7 +1,5 @@
### OpenCV: Open Source Computer Vision Library
[![Gittip](http://img.shields.io/gittip/OpenCV.png)](https://www.gittip.com/OpenCV/)
#### Resources
* Homepage: <http://opencv.org>

View File

@ -4,3 +4,4 @@ link_libraries(${OPENCV_LINKER_LIBS})
add_subdirectory(traincascade)
add_subdirectory(createsamples)
add_subdirectory(annotation)
add_subdirectory(visualisation)

View File

@ -21,7 +21,6 @@ set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
INSTALL_NAME_DIR lib
OUTPUT_NAME "opencv_annotation")
if(ENABLE_SOLUTION_FOLDERS)

View File

@ -46,6 +46,9 @@ USAGE:
./opencv_annotation -images <folder location> -annotations <ouput file>
Created by: Puttemans Steven - February 2015
Adapted by: Puttemans Steven - April 2016 - Vectorize the process to enable better processing
+ early leave and store by pressing an ESC key
+ enable delete `d` button, to remove last annotation
*****************************************************************************************************/
#include <opencv2/core.hpp>
@ -68,16 +71,15 @@ using namespace cv;
// Function prototypes
void on_mouse(int, int, int, int, void*);
string int2string(int);
void get_annotations(Mat, stringstream*);
vector<Rect> get_annotations(Mat);
// Public parameters
Mat image;
int roi_x0 = 0, roi_y0 = 0, roi_x1 = 0, roi_y1 = 0, num_of_rec = 0;
bool start_draw = false;
bool start_draw = false, stop = false;
// Window name for visualisation purposes
const string window_name="OpenCV Based Annotation Tool";
const string window_name = "OpenCV Based Annotation Tool";
// FUNCTION : Mouse response for selecting objects in images
// If left button is clicked, start drawing a rectangle as long as mouse moves
@ -98,7 +100,8 @@ void on_mouse(int event, int x, int y, int , void * )
start_draw = false;
}
}
// Action when mouse is moving
// Action when mouse is moving and drawing is enabled
if((event == EVENT_MOUSEMOVE) && start_draw)
{
// Redraw bounding box for annotation
@ -109,42 +112,34 @@ void on_mouse(int event, int x, int y, int , void * )
}
}
// FUNCTION : snippet to convert an integer value to a string using a clean function
// instead of creating a stringstream each time inside the main code
string int2string(int num)
// FUNCTION : returns a vector of Rect objects given an image containing positive object instances
vector<Rect> get_annotations(Mat input_image)
{
stringstream temp_stream;
temp_stream << num;
return temp_stream.str();
}
vector<Rect> current_annotations;
// FUNCTION : given an image containing positive object instances, add all the object
// annotations to a known stringstream
void get_annotations(Mat input_image, stringstream* output_stream)
{
// Make it possible to exit the annotation
bool stop = false;
// Reset the num_of_rec element at each iteration
// Make sure the global image is set to the current image
num_of_rec = 0;
image = input_image;
// Make it possible to exit the annotation process
stop = false;
// Init window interface and couple mouse actions
namedWindow(window_name, WINDOW_AUTOSIZE);
setMouseCallback(window_name, on_mouse);
image = input_image;
imshow(window_name, image);
stringstream temp_stream;
int key_pressed = 0;
do
{
// Get a temporary image clone
Mat temp_image = input_image.clone();
Rect currentRect(0, 0, 0, 0);
// Keys for processing
// You need to select one for confirming a selection and one to continue to the next image
// Based on the universal ASCII code of the keystroke: http://www.asciitable.com/
// c = 99 add rectangle to current image
// n = 110 save added rectangles and show next image
// d = 100 delete the last annotation made
// <ESC> = 27 exit program
key_pressed = 0xFF & waitKey(0);
switch( key_pressed )
@ -152,32 +147,53 @@ void get_annotations(Mat input_image, stringstream* output_stream)
case 27:
destroyWindow(window_name);
stop = true;
break;
case 99:
// Add a rectangle to the list
num_of_rec++;
// Draw initiated from top left corner
if(roi_x0<roi_x1 && roi_y0<roi_y1)
{
temp_stream << " " << int2string(roi_x0) << " " << int2string(roi_y0) << " " << int2string(roi_x1-roi_x0) << " " << int2string(roi_y1-roi_y0);
currentRect.x = roi_x0;
currentRect.y = roi_y0;
currentRect.width = roi_x1-roi_x0;
currentRect.height = roi_y1-roi_y0;
}
// Draw initiated from bottom right corner
if(roi_x0>roi_x1 && roi_y0>roi_y1)
{
temp_stream << " " << int2string(roi_x1) << " " << int2string(roi_y1) << " " << int2string(roi_x0-roi_x1) << " " << int2string(roi_y0-roi_y1);
currentRect.x = roi_x1;
currentRect.y = roi_y1;
currentRect.width = roi_x0-roi_x1;
currentRect.height = roi_y0-roi_y1;
}
// Draw initiated from top right corner
if(roi_x0>roi_x1 && roi_y0<roi_y1)
{
temp_stream << " " << int2string(roi_x1) << " " << int2string(roi_y0) << " " << int2string(roi_x0-roi_x1) << " " << int2string(roi_y1-roi_y0);
currentRect.x = roi_x1;
currentRect.y = roi_y0;
currentRect.width = roi_x0-roi_x1;
currentRect.height = roi_y1-roi_y0;
}
// Draw initiated from bottom left corner
if(roi_x0<roi_x1 && roi_y0>roi_y1)
{
temp_stream << " " << int2string(roi_x0) << " " << int2string(roi_y1) << " " << int2string(roi_x1-roi_x0) << " " << int2string(roi_y0-roi_y1);
currentRect.x = roi_x0;
currentRect.y = roi_y1;
currentRect.width = roi_x1-roi_x0;
currentRect.height = roi_y0-roi_y1;
}
rectangle(input_image, Point(roi_x0,roi_y0), Point(roi_x1,roi_y1), Scalar(0,255,0), 1);
// Draw the rectangle on the canvas
// Add the rectangle to the vector of annotations
current_annotations.push_back(currentRect);
break;
case 100:
// Remove the last annotation
if(current_annotations.size() > 0){
current_annotations.pop_back();
}
break;
default:
// Default case --> do nothing at all
// Other keystrokes can simply be ignored
break;
}
@ -186,19 +202,24 @@ void get_annotations(Mat input_image, stringstream* output_stream)
{
break;
}
// Draw all the current rectangles onto the top image and make sure that the global image is linked
for(int i=0; i < (int)current_annotations.size(); i++){
rectangle(temp_image, current_annotations[i], Scalar(0,255,0), 1);
}
image = temp_image;
// Force an explicit redraw of the canvas --> necessary to visualize delete correctly
imshow(window_name, image);
}
// Continue as long as the next image key has not been pressed
while(key_pressed != 110);
// If there are annotations AND the next image key is pressed
// Write the image annotations to the file
if(num_of_rec>0 && key_pressed==110)
{
*output_stream << " " << num_of_rec << temp_stream.str() << endl;
}
// Close down the window
destroyWindow(window_name);
// Return the data
return current_annotations;
}
int main( int argc, const char** argv )
@ -208,13 +229,13 @@ int main( int argc, const char** argv )
cout << "Usage: " << argv[0] << endl;
cout << " -images <folder_location> [example - /data/testimages/]" << endl;
cout << " -annotations <ouput_file> [example - /data/annotations.txt]" << endl;
cout << "TIP: Use absolute paths to avoid any problems with the software!" << endl;
return -1;
}
// Read in the input arguments
string image_folder;
string annotations;
string annotations_file;
for(int i = 1; i < argc; ++i )
{
if( !strcmp( argv[i], "-images" ) )
@ -223,7 +244,7 @@ int main( int argc, const char** argv )
}
else if( !strcmp( argv[i], "-annotations" ) )
{
annotations = argv[++i];
annotations_file = argv[++i];
}
}
@ -248,14 +269,9 @@ int main( int argc, const char** argv )
}
#endif
// Create the outputfilestream
ofstream output(annotations.c_str());
if ( !output.is_open() ){
cerr << "The path for the output file contains an error and could not be opened. Please check again!" << endl;
return 0;
}
// Start by processing the data
// Return the image filenames inside the image folder
vector< vector<Rect> > annotations;
vector<String> filenames;
String folder(image_folder);
glob(folder, filenames);
@ -273,15 +289,33 @@ int main( int argc, const char** argv )
continue;
}
// Perform annotations & generate corresponding output
stringstream output_stream;
get_annotations(current_image, &output_stream);
// Perform annotations & store the result inside the vectorized structure
vector<Rect> current_annotations = get_annotations(current_image);
annotations.push_back(current_annotations);
// Store the annotations, write to the output file
if (output_stream.str() != ""){
output << filenames[i] << output_stream.str();
// Check if the ESC key was hit, then exit earlier then expected
if(stop){
break;
}
}
// When all data is processed, store the data gathered inside the proper file
// This now even gets called when the ESC button was hit to store preliminary results
ofstream output(annotations_file.c_str());
if ( !output.is_open() ){
cerr << "The path for the output file contains an error and could not be opened. Please check again!" << endl;
return 0;
}
// Store the annotations, write to the output file
for(int i = 0; i < (int)annotations.size(); i++){
output << filenames[i] << " " << annotations[i].size();
for(int j=0; j < (int)annotations[i].size(); j++){
Rect temp = annotations[i][j];
output << " " << temp.x << " " << temp.y << " " << temp.width << " " << temp.height;
}
output << endl;
}
return 0;
}

View File

@ -23,7 +23,6 @@ set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
INSTALL_NAME_DIR lib
OUTPUT_NAME "opencv_createsamples")
if(ENABLE_SOLUTION_FOLDERS)

View File

@ -0,0 +1,48 @@
set(OPENCV_INTERACTIVECALIBRATION_DEPS opencv_core opencv_aruco opencv_highgui opencv_calib3d opencv_videoio)
ocv_check_dependencies(${OPENCV_INTERACTIVECALIBRATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
find_package(LAPACK)
if(LAPACK_FOUND)
find_file(LAPACK_HEADER "lapacke.h")
if(LAPACK_HEADER)
add_definitions(-DUSE_LAPACK)
link_libraries(${LAPACK_LIBRARIES})
endif()
endif()
project(interactive-calibration)
set(the_target opencv_interactive-calibration)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS})
file(GLOB SRCS *.cpp)
file(GLOB HDRS *.h*)
set(interactive-calibration_files ${SRCS} ${HDRS})
ocv_add_executable(${the_target} ${interactive-calibration_files})
ocv_target_link_libraries(${the_target} ${OPENCV_INTERACTIVECALIBRATION_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
INSTALL_NAME_DIR lib
OUTPUT_NAME "opencv_interactive-calibration")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} OPTIONAL RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()

View File

@ -0,0 +1,118 @@
#ifndef CALIB_COMMON_HPP
#define CALIB_COMMON_HPP
#include <memory>
#include <opencv2/core.hpp>
#include <vector>
#include <string>
namespace calib
{
#define OVERLAY_DELAY 1000
#define IMAGE_MAX_WIDTH 1280
#define IMAGE_MAX_HEIGHT 960
bool showOverlayMessage(const std::string& message);
enum InputType { Video, Pictures };
enum InputVideoSource { Camera, File };
enum TemplateType { AcirclesGrid, Chessboard, chAruco, DoubleAcirclesGrid };
static const std::string mainWindowName = "Calibration";
static const std::string gridWindowName = "Board locations";
static const std::string consoleHelp = "Hot keys:\nesc - exit application\n"
"s - save current data to .xml file\n"
"r - delete last frame\n"
"u - enable/disable applying undistortion"
"d - delete all frames\n"
"v - switch visualization";
static const double sigmaMult = 1.96;
struct calibrationData
{
cv::Mat cameraMatrix;
cv::Mat distCoeffs;
cv::Mat stdDeviations;
cv::Mat perViewErrors;
std::vector<cv::Mat> rvecs;
std::vector<cv::Mat> tvecs;
double totalAvgErr;
cv::Size imageSize;
std::vector<std::vector<cv::Point2f> > imagePoints;
std::vector< std::vector<cv::Point3f> > objectPoints;
std::vector<cv::Mat> allCharucoCorners;
std::vector<cv::Mat> allCharucoIds;
cv::Mat undistMap1, undistMap2;
calibrationData()
{
imageSize = cv::Size(IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT);
}
};
struct cameraParameters
{
cv::Mat cameraMatrix;
cv::Mat distCoeffs;
cv::Mat stdDeviations;
double avgError;
cameraParameters(){}
cameraParameters(cv::Mat& _cameraMatrix, cv::Mat& _distCoeffs, cv::Mat& _stdDeviations, double _avgError = 0) :
cameraMatrix(_cameraMatrix), distCoeffs(_distCoeffs), stdDeviations(_stdDeviations), avgError(_avgError)
{}
};
struct captureParameters
{
InputType captureMethod;
InputVideoSource source;
TemplateType board;
cv::Size boardSize;
int charucoDictName;
int calibrationStep;
float charucoSquareLenght, charucoMarkerSize;
float captureDelay;
float squareSize;
float templDst;
std::string videoFileName;
bool flipVertical;
int camID;
int fps;
cv::Size cameraResolution;
int maxFramesNum;
int minFramesNum;
captureParameters()
{
calibrationStep = 1;
captureDelay = 500.f;
maxFramesNum = 30;
minFramesNum = 10;
fps = 30;
cameraResolution = cv::Size(IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT);
}
};
struct internalParameters
{
double solverEps;
int solverMaxIters;
bool fastSolving;
double filterAlpha;
internalParameters()
{
solverEps = 1e-7;
solverMaxIters = 30;
fastSolving = false;
filterAlpha = 0.1;
}
};
}
#endif

View File

@ -0,0 +1,327 @@
#include "calibController.hpp"
#include <algorithm>
#include <cmath>
#include <ctime>
#include <opencv2/calib3d.hpp>
#include <opencv2/imgproc.hpp>
double calib::calibController::estimateCoverageQuality()
{
int gridSize = 10;
int xGridStep = mCalibData->imageSize.width / gridSize;
int yGridStep = mCalibData->imageSize.height / gridSize;
std::vector<int> pointsInCell(gridSize*gridSize);
std::fill(pointsInCell.begin(), pointsInCell.end(), 0);
for(std::vector<std::vector<cv::Point2f> >::iterator it = mCalibData->imagePoints.begin(); it != mCalibData->imagePoints.end(); ++it)
for(std::vector<cv::Point2f>::iterator pointIt = (*it).begin(); pointIt != (*it).end(); ++pointIt) {
int i = (int)((*pointIt).x / xGridStep);
int j = (int)((*pointIt).y / yGridStep);
pointsInCell[i*gridSize + j]++;
}
for(std::vector<cv::Mat>::iterator it = mCalibData->allCharucoCorners.begin(); it != mCalibData->allCharucoCorners.end(); ++it)
for(int l = 0; l < (*it).size[0]; l++) {
int i = (int)((*it).at<float>(l, 0) / xGridStep);
int j = (int)((*it).at<float>(l, 1) / yGridStep);
pointsInCell[i*gridSize + j]++;
}
cv::Mat mean, stdDev;
cv::meanStdDev(pointsInCell, mean, stdDev);
return mean.at<double>(0) / (stdDev.at<double>(0) + 1e-7);
}
calib::calibController::calibController()
{
mCalibFlags = 0;
}
calib::calibController::calibController(cv::Ptr<calib::calibrationData> data, int initialFlags, bool autoTuning, int minFramesNum) :
mCalibData(data)
{
mCalibFlags = initialFlags;
mNeedTuning = autoTuning;
mMinFramesNum = minFramesNum;
mConfIntervalsState = false;
mCoverageQualityState = false;
}
void calib::calibController::updateState()
{
if(mCalibData->cameraMatrix.total()) {
const double relErrEps = 0.05;
bool fConfState = false, cConfState = false, dConfState = true;
if(sigmaMult*mCalibData->stdDeviations.at<double>(0) / mCalibData->cameraMatrix.at<double>(0,0) < relErrEps &&
sigmaMult*mCalibData->stdDeviations.at<double>(1) / mCalibData->cameraMatrix.at<double>(1,1) < relErrEps)
fConfState = true;
if(sigmaMult*mCalibData->stdDeviations.at<double>(2) / mCalibData->cameraMatrix.at<double>(0,2) < relErrEps &&
sigmaMult*mCalibData->stdDeviations.at<double>(3) / mCalibData->cameraMatrix.at<double>(1,2) < relErrEps)
cConfState = true;
for(int i = 0; i < 5; i++)
if(mCalibData->stdDeviations.at<double>(4+i) / fabs(mCalibData->distCoeffs.at<double>(i)) > 1)
dConfState = false;
mConfIntervalsState = fConfState && cConfState && dConfState;
}
if(getFramesNumberState())
mCoverageQualityState = estimateCoverageQuality() > 1.8 ? true : false;
if (getFramesNumberState() && mNeedTuning) {
if( !(mCalibFlags & cv::CALIB_FIX_ASPECT_RATIO) &&
mCalibData->cameraMatrix.total()) {
double fDiff = fabs(mCalibData->cameraMatrix.at<double>(0,0) -
mCalibData->cameraMatrix.at<double>(1,1));
if (fDiff < 3*mCalibData->stdDeviations.at<double>(0) &&
fDiff < 3*mCalibData->stdDeviations.at<double>(1)) {
mCalibFlags |= cv::CALIB_FIX_ASPECT_RATIO;
mCalibData->cameraMatrix.at<double>(0,0) =
mCalibData->cameraMatrix.at<double>(1,1);
}
}
if(!(mCalibFlags & cv::CALIB_ZERO_TANGENT_DIST)) {
const double eps = 0.005;
if(fabs(mCalibData->distCoeffs.at<double>(2)) < eps &&
fabs(mCalibData->distCoeffs.at<double>(3)) < eps)
mCalibFlags |= cv::CALIB_ZERO_TANGENT_DIST;
}
if(!(mCalibFlags & cv::CALIB_FIX_K1)) {
const double eps = 0.005;
if(fabs(mCalibData->distCoeffs.at<double>(0)) < eps)
mCalibFlags |= cv::CALIB_FIX_K1;
}
if(!(mCalibFlags & cv::CALIB_FIX_K2)) {
const double eps = 0.005;
if(fabs(mCalibData->distCoeffs.at<double>(1)) < eps)
mCalibFlags |= cv::CALIB_FIX_K2;
}
if(!(mCalibFlags & cv::CALIB_FIX_K3)) {
const double eps = 0.005;
if(fabs(mCalibData->distCoeffs.at<double>(4)) < eps)
mCalibFlags |= cv::CALIB_FIX_K3;
}
}
}
bool calib::calibController::getCommonCalibrationState() const
{
int rating = (int)getFramesNumberState() + (int)getConfidenceIntrervalsState() +
(int)getRMSState() + (int)mCoverageQualityState;
return rating == 4;
}
bool calib::calibController::getFramesNumberState() const
{
return std::max(mCalibData->imagePoints.size(), mCalibData->allCharucoCorners.size()) > mMinFramesNum;
}
bool calib::calibController::getConfidenceIntrervalsState() const
{
return mConfIntervalsState;
}
bool calib::calibController::getRMSState() const
{
return mCalibData->totalAvgErr < 0.5;
}
int calib::calibController::getNewFlags() const
{
return mCalibFlags;
}
//////////////////// calibDataController
double calib::calibDataController::estimateGridSubsetQuality(size_t excludedIndex)
{
{
int gridSize = 10;
int xGridStep = mCalibData->imageSize.width / gridSize;
int yGridStep = mCalibData->imageSize.height / gridSize;
std::vector<int> pointsInCell(gridSize*gridSize);
std::fill(pointsInCell.begin(), pointsInCell.end(), 0);
for(size_t k = 0; k < mCalibData->imagePoints.size(); k++)
if(k != excludedIndex)
for(std::vector<cv::Point2f>::iterator pointIt = mCalibData->imagePoints[k].begin(); pointIt != mCalibData->imagePoints[k].end(); ++pointIt) {
int i = (int)((*pointIt).x / xGridStep);
int j = (int)((*pointIt).y / yGridStep);
pointsInCell[i*gridSize + j]++;
}
for(size_t k = 0; k < mCalibData->allCharucoCorners.size(); k++)
if(k != excludedIndex)
for(int l = 0; l < mCalibData->allCharucoCorners[k].size[0]; l++) {
int i = (int)(mCalibData->allCharucoCorners[k].at<float>(l, 0) / xGridStep);
int j = (int)(mCalibData->allCharucoCorners[k].at<float>(l, 1) / yGridStep);
pointsInCell[i*gridSize + j]++;
}
cv::Mat mean, stdDev;
cv::meanStdDev(pointsInCell, mean, stdDev);
return mean.at<double>(0) / (stdDev.at<double>(0) + 1e-7);
}
}
calib::calibDataController::calibDataController(cv::Ptr<calib::calibrationData> data, int maxFrames, double convParameter) :
mCalibData(data), mParamsFileName("CamParams.xml")
{
mMaxFramesNum = maxFrames;
mAlpha = convParameter;
}
calib::calibDataController::calibDataController()
{
}
void calib::calibDataController::filterFrames()
{
size_t numberOfFrames = std::max(mCalibData->allCharucoIds.size(), mCalibData->imagePoints.size());
CV_Assert(numberOfFrames == mCalibData->perViewErrors.total());
if(numberOfFrames >= mMaxFramesNum) {
double worstValue = -HUGE_VAL, maxQuality = estimateGridSubsetQuality(numberOfFrames);
size_t worstElemIndex = 0;
for(size_t i = 0; i < numberOfFrames; i++) {
double gridQDelta = estimateGridSubsetQuality(i) - maxQuality;
double currentValue = mCalibData->perViewErrors.at<double>((int)i)*mAlpha + gridQDelta*(1. - mAlpha);
if(currentValue > worstValue) {
worstValue = currentValue;
worstElemIndex = i;
}
}
showOverlayMessage(cv::format("Frame %d is worst", worstElemIndex + 1));
if(mCalibData->imagePoints.size()) {
mCalibData->imagePoints.erase(mCalibData->imagePoints.begin() + worstElemIndex);
mCalibData->objectPoints.erase(mCalibData->objectPoints.begin() + worstElemIndex);
}
else {
mCalibData->allCharucoCorners.erase(mCalibData->allCharucoCorners.begin() + worstElemIndex);
mCalibData->allCharucoIds.erase(mCalibData->allCharucoIds.begin() + worstElemIndex);
}
cv::Mat newErrorsVec = cv::Mat((int)numberOfFrames - 1, 1, CV_64F);
std::copy(mCalibData->perViewErrors.ptr<double>(0),
mCalibData->perViewErrors.ptr<double>((int)worstElemIndex), newErrorsVec.ptr<double>(0));
std::copy(mCalibData->perViewErrors.ptr<double>((int)worstElemIndex + 1), mCalibData->perViewErrors.ptr<double>((int)numberOfFrames),
newErrorsVec.ptr<double>((int)worstElemIndex));
mCalibData->perViewErrors = newErrorsVec;
}
}
void calib::calibDataController::setParametersFileName(const std::string &name)
{
mParamsFileName = name;
}
void calib::calibDataController::deleteLastFrame()
{
if( !mCalibData->imagePoints.empty()) {
mCalibData->imagePoints.pop_back();
mCalibData->objectPoints.pop_back();
}
if (!mCalibData->allCharucoCorners.empty()) {
mCalibData->allCharucoCorners.pop_back();
mCalibData->allCharucoIds.pop_back();
}
if(!mParamsStack.empty()) {
mCalibData->cameraMatrix = (mParamsStack.top()).cameraMatrix;
mCalibData->distCoeffs = (mParamsStack.top()).distCoeffs;
mCalibData->stdDeviations = (mParamsStack.top()).stdDeviations;
mCalibData->totalAvgErr = (mParamsStack.top()).avgError;
mParamsStack.pop();
}
}
void calib::calibDataController::rememberCurrentParameters()
{
cv::Mat oldCameraMat, oldDistcoeefs, oldStdDevs;
mCalibData->cameraMatrix.copyTo(oldCameraMat);
mCalibData->distCoeffs.copyTo(oldDistcoeefs);
mCalibData->stdDeviations.copyTo(oldStdDevs);
mParamsStack.push(cameraParameters(oldCameraMat, oldDistcoeefs, oldStdDevs, mCalibData->totalAvgErr));
}
void calib::calibDataController::deleteAllData()
{
mCalibData->imagePoints.clear();
mCalibData->objectPoints.clear();
mCalibData->allCharucoCorners.clear();
mCalibData->allCharucoIds.clear();
mCalibData->cameraMatrix = mCalibData->distCoeffs = cv::Mat();
mParamsStack = std::stack<cameraParameters>();
rememberCurrentParameters();
}
bool calib::calibDataController::saveCurrentCameraParameters() const
{
bool success = false;
if(mCalibData->cameraMatrix.total()) {
cv::FileStorage parametersWriter(mParamsFileName, cv::FileStorage::WRITE);
if(parametersWriter.isOpened()) {
time_t rawtime;
time(&rawtime);
char buf[256];
strftime(buf, sizeof(buf)-1, "%c", localtime(&rawtime));
parametersWriter << "calibrationDate" << buf;
parametersWriter << "framesCount" << std::max((int)mCalibData->objectPoints.size(), (int)mCalibData->allCharucoCorners.size());
parametersWriter << "cameraResolution" << mCalibData->imageSize;
parametersWriter << "cameraMatrix" << mCalibData->cameraMatrix;
parametersWriter << "cameraMatrix_std_dev" << mCalibData->stdDeviations.rowRange(cv::Range(0, 4));
parametersWriter << "dist_coeffs" << mCalibData->distCoeffs;
parametersWriter << "dist_coeffs_std_dev" << mCalibData->stdDeviations.rowRange(cv::Range(4, 9));
parametersWriter << "avg_reprojection_error" << mCalibData->totalAvgErr;
parametersWriter.release();
success = true;
}
}
return success;
}
void calib::calibDataController::printParametersToConsole(std::ostream &output) const
{
const char* border = "---------------------------------------------------";
output << border << std::endl;
output << "Frames used for calibration: " << std::max(mCalibData->objectPoints.size(), mCalibData->allCharucoCorners.size())
<< " \t RMS = " << mCalibData->totalAvgErr << std::endl;
if(mCalibData->cameraMatrix.at<double>(0,0) == mCalibData->cameraMatrix.at<double>(1,1))
output << "F = " << mCalibData->cameraMatrix.at<double>(1,1) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(1) << std::endl;
else
output << "Fx = " << mCalibData->cameraMatrix.at<double>(0,0) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(0) << " \t "
<< "Fy = " << mCalibData->cameraMatrix.at<double>(1,1) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(1) << std::endl;
output << "Cx = " << mCalibData->cameraMatrix.at<double>(0,2) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(2) << " \t"
<< "Cy = " << mCalibData->cameraMatrix.at<double>(1,2) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(3) << std::endl;
output << "K1 = " << mCalibData->distCoeffs.at<double>(0) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(4) << std::endl;
output << "K2 = " << mCalibData->distCoeffs.at<double>(1) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(5) << std::endl;
output << "K3 = " << mCalibData->distCoeffs.at<double>(4) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(8) << std::endl;
output << "TD1 = " << mCalibData->distCoeffs.at<double>(2) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(6) << std::endl;
output << "TD2 = " << mCalibData->distCoeffs.at<double>(3) << " +- " << sigmaMult*mCalibData->stdDeviations.at<double>(7) << std::endl;
}
void calib::calibDataController::updateUndistortMap()
{
cv::initUndistortRectifyMap(mCalibData->cameraMatrix, mCalibData->distCoeffs, cv::noArray(),
cv::getOptimalNewCameraMatrix(mCalibData->cameraMatrix, mCalibData->distCoeffs, mCalibData->imageSize, 0.0, mCalibData->imageSize),
mCalibData->imageSize, CV_16SC2, mCalibData->undistMap1, mCalibData->undistMap2);
}

View File

@ -0,0 +1,64 @@
#ifndef CALIB_CONTROLLER_HPP
#define CALIB_CONTROLLER_HPP
#include "calibCommon.hpp"
#include <stack>
#include <string>
#include <ostream>
namespace calib {
class calibController
{
protected:
cv::Ptr<calibrationData> mCalibData;
int mCalibFlags;
unsigned mMinFramesNum;
bool mNeedTuning;
bool mConfIntervalsState;
bool mCoverageQualityState;
double estimateCoverageQuality();
public:
calibController();
calibController(cv::Ptr<calibrationData> data, int initialFlags, bool autoTuning,
int minFramesNum);
void updateState();
bool getCommonCalibrationState() const;
bool getFramesNumberState() const;
bool getConfidenceIntrervalsState() const;
bool getRMSState() const;
bool getPointsCoverageState() const;
int getNewFlags() const;
};
class calibDataController
{
protected:
cv::Ptr<calibrationData> mCalibData;
std::stack<cameraParameters> mParamsStack;
std::string mParamsFileName;
unsigned mMaxFramesNum;
double mAlpha;
double estimateGridSubsetQuality(size_t excludedIndex);
public:
calibDataController(cv::Ptr<calibrationData> data, int maxFrames, double convParameter);
calibDataController();
void filterFrames();
void setParametersFileName(const std::string& name);
void deleteLastFrame();
void rememberCurrentParameters();
void deleteAllData();
bool saveCurrentCameraParameters() const;
void printParametersToConsole(std::ostream &output) const;
void updateUndistortMap();
};
}
#endif

View File

@ -0,0 +1,91 @@
#include "calibPipeline.hpp"
#include <opencv2/highgui.hpp>
#include <stdexcept>
using namespace calib;
#define CAP_DELAY 10
cv::Size CalibPipeline::getCameraResolution()
{
mCapture.set(cv::CAP_PROP_FRAME_WIDTH, 10000);
mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, 10000);
int w = (int)mCapture.get(cv::CAP_PROP_FRAME_WIDTH);
int h = (int)mCapture.get(cv::CAP_PROP_FRAME_HEIGHT);
return cv::Size(w,h);
}
CalibPipeline::CalibPipeline(captureParameters params) :
mCaptureParams(params)
{
}
PipelineExitStatus CalibPipeline::start(std::vector<cv::Ptr<FrameProcessor> > processors)
{
if(mCaptureParams.source == Camera && !mCapture.isOpened())
{
mCapture.open(mCaptureParams.camID);
cv::Size maxRes = getCameraResolution();
cv::Size neededRes = mCaptureParams.cameraResolution;
if(maxRes.width < neededRes.width) {
double aR = (double)maxRes.width / maxRes.height;
mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.width/aR);
}
else if(maxRes.height < neededRes.height) {
double aR = (double)maxRes.width / maxRes.height;
mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.height*aR);
}
else {
mCapture.set(cv::CAP_PROP_FRAME_HEIGHT, neededRes.height);
mCapture.set(cv::CAP_PROP_FRAME_WIDTH, neededRes.width);
}
mCapture.set(cv::CAP_PROP_AUTOFOCUS, 0);
}
else if (mCaptureParams.source == File && !mCapture.isOpened())
mCapture.open(mCaptureParams.videoFileName);
mImageSize = cv::Size((int)mCapture.get(cv::CAP_PROP_FRAME_WIDTH), (int)mCapture.get(cv::CAP_PROP_FRAME_HEIGHT));
if(!mCapture.isOpened())
throw std::runtime_error("Unable to open video source");
cv::Mat frame, processedFrame;
while(mCapture.grab()) {
mCapture.retrieve(frame);
if(mCaptureParams.flipVertical)
cv::flip(frame, frame, -1);
frame.copyTo(processedFrame);
for (std::vector<cv::Ptr<FrameProcessor> >::iterator it = processors.begin(); it != processors.end(); ++it)
processedFrame = (*it)->processFrame(processedFrame);
cv::imshow(mainWindowName, processedFrame);
int key = cv::waitKey(CAP_DELAY);
if(key == 27) // esc
return Finished;
else if (key == 114) // r
return DeleteLastFrame;
else if (key == 100) // d
return DeleteAllFrames;
else if (key == 115) // s
return SaveCurrentData;
else if (key == 117) // u
return SwitchUndistort;
else if (key == 118) // v
return SwitchVisualisation;
for (std::vector<cv::Ptr<FrameProcessor> >::iterator it = processors.begin(); it != processors.end(); ++it)
if((*it)->isProcessed())
return Calibrate;
}
return Finished;
}
cv::Size CalibPipeline::getImageSize() const
{
return mImageSize;
}

View File

@ -0,0 +1,39 @@
#ifndef CALIB_PIPELINE_HPP
#define CALIB_PIPELINE_HPP
#include <vector>
#include <opencv2/highgui.hpp>
#include "calibCommon.hpp"
#include "frameProcessor.hpp"
namespace calib
{
enum PipelineExitStatus { Finished,
DeleteLastFrame,
Calibrate,
DeleteAllFrames,
SaveCurrentData,
SwitchUndistort,
SwitchVisualisation
};
class CalibPipeline
{
protected:
captureParameters mCaptureParams;
cv::Size mImageSize;
cv::VideoCapture mCapture;
cv::Size getCameraResolution();
public:
CalibPipeline(captureParameters params);
PipelineExitStatus start(std::vector<cv::Ptr<FrameProcessor> > processors);
cv::Size getImageSize() const;
};
}
#endif

View File

@ -0,0 +1,824 @@
#include <opencv2/calib3d.hpp>
#include "linalg.hpp"
#include "cvCalibrationFork.hpp"
using namespace cv;
static void subMatrix(const cv::Mat& src, cv::Mat& dst, const std::vector<uchar>& cols,
const std::vector<uchar>& rows);
static const char* cvDistCoeffErr = "Distortion coefficients must be 1x4, 4x1, 1x5, 5x1, 1x8, 8x1, 1x12, 12x1, 1x14 or 14x1 floating-point vector";
static void cvEvaluateJtJ2(CvMat* _JtJ,
const CvMat* camera_matrix,
const CvMat* distortion_coeffs,
const CvMat* object_points,
const CvMat* param,
const CvMat* npoints,
int flags, int NINTRINSIC, double aspectRatio)
{
int i, pos, ni, total = 0, npstep = 0, maxPoints = 0;
npstep = npoints->rows == 1 ? 1 : npoints->step/CV_ELEM_SIZE(npoints->type);
int nimages = npoints->rows*npoints->cols;
for( i = 0; i < nimages; i++ )
{
ni = npoints->data.i[i*npstep];
if( ni < 4 )
{
CV_Error_( CV_StsOutOfRange, ("The number of points in the view #%d is < 4", i));
}
maxPoints = MAX( maxPoints, ni );
total += ni;
}
Mat _Ji( maxPoints*2, NINTRINSIC, CV_64FC1, Scalar(0));
Mat _Je( maxPoints*2, 6, CV_64FC1 );
Mat _err( maxPoints*2, 1, CV_64FC1 );
Mat _m( 1, total, CV_64FC2 );
const Mat matM = cvarrToMat(object_points);
cvZero(_JtJ);
for(i = 0, pos = 0; i < nimages; i++, pos += ni )
{
CvMat _ri, _ti;
ni = npoints->data.i[i*npstep];
cvGetRows( param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 );
cvGetRows( param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 );
CvMat _Mi(matM.colRange(pos, pos + ni));
CvMat _mi(_m.colRange(pos, pos + ni));
_Je.resize(ni*2); _Ji.resize(ni*2); _err.resize(ni*2);
CvMat _dpdr(_Je.colRange(0, 3));
CvMat _dpdt(_Je.colRange(3, 6));
CvMat _dpdf(_Ji.colRange(0, 2));
CvMat _dpdc(_Ji.colRange(2, 4));
CvMat _dpdk(_Ji.colRange(4, NINTRINSIC));
CvMat _mp(_err.reshape(2, 1));
cvProjectPoints2( &_Mi, &_ri, &_ti, camera_matrix, distortion_coeffs, &_mp, &_dpdr, &_dpdt,
(flags & CALIB_FIX_FOCAL_LENGTH) ? 0 : &_dpdf,
(flags & CALIB_FIX_PRINCIPAL_POINT) ? 0 : &_dpdc, &_dpdk,
(flags & CALIB_FIX_ASPECT_RATIO) ? aspectRatio : 0);
cvSub( &_mp, &_mi, &_mp );
Mat JtJ(cvarrToMat(_JtJ));
// see HZ: (A6.14) for details on the structure of the Jacobian
JtJ(Rect(0, 0, NINTRINSIC, NINTRINSIC)) += _Ji.t() * _Ji;
JtJ(Rect(NINTRINSIC + i * 6, NINTRINSIC + i * 6, 6, 6)) = _Je.t() * _Je;
JtJ(Rect(NINTRINSIC + i * 6, 0, 6, NINTRINSIC)) = _Ji.t() * _Je;
}
}
double cvfork::cvCalibrateCamera2( const CvMat* objectPoints,
const CvMat* imagePoints, const CvMat* npoints,
CvSize imageSize, CvMat* cameraMatrix, CvMat* distCoeffs,
CvMat* rvecs, CvMat* tvecs, CvMat* stdDevs, CvMat* perViewErrors, int flags, CvTermCriteria termCrit )
{
{
const int NINTRINSIC = CV_CALIB_NINTRINSIC;
double reprojErr = 0;
Matx33d A;
double k[14] = {0};
CvMat matA = cvMat(3, 3, CV_64F, A.val), _k;
int i, nimages, maxPoints = 0, ni = 0, pos, total = 0, nparams, npstep, cn;
double aspectRatio = 0.;
// 0. check the parameters & allocate buffers
if( !CV_IS_MAT(objectPoints) || !CV_IS_MAT(imagePoints) ||
!CV_IS_MAT(npoints) || !CV_IS_MAT(cameraMatrix) || !CV_IS_MAT(distCoeffs) )
CV_Error( CV_StsBadArg, "One of required vector arguments is not a valid matrix" );
if( imageSize.width <= 0 || imageSize.height <= 0 )
CV_Error( CV_StsOutOfRange, "image width and height must be positive" );
if( CV_MAT_TYPE(npoints->type) != CV_32SC1 ||
(npoints->rows != 1 && npoints->cols != 1) )
CV_Error( CV_StsUnsupportedFormat,
"the array of point counters must be 1-dimensional integer vector" );
if(flags & CV_CALIB_TILTED_MODEL)
{
//when the tilted sensor model is used the distortion coefficients matrix must have 14 parameters
if (distCoeffs->cols*distCoeffs->rows != 14)
CV_Error( CV_StsBadArg, "The tilted sensor model must have 14 parameters in the distortion matrix" );
}
else
{
//when the thin prism model is used the distortion coefficients matrix must have 12 parameters
if(flags & CV_CALIB_THIN_PRISM_MODEL)
if (distCoeffs->cols*distCoeffs->rows != 12)
CV_Error( CV_StsBadArg, "Thin prism model must have 12 parameters in the distortion matrix" );
}
nimages = npoints->rows*npoints->cols;
npstep = npoints->rows == 1 ? 1 : npoints->step/CV_ELEM_SIZE(npoints->type);
if( rvecs )
{
cn = CV_MAT_CN(rvecs->type);
if( !CV_IS_MAT(rvecs) ||
(CV_MAT_DEPTH(rvecs->type) != CV_32F && CV_MAT_DEPTH(rvecs->type) != CV_64F) ||
((rvecs->rows != nimages || (rvecs->cols*cn != 3 && rvecs->cols*cn != 9)) &&
(rvecs->rows != 1 || rvecs->cols != nimages || cn != 3)) )
CV_Error( CV_StsBadArg, "the output array of rotation vectors must be 3-channel "
"1xn or nx1 array or 1-channel nx3 or nx9 array, where n is the number of views" );
}
if( tvecs )
{
cn = CV_MAT_CN(tvecs->type);
if( !CV_IS_MAT(tvecs) ||
(CV_MAT_DEPTH(tvecs->type) != CV_32F && CV_MAT_DEPTH(tvecs->type) != CV_64F) ||
((tvecs->rows != nimages || tvecs->cols*cn != 3) &&
(tvecs->rows != 1 || tvecs->cols != nimages || cn != 3)) )
CV_Error( CV_StsBadArg, "the output array of translation vectors must be 3-channel "
"1xn or nx1 array or 1-channel nx3 array, where n is the number of views" );
}
if( stdDevs )
{
cn = CV_MAT_CN(stdDevs->type);
if( !CV_IS_MAT(stdDevs) ||
(CV_MAT_DEPTH(stdDevs->type) != CV_32F && CV_MAT_DEPTH(stdDevs->type) != CV_64F) ||
((stdDevs->rows != (nimages*6 + NINTRINSIC) || stdDevs->cols*cn != 1) &&
(stdDevs->rows != 1 || stdDevs->cols != (nimages*6 + NINTRINSIC) || cn != 1)) )
CV_Error( CV_StsBadArg, "the output array of standard deviations vectors must be 1-channel "
"1x(n*6 + NINTRINSIC) or (n*6 + NINTRINSIC)x1 array, where n is the number of views" );
}
if( (CV_MAT_TYPE(cameraMatrix->type) != CV_32FC1 &&
CV_MAT_TYPE(cameraMatrix->type) != CV_64FC1) ||
cameraMatrix->rows != 3 || cameraMatrix->cols != 3 )
CV_Error( CV_StsBadArg,
"Intrinsic parameters must be 3x3 floating-point matrix" );
if( (CV_MAT_TYPE(distCoeffs->type) != CV_32FC1 &&
CV_MAT_TYPE(distCoeffs->type) != CV_64FC1) ||
(distCoeffs->cols != 1 && distCoeffs->rows != 1) ||
(distCoeffs->cols*distCoeffs->rows != 4 &&
distCoeffs->cols*distCoeffs->rows != 5 &&
distCoeffs->cols*distCoeffs->rows != 8 &&
distCoeffs->cols*distCoeffs->rows != 12 &&
distCoeffs->cols*distCoeffs->rows != 14) )
CV_Error( CV_StsBadArg, cvDistCoeffErr );
for( i = 0; i < nimages; i++ )
{
ni = npoints->data.i[i*npstep];
if( ni < 4 )
{
CV_Error_( CV_StsOutOfRange, ("The number of points in the view #%d is < 4", i));
}
maxPoints = MAX( maxPoints, ni );
total += ni;
}
Mat matM( 1, total, CV_64FC3 );
Mat _m( 1, total, CV_64FC2 );
if(CV_MAT_CN(objectPoints->type) == 3) {
cvarrToMat(objectPoints).convertTo(matM, CV_64F);
} else {
convertPointsHomogeneous(cvarrToMat(objectPoints), matM);
}
if(CV_MAT_CN(imagePoints->type) == 2) {
cvarrToMat(imagePoints).convertTo(_m, CV_64F);
} else {
convertPointsHomogeneous(cvarrToMat(imagePoints), _m);
}
nparams = NINTRINSIC + nimages*6;
Mat _Ji( maxPoints*2, NINTRINSIC, CV_64FC1, Scalar(0));
Mat _Je( maxPoints*2, 6, CV_64FC1 );
Mat _err( maxPoints*2, 1, CV_64FC1 );
_k = cvMat( distCoeffs->rows, distCoeffs->cols, CV_MAKETYPE(CV_64F,CV_MAT_CN(distCoeffs->type)), k);
if( distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) < 8 )
{
if( distCoeffs->rows*distCoeffs->cols*CV_MAT_CN(distCoeffs->type) < 5 )
flags |= CALIB_FIX_K3;
flags |= CALIB_FIX_K4 | CALIB_FIX_K5 | CALIB_FIX_K6;
}
const double minValidAspectRatio = 0.01;
const double maxValidAspectRatio = 100.0;
// 1. initialize intrinsic parameters & LM solver
if( flags & CALIB_USE_INTRINSIC_GUESS )
{
cvConvert( cameraMatrix, &matA );
if( A(0, 0) <= 0 || A(1, 1) <= 0 )
CV_Error( CV_StsOutOfRange, "Focal length (fx and fy) must be positive" );
if( A(0, 2) < 0 || A(0, 2) >= imageSize.width ||
A(1, 2) < 0 || A(1, 2) >= imageSize.height )
CV_Error( CV_StsOutOfRange, "Principal point must be within the image" );
if( fabs(A(0, 1)) > 1e-5 )
CV_Error( CV_StsOutOfRange, "Non-zero skew is not supported by the function" );
if( fabs(A(1, 0)) > 1e-5 || fabs(A(2, 0)) > 1e-5 ||
fabs(A(2, 1)) > 1e-5 || fabs(A(2,2)-1) > 1e-5 )
CV_Error( CV_StsOutOfRange,
"The intrinsic matrix must have [fx 0 cx; 0 fy cy; 0 0 1] shape" );
A(0, 1) = A(1, 0) = A(2, 0) = A(2, 1) = 0.;
A(2, 2) = 1.;
if( flags & CALIB_FIX_ASPECT_RATIO )
{
aspectRatio = A(0, 0)/A(1, 1);
if( aspectRatio < minValidAspectRatio || aspectRatio > maxValidAspectRatio )
CV_Error( CV_StsOutOfRange,
"The specified aspect ratio (= cameraMatrix[0][0] / cameraMatrix[1][1]) is incorrect" );
}
cvConvert( distCoeffs, &_k );
}
else
{
Scalar mean, sdv;
meanStdDev(matM, mean, sdv);
if( fabs(mean[2]) > 1e-5 || fabs(sdv[2]) > 1e-5 )
CV_Error( CV_StsBadArg,
"For non-planar calibration rigs the initial intrinsic matrix must be specified" );
for( i = 0; i < total; i++ )
matM.at<Point3d>(i).z = 0.;
if( flags & CALIB_FIX_ASPECT_RATIO )
{
aspectRatio = cvmGet(cameraMatrix,0,0);
aspectRatio /= cvmGet(cameraMatrix,1,1);
if( aspectRatio < minValidAspectRatio || aspectRatio > maxValidAspectRatio )
CV_Error( CV_StsOutOfRange,
"The specified aspect ratio (= cameraMatrix[0][0] / cameraMatrix[1][1]) is incorrect" );
}
CvMat _matM(matM), m(_m);
cvInitIntrinsicParams2D( &_matM, &m, npoints, imageSize, &matA, aspectRatio );
}
//CvLevMarq solver( nparams, 0, termCrit );
cvfork::CvLevMarqFork solver( nparams, 0, termCrit );
Mat allErrors(1, total, CV_64FC2);
if(flags & CALIB_USE_LU) {
solver.solveMethod = DECOMP_LU;
}
else if(flags & CALIB_USE_QR)
solver.solveMethod = DECOMP_QR;
{
double* param = solver.param->data.db;
uchar* mask = solver.mask->data.ptr;
param[0] = A(0, 0); param[1] = A(1, 1); param[2] = A(0, 2); param[3] = A(1, 2);
std::copy(k, k + 14, param + 4);
if( flags & CV_CALIB_FIX_FOCAL_LENGTH )
mask[0] = mask[1] = 0;
if( flags & CV_CALIB_FIX_PRINCIPAL_POINT )
mask[2] = mask[3] = 0;
if( flags & CV_CALIB_ZERO_TANGENT_DIST )
{
param[6] = param[7] = 0;
mask[6] = mask[7] = 0;
}
if( !(flags & CALIB_RATIONAL_MODEL) )
flags |= CALIB_FIX_K4 + CALIB_FIX_K5 + CALIB_FIX_K6;
if( !(flags & CV_CALIB_THIN_PRISM_MODEL))
flags |= CALIB_FIX_S1_S2_S3_S4;
if( !(flags & CV_CALIB_TILTED_MODEL))
flags |= CALIB_FIX_TAUX_TAUY;
mask[ 4] = !(flags & CALIB_FIX_K1);
mask[ 5] = !(flags & CALIB_FIX_K2);
mask[ 8] = !(flags & CALIB_FIX_K3);
mask[ 9] = !(flags & CALIB_FIX_K4);
mask[10] = !(flags & CALIB_FIX_K5);
mask[11] = !(flags & CALIB_FIX_K6);
if(flags & CALIB_FIX_S1_S2_S3_S4)
{
mask[12] = 0;
mask[13] = 0;
mask[14] = 0;
mask[15] = 0;
}
if(flags & CALIB_FIX_TAUX_TAUY)
{
mask[16] = 0;
mask[17] = 0;
}
}
// 2. initialize extrinsic parameters
for( i = 0, pos = 0; i < nimages; i++, pos += ni )
{
CvMat _ri, _ti;
ni = npoints->data.i[i*npstep];
cvGetRows( solver.param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 );
cvGetRows( solver.param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 );
CvMat _Mi(matM.colRange(pos, pos + ni));
CvMat _mi(_m.colRange(pos, pos + ni));
cvFindExtrinsicCameraParams2( &_Mi, &_mi, &matA, &_k, &_ri, &_ti );
}
// 3. run the optimization
for(;;)
{
const CvMat* _param = 0;
CvMat *_JtJ = 0, *_JtErr = 0;
double* _errNorm = 0;
bool proceed = solver.updateAlt( _param, _JtJ, _JtErr, _errNorm );
double *param = solver.param->data.db, *pparam = solver.prevParam->data.db;
if( flags & CALIB_FIX_ASPECT_RATIO )
{
param[0] = param[1]*aspectRatio;
pparam[0] = pparam[1]*aspectRatio;
}
A(0, 0) = param[0]; A(1, 1) = param[1]; A(0, 2) = param[2]; A(1, 2) = param[3];
std::copy(param + 4, param + 4 + 14, k);
if( !proceed ) {
//do errors estimation
if(stdDevs) {
Ptr<CvMat> JtJ(cvCreateMat(nparams, nparams, CV_64F));
CvMat cvMatM(matM);
cvEvaluateJtJ2(JtJ, &matA, &_k, &cvMatM, solver.param, npoints, flags, NINTRINSIC, aspectRatio);
Mat mask = cvarrToMat(solver.mask);
int nparams_nz = countNonZero(mask);
Mat JtJinv, JtJN;
JtJN.create(nparams_nz, nparams_nz, CV_64F);
subMatrix(cvarrToMat(JtJ), JtJN, mask, mask);
completeSymm(JtJN, false);
#ifndef USE_LAPACK
cv::invert(JtJN, JtJinv, DECOMP_SVD);
#else
cvfork::invert(JtJN, JtJinv, DECOMP_SVD);
#endif
double sigma2 = norm(allErrors, NORM_L2SQR) / (total - nparams_nz);
Mat stdDevsM = cvarrToMat(stdDevs);
int j = 0;
for (int s = 0; s < nparams; s++)
if(mask.data[s]) {
stdDevsM.at<double>(s) = std::sqrt(JtJinv.at<double>(j,j)*sigma2);
j++;
}
else
stdDevsM.at<double>(s) = 0;
}
break;
}
reprojErr = 0;
for( i = 0, pos = 0; i < nimages; i++, pos += ni )
{
CvMat _ri, _ti;
ni = npoints->data.i[i*npstep];
cvGetRows( solver.param, &_ri, NINTRINSIC + i*6, NINTRINSIC + i*6 + 3 );
cvGetRows( solver.param, &_ti, NINTRINSIC + i*6 + 3, NINTRINSIC + i*6 + 6 );
CvMat _Mi(matM.colRange(pos, pos + ni));
CvMat _mi(_m.colRange(pos, pos + ni));
CvMat _me(allErrors.colRange(pos, pos + ni));
_Je.resize(ni*2); _Ji.resize(ni*2); _err.resize(ni*2);
CvMat _dpdr(_Je.colRange(0, 3));
CvMat _dpdt(_Je.colRange(3, 6));
CvMat _dpdf(_Ji.colRange(0, 2));
CvMat _dpdc(_Ji.colRange(2, 4));
CvMat _dpdk(_Ji.colRange(4, NINTRINSIC));
CvMat _mp(_err.reshape(2, 1));
if( solver.state == CvLevMarq::CALC_J )
{
cvProjectPoints2( &_Mi, &_ri, &_ti, &matA, &_k, &_mp, &_dpdr, &_dpdt,
(flags & CALIB_FIX_FOCAL_LENGTH) ? 0 : &_dpdf,
(flags & CALIB_FIX_PRINCIPAL_POINT) ? 0 : &_dpdc, &_dpdk,
(flags & CALIB_FIX_ASPECT_RATIO) ? aspectRatio : 0);
}
else
cvProjectPoints2( &_Mi, &_ri, &_ti, &matA, &_k, &_mp );
cvSub( &_mp, &_mi, &_mp );
if( solver.state == CvLevMarq::CALC_J )
{
Mat JtJ(cvarrToMat(_JtJ)), JtErr(cvarrToMat(_JtErr));
// see HZ: (A6.14) for details on the structure of the Jacobian
JtJ(Rect(0, 0, NINTRINSIC, NINTRINSIC)) += _Ji.t() * _Ji;
JtJ(Rect(NINTRINSIC + i * 6, NINTRINSIC + i * 6, 6, 6)) = _Je.t() * _Je;
JtJ(Rect(NINTRINSIC + i * 6, 0, 6, NINTRINSIC)) = _Ji.t() * _Je;
JtErr.rowRange(0, NINTRINSIC) += _Ji.t() * _err;
JtErr.rowRange(NINTRINSIC + i * 6, NINTRINSIC + (i + 1) * 6) = _Je.t() * _err;
}
if (stdDevs || perViewErrors)
cvCopy(&_mp, &_me);
reprojErr += norm(_err, NORM_L2SQR);
}
if( _errNorm )
*_errNorm = reprojErr;
}
// 4. store the results
cvConvert( &matA, cameraMatrix );
cvConvert( &_k, distCoeffs );
for( i = 0, pos = 0; i < nimages; i++)
{
CvMat src, dst;
if( perViewErrors )
{
ni = npoints->data.i[i*npstep];
perViewErrors->data.db[i] = std::sqrt(cv::norm(allErrors.colRange(pos, pos + ni), NORM_L2SQR) / ni);
pos+=ni;
}
if( rvecs )
{
src = cvMat( 3, 1, CV_64F, solver.param->data.db + NINTRINSIC + i*6 );
if( rvecs->rows == nimages && rvecs->cols*CV_MAT_CN(rvecs->type) == 9 )
{
dst = cvMat( 3, 3, CV_MAT_DEPTH(rvecs->type),
rvecs->data.ptr + rvecs->step*i );
cvRodrigues2( &src, &matA );
cvConvert( &matA, &dst );
}
else
{
dst = cvMat( 3, 1, CV_MAT_DEPTH(rvecs->type), rvecs->rows == 1 ?
rvecs->data.ptr + i*CV_ELEM_SIZE(rvecs->type) :
rvecs->data.ptr + rvecs->step*i );
cvConvert( &src, &dst );
}
}
if( tvecs )
{
src = cvMat( 3, 1, CV_64F, solver.param->data.db + NINTRINSIC + i*6 + 3 );
dst = cvMat( 3, 1, CV_MAT_DEPTH(tvecs->type), tvecs->rows == 1 ?
tvecs->data.ptr + i*CV_ELEM_SIZE(tvecs->type) :
tvecs->data.ptr + tvecs->step*i );
cvConvert( &src, &dst );
}
}
return std::sqrt(reprojErr/total);
}
}
static Mat prepareCameraMatrix(Mat& cameraMatrix0, int rtype)
{
Mat cameraMatrix = Mat::eye(3, 3, rtype);
if( cameraMatrix0.size() == cameraMatrix.size() )
cameraMatrix0.convertTo(cameraMatrix, rtype);
return cameraMatrix;
}
static Mat prepareDistCoeffs(Mat& distCoeffs0, int rtype)
{
Mat distCoeffs = Mat::zeros(distCoeffs0.cols == 1 ? Size(1, 14) : Size(14, 1), rtype);
if( distCoeffs0.size() == Size(1, 4) ||
distCoeffs0.size() == Size(1, 5) ||
distCoeffs0.size() == Size(1, 8) ||
distCoeffs0.size() == Size(1, 12) ||
distCoeffs0.size() == Size(1, 14) ||
distCoeffs0.size() == Size(4, 1) ||
distCoeffs0.size() == Size(5, 1) ||
distCoeffs0.size() == Size(8, 1) ||
distCoeffs0.size() == Size(12, 1) ||
distCoeffs0.size() == Size(14, 1) )
{
Mat dstCoeffs(distCoeffs, Rect(0, 0, distCoeffs0.cols, distCoeffs0.rows));
distCoeffs0.convertTo(dstCoeffs, rtype);
}
return distCoeffs;
}
static void collectCalibrationData( InputArrayOfArrays objectPoints,
InputArrayOfArrays imagePoints1,
InputArrayOfArrays imagePoints2,
Mat& objPtMat, Mat& imgPtMat1, Mat* imgPtMat2,
Mat& npoints )
{
int nimages = (int)objectPoints.total();
int i, j = 0, ni = 0, total = 0;
CV_Assert(nimages > 0 && nimages == (int)imagePoints1.total() &&
(!imgPtMat2 || nimages == (int)imagePoints2.total()));
for( i = 0; i < nimages; i++ )
{
ni = objectPoints.getMat(i).checkVector(3, CV_32F);
if( ni <= 0 )
CV_Error(CV_StsUnsupportedFormat, "objectPoints should contain vector of vectors of points of type Point3f");
int ni1 = imagePoints1.getMat(i).checkVector(2, CV_32F);
if( ni1 <= 0 )
CV_Error(CV_StsUnsupportedFormat, "imagePoints1 should contain vector of vectors of points of type Point2f");
CV_Assert( ni == ni1 );
total += ni;
}
npoints.create(1, (int)nimages, CV_32S);
objPtMat.create(1, (int)total, CV_32FC3);
imgPtMat1.create(1, (int)total, CV_32FC2);
Point2f* imgPtData2 = 0;
if( imgPtMat2 )
{
imgPtMat2->create(1, (int)total, CV_32FC2);
imgPtData2 = imgPtMat2->ptr<Point2f>();
}
Point3f* objPtData = objPtMat.ptr<Point3f>();
Point2f* imgPtData1 = imgPtMat1.ptr<Point2f>();
for( i = 0; i < nimages; i++, j += ni )
{
Mat objpt = objectPoints.getMat(i);
Mat imgpt1 = imagePoints1.getMat(i);
ni = objpt.checkVector(3, CV_32F);
npoints.at<int>(i) = ni;
memcpy( objPtData + j, objpt.ptr(), ni*sizeof(objPtData[0]) );
memcpy( imgPtData1 + j, imgpt1.ptr(), ni*sizeof(imgPtData1[0]) );
if( imgPtData2 )
{
Mat imgpt2 = imagePoints2.getMat(i);
int ni2 = imgpt2.checkVector(2, CV_32F);
CV_Assert( ni == ni2 );
memcpy( imgPtData2 + j, imgpt2.ptr(), ni*sizeof(imgPtData2[0]) );
}
}
}
double cvfork::calibrateCamera(InputArrayOfArrays _objectPoints,
InputArrayOfArrays _imagePoints,
Size imageSize, InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs, OutputArray _stdDeviations, OutputArray _perViewErrors, int flags, TermCriteria criteria )
{
int rtype = CV_64F;
Mat cameraMatrix = _cameraMatrix.getMat();
cameraMatrix = prepareCameraMatrix(cameraMatrix, rtype);
Mat distCoeffs = _distCoeffs.getMat();
distCoeffs = prepareDistCoeffs(distCoeffs, rtype);
if( !(flags & CALIB_RATIONAL_MODEL) &&
(!(flags & CALIB_THIN_PRISM_MODEL)) &&
(!(flags & CALIB_TILTED_MODEL)))
distCoeffs = distCoeffs.rows == 1 ? distCoeffs.colRange(0, 5) : distCoeffs.rowRange(0, 5);
int nimages = int(_objectPoints.total());
CV_Assert( nimages > 0 );
Mat objPt, imgPt, npoints, rvecM, tvecM, stdDeviationsM, errorsM;
bool rvecs_needed = _rvecs.needed(), tvecs_needed = _tvecs.needed(),
stddev_needed = _stdDeviations.needed(), errors_needed = _perViewErrors.needed();
bool rvecs_mat_vec = _rvecs.isMatVector();
bool tvecs_mat_vec = _tvecs.isMatVector();
if( rvecs_needed ) {
_rvecs.create(nimages, 1, CV_64FC3);
if(rvecs_mat_vec)
rvecM.create(nimages, 3, CV_64F);
else
rvecM = _rvecs.getMat();
}
if( tvecs_needed ) {
_tvecs.create(nimages, 1, CV_64FC3);
if(tvecs_mat_vec)
tvecM.create(nimages, 3, CV_64F);
else
tvecM = _tvecs.getMat();
}
if( stddev_needed ) {
_stdDeviations.create(nimages*6 + CV_CALIB_NINTRINSIC, 1, CV_64F);
stdDeviationsM = _stdDeviations.getMat();
}
if( errors_needed) {
_perViewErrors.create(nimages, 1, CV_64F);
errorsM = _perViewErrors.getMat();
}
collectCalibrationData( _objectPoints, _imagePoints, noArray(),
objPt, imgPt, 0, npoints );
CvMat c_objPt = objPt, c_imgPt = imgPt, c_npoints = npoints;
CvMat c_cameraMatrix = cameraMatrix, c_distCoeffs = distCoeffs;
CvMat c_rvecM = rvecM, c_tvecM = tvecM, c_stdDev = stdDeviationsM, c_errors = errorsM;
double reprojErr = cvfork::cvCalibrateCamera2(&c_objPt, &c_imgPt, &c_npoints, imageSize,
&c_cameraMatrix, &c_distCoeffs,
rvecs_needed ? &c_rvecM : NULL,
tvecs_needed ? &c_tvecM : NULL,
stddev_needed ? &c_stdDev : NULL,
errors_needed ? &c_errors : NULL, flags, criteria );
// overly complicated and inefficient rvec/ tvec handling to support vector<Mat>
for(int i = 0; i < nimages; i++ )
{
if( rvecs_needed && rvecs_mat_vec)
{
_rvecs.create(3, 1, CV_64F, i, true);
Mat rv = _rvecs.getMat(i);
memcpy(rv.ptr(), rvecM.ptr(i), 3*sizeof(double));
}
if( tvecs_needed && tvecs_mat_vec)
{
_tvecs.create(3, 1, CV_64F, i, true);
Mat tv = _tvecs.getMat(i);
memcpy(tv.ptr(), tvecM.ptr(i), 3*sizeof(double));
}
}
cameraMatrix.copyTo(_cameraMatrix);
distCoeffs.copyTo(_distCoeffs);
return reprojErr;
}
double cvfork::calibrateCameraCharuco(InputArrayOfArrays _charucoCorners, InputArrayOfArrays _charucoIds,
Ptr<aruco::CharucoBoard> &_board, Size imageSize,
InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs, OutputArray _stdDeviations, OutputArray _perViewErrors,
int flags, TermCriteria criteria) {
CV_Assert(_charucoIds.total() > 0 && (_charucoIds.total() == _charucoCorners.total()));
// Join object points of charuco corners in a single vector for calibrateCamera() function
std::vector< std::vector< Point3f > > allObjPoints;
allObjPoints.resize(_charucoIds.total());
for(unsigned int i = 0; i < _charucoIds.total(); i++) {
unsigned int nCorners = (unsigned int)_charucoIds.getMat(i).total();
CV_Assert(nCorners > 0 && nCorners == _charucoCorners.getMat(i).total()); //actually nCorners must be > 3 for calibration
allObjPoints[i].reserve(nCorners);
for(unsigned int j = 0; j < nCorners; j++) {
int pointId = _charucoIds.getMat(i).ptr< int >(0)[j];
CV_Assert(pointId >= 0 && pointId < (int)_board->chessboardCorners.size());
allObjPoints[i].push_back(_board->chessboardCorners[pointId]);
}
}
return cvfork::calibrateCamera(allObjPoints, _charucoCorners, imageSize, _cameraMatrix, _distCoeffs,
_rvecs, _tvecs, _stdDeviations, _perViewErrors, flags, criteria);
}
static void subMatrix(const cv::Mat& src, cv::Mat& dst, const std::vector<uchar>& cols,
const std::vector<uchar>& rows) {
int nonzeros_cols = cv::countNonZero(cols);
cv::Mat tmp(src.rows, nonzeros_cols, CV_64FC1);
for (int i = 0, j = 0; i < (int)cols.size(); i++)
{
if (cols[i])
{
src.col(i).copyTo(tmp.col(j++));
}
}
int nonzeros_rows = cv::countNonZero(rows);
dst.create(nonzeros_rows, nonzeros_cols, CV_64FC1);
for (int i = 0, j = 0; i < (int)rows.size(); i++)
{
if (rows[i])
{
tmp.row(i).copyTo(dst.row(j++));
}
}
}
void cvfork::CvLevMarqFork::step()
{
using namespace cv;
const double LOG10 = log(10.);
double lambda = exp(lambdaLg10*LOG10);
int nparams = param->rows;
Mat _JtJ = cvarrToMat(JtJ);
Mat _mask = cvarrToMat(mask);
int nparams_nz = countNonZero(_mask);
if(!JtJN || JtJN->rows != nparams_nz) {
// prevent re-allocation in every step
JtJN.reset(cvCreateMat( nparams_nz, nparams_nz, CV_64F ));
JtJV.reset(cvCreateMat( nparams_nz, 1, CV_64F ));
JtJW.reset(cvCreateMat( nparams_nz, 1, CV_64F ));
}
Mat _JtJN = cvarrToMat(JtJN);
Mat _JtErr = cvarrToMat(JtJV);
Mat_<double> nonzero_param = cvarrToMat(JtJW);
subMatrix(cvarrToMat(JtErr), _JtErr, std::vector<uchar>(1, 1), _mask);
subMatrix(_JtJ, _JtJN, _mask, _mask);
if( !err )
completeSymm( _JtJN, completeSymmFlag );
#if 1
_JtJN.diag() *= 1. + lambda;
#else
_JtJN.diag() += lambda;
#endif
#ifndef USE_LAPACK
cv::solve(_JtJN, _JtErr, nonzero_param, solveMethod);
#else
cvfork::solve(_JtJN, _JtErr, nonzero_param, solveMethod);
#endif
int j = 0;
for( int i = 0; i < nparams; i++ )
param->data.db[i] = prevParam->data.db[i] - (mask->data.ptr[i] ? nonzero_param(j++) : 0);
}
cvfork::CvLevMarqFork::CvLevMarqFork(int nparams, int nerrs, CvTermCriteria criteria0, bool _completeSymmFlag)
{
init(nparams, nerrs, criteria0, _completeSymmFlag);
}
cvfork::CvLevMarqFork::~CvLevMarqFork()
{
clear();
}
bool cvfork::CvLevMarqFork::updateAlt( const CvMat*& _param, CvMat*& _JtJ, CvMat*& _JtErr, double*& _errNorm )
{
CV_Assert( !err );
if( state == DONE )
{
_param = param;
return false;
}
if( state == STARTED )
{
_param = param;
cvZero( JtJ );
cvZero( JtErr );
errNorm = 0;
_JtJ = JtJ;
_JtErr = JtErr;
_errNorm = &errNorm;
state = CALC_J;
return true;
}
if( state == CALC_J )
{
cvCopy( param, prevParam );
step();
_param = param;
prevErrNorm = errNorm;
errNorm = 0;
_errNorm = &errNorm;
state = CHECK_ERR;
return true;
}
assert( state == CHECK_ERR );
if( errNorm > prevErrNorm )
{
if( ++lambdaLg10 <= 16 )
{
step();
_param = param;
errNorm = 0;
_errNorm = &errNorm;
state = CHECK_ERR;
return true;
}
}
lambdaLg10 = MAX(lambdaLg10-1, -16);
if( ++iters >= criteria.max_iter ||
cvNorm(param, prevParam, CV_RELATIVE_L2) < criteria.epsilon )
{
//printf("iters %i\n", iters);
_param = param;
state = DONE;
return false;
}
prevErrNorm = errNorm;
cvZero( JtJ );
cvZero( JtErr );
_param = param;
_JtJ = JtJ;
_JtErr = JtErr;
state = CALC_J;
return true;
}

View File

@ -0,0 +1,56 @@
#ifndef CV_CALIBRATION_FORK_HPP
#define CV_CALIBRATION_FORK_HPP
#include <opencv2/core.hpp>
#include <opencv2/aruco/charuco.hpp>
#include <opencv2/calib3d.hpp>
#include <opencv2/calib3d/calib3d_c.h>
namespace cvfork
{
using namespace cv;
#define CV_CALIB_NINTRINSIC 18
#define CALIB_USE_QR (1 << 18)
double calibrateCamera(InputArrayOfArrays objectPoints,
InputArrayOfArrays imagePoints, Size imageSize,
InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs, OutputArray stdDeviations,
OutputArray perViewErrors, int flags = 0, TermCriteria criteria = TermCriteria(
TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
double cvCalibrateCamera2( const CvMat* object_points,
const CvMat* image_points,
const CvMat* point_counts,
CvSize image_size,
CvMat* camera_matrix,
CvMat* distortion_coeffs,
CvMat* rotation_vectors CV_DEFAULT(NULL),
CvMat* translation_vectors CV_DEFAULT(NULL),
CvMat* stdDeviations_vector CV_DEFAULT(NULL),
CvMat* perViewErrors_vector CV_DEFAULT(NULL),
int flags CV_DEFAULT(0),
CvTermCriteria term_crit CV_DEFAULT(cvTermCriteria(
CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,DBL_EPSILON)) );
double calibrateCameraCharuco(InputArrayOfArrays _charucoCorners, InputArrayOfArrays _charucoIds,
Ptr<aruco::CharucoBoard> &_board, Size imageSize,
InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs, OutputArray _stdDeviations, OutputArray _perViewErrors,
int flags = 0, TermCriteria criteria = TermCriteria(
TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
class CvLevMarqFork : public CvLevMarq
{
public:
CvLevMarqFork( int nparams, int nerrs, CvTermCriteria criteria=
cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER,30,DBL_EPSILON),
bool completeSymmFlag=false );
bool updateAlt( const CvMat*& _param, CvMat*& _JtJ, CvMat*& _JtErr, double*& _errNorm );
void step();
~CvLevMarqFork();
};
}
#endif

View File

@ -0,0 +1,14 @@
<?xml version="1.0"?>
<opencv_storage>
<charuco_dict>0</charuco_dict>
<charuco_square_lenght>200</charuco_square_lenght>
<charuco_marker_size>100</charuco_marker_size>
<calibration_step>1</calibration_step>
<max_frames_num>30</max_frames_num>
<min_frames_num>10</min_frames_num>
<solver_eps>1e-7</solver_eps>
<solver_max_iters>30</solver_max_iters>
<fast_solver>0</fast_solver>
<frame_filter_conv_param>0.1</frame_filter_conv_param>
<camera_resolution>800 600</camera_resolution>
</opencv_storage>

View File

@ -0,0 +1,518 @@
#include "frameProcessor.hpp"
#include "rotationConverters.hpp"
#include <opencv2/calib3d.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/aruco/charuco.hpp>
#include <opencv2/highgui.hpp>
#include <vector>
#include <string>
#include <algorithm>
#include <limits>
using namespace calib;
#define VIDEO_TEXT_SIZE 4
#define POINT_SIZE 5
static cv::SimpleBlobDetector::Params getDetectorParams()
{
cv::SimpleBlobDetector::Params detectorParams;
detectorParams.thresholdStep = 40;
detectorParams.minThreshold = 20;
detectorParams.maxThreshold = 500;
detectorParams.minRepeatability = 2;
detectorParams.minDistBetweenBlobs = 5;
detectorParams.filterByColor = true;
detectorParams.blobColor = 0;
detectorParams.filterByArea = true;
detectorParams.minArea = 5;
detectorParams.maxArea = 5000;
detectorParams.filterByCircularity = false;
detectorParams.minCircularity = 0.8f;
detectorParams.maxCircularity = std::numeric_limits<float>::max();
detectorParams.filterByInertia = true;
detectorParams.minInertiaRatio = 0.1f;
detectorParams.maxInertiaRatio = std::numeric_limits<float>::max();
detectorParams.filterByConvexity = true;
detectorParams.minConvexity = 0.8f;
detectorParams.maxConvexity = std::numeric_limits<float>::max();
return detectorParams;
}
FrameProcessor::~FrameProcessor()
{
}
bool CalibProcessor::detectAndParseChessboard(const cv::Mat &frame)
{
int chessBoardFlags = cv::CALIB_CB_ADAPTIVE_THRESH | cv::CALIB_CB_NORMALIZE_IMAGE | cv::CALIB_CB_FAST_CHECK;
bool isTemplateFound = cv::findChessboardCorners(frame, mBoardSize, mCurrentImagePoints, chessBoardFlags);
if (isTemplateFound) {
cv::Mat viewGray;
cv::cvtColor(frame, viewGray, cv::COLOR_BGR2GRAY);
cv::cornerSubPix(viewGray, mCurrentImagePoints, cv::Size(11,11),
cv::Size(-1,-1), cv::TermCriteria( cv::TermCriteria::EPS+cv::TermCriteria::COUNT, 30, 0.1 ));
cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(mCurrentImagePoints), isTemplateFound);
mTemplateLocations.insert(mTemplateLocations.begin(), mCurrentImagePoints[0]);
}
return isTemplateFound;
}
bool CalibProcessor::detectAndParseChAruco(const cv::Mat &frame)
{
cv::Ptr<cv::aruco::Board> board = mCharucoBoard.staticCast<cv::aruco::Board>();
std::vector<std::vector<cv::Point2f> > corners, rejected;
std::vector<int> ids;
cv::aruco::detectMarkers(frame, mArucoDictionary, corners, ids, cv::aruco::DetectorParameters::create(), rejected);
cv::aruco::refineDetectedMarkers(frame, board, corners, ids, rejected);
cv::Mat currentCharucoCorners, currentCharucoIds;
if(ids.size() > 0)
cv::aruco::interpolateCornersCharuco(corners, ids, frame, mCharucoBoard, currentCharucoCorners,
currentCharucoIds);
if(ids.size() > 0) cv::aruco::drawDetectedMarkers(frame, corners);
if(currentCharucoCorners.total() > 3) {
float centerX = 0, centerY = 0;
for (int i = 0; i < currentCharucoCorners.size[0]; i++) {
centerX += currentCharucoCorners.at<float>(i, 0);
centerY += currentCharucoCorners.at<float>(i, 1);
}
centerX /= currentCharucoCorners.size[0];
centerY /= currentCharucoCorners.size[0];
//cv::circle(frame, cv::Point2f(centerX, centerY), 10, cv::Scalar(0, 255, 0), 10);
mTemplateLocations.insert(mTemplateLocations.begin(), cv::Point2f(centerX, centerY));
cv::aruco::drawDetectedCornersCharuco(frame, currentCharucoCorners, currentCharucoIds);
mCurrentCharucoCorners = currentCharucoCorners;
mCurrentCharucoIds = currentCharucoIds;
return true;
}
return false;
}
bool CalibProcessor::detectAndParseACircles(const cv::Mat &frame)
{
bool isTemplateFound = findCirclesGrid(frame, mBoardSize, mCurrentImagePoints, cv::CALIB_CB_ASYMMETRIC_GRID, mBlobDetectorPtr);
if(isTemplateFound) {
mTemplateLocations.insert(mTemplateLocations.begin(), mCurrentImagePoints[0]);
cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(mCurrentImagePoints), isTemplateFound);
}
return isTemplateFound;
}
bool CalibProcessor::detectAndParseDualACircles(const cv::Mat &frame)
{
std::vector<cv::Point2f> blackPointbuf;
cv::Mat invertedView;
cv::bitwise_not(frame, invertedView);
bool isWhiteGridFound = cv::findCirclesGrid(frame, mBoardSize, mCurrentImagePoints, cv::CALIB_CB_ASYMMETRIC_GRID, mBlobDetectorPtr);
if(!isWhiteGridFound)
return false;
bool isBlackGridFound = cv::findCirclesGrid(invertedView, mBoardSize, blackPointbuf, cv::CALIB_CB_ASYMMETRIC_GRID, mBlobDetectorPtr);
if(!isBlackGridFound)
{
mCurrentImagePoints.clear();
return false;
}
cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(mCurrentImagePoints), isWhiteGridFound);
cv::drawChessboardCorners(frame, mBoardSize, cv::Mat(blackPointbuf), isBlackGridFound);
mCurrentImagePoints.insert(mCurrentImagePoints.end(), blackPointbuf.begin(), blackPointbuf.end());
mTemplateLocations.insert(mTemplateLocations.begin(), mCurrentImagePoints[0]);
return true;
}
void CalibProcessor::saveFrameData()
{
std::vector<cv::Point3f> objectPoints;
switch(mBoardType)
{
case Chessboard:
objectPoints.reserve(mBoardSize.height*mBoardSize.width);
for( int i = 0; i < mBoardSize.height; ++i )
for( int j = 0; j < mBoardSize.width; ++j )
objectPoints.push_back(cv::Point3f(j*mSquareSize, i*mSquareSize, 0));
mCalibData->imagePoints.push_back(mCurrentImagePoints);
mCalibData->objectPoints.push_back(objectPoints);
break;
case chAruco:
mCalibData->allCharucoCorners.push_back(mCurrentCharucoCorners);
mCalibData->allCharucoIds.push_back(mCurrentCharucoIds);
break;
case AcirclesGrid:
objectPoints.reserve(mBoardSize.height*mBoardSize.width);
for( int i = 0; i < mBoardSize.height; i++ )
for( int j = 0; j < mBoardSize.width; j++ )
objectPoints.push_back(cv::Point3f((2*j + i % 2)*mSquareSize, i*mSquareSize, 0));
mCalibData->imagePoints.push_back(mCurrentImagePoints);
mCalibData->objectPoints.push_back(objectPoints);
break;
case DoubleAcirclesGrid:
{
float gridCenterX = (2*((float)mBoardSize.width - 1) + 1)*mSquareSize + mTemplDist / 2;
float gridCenterY = (mBoardSize.height - 1)*mSquareSize / 2;
objectPoints.reserve(2*mBoardSize.height*mBoardSize.width);
//white part
for( int i = 0; i < mBoardSize.height; i++ )
for( int j = 0; j < mBoardSize.width; j++ )
objectPoints.push_back(
cv::Point3f(-float((2*j + i % 2)*mSquareSize + mTemplDist +
(2*(mBoardSize.width - 1) + 1)*mSquareSize - gridCenterX),
-float(i*mSquareSize) - gridCenterY,
0));
//black part
for( int i = 0; i < mBoardSize.height; i++ )
for( int j = 0; j < mBoardSize.width; j++ )
objectPoints.push_back(cv::Point3f(-float((2*j + i % 2)*mSquareSize - gridCenterX),
-float(i*mSquareSize) - gridCenterY, 0));
mCalibData->imagePoints.push_back(mCurrentImagePoints);
mCalibData->objectPoints.push_back(objectPoints);
}
break;
}
}
void CalibProcessor::showCaptureMessage(const cv::Mat& frame, const std::string &message)
{
cv::Point textOrigin(100, 100);
double textSize = VIDEO_TEXT_SIZE * frame.cols / (double) IMAGE_MAX_WIDTH;
cv::bitwise_not(frame, frame);
cv::putText(frame, message, textOrigin, 1, textSize, cv::Scalar(0,0,255), 2, cv::LINE_AA);
cv::imshow(mainWindowName, frame);
cv::waitKey(300);
}
bool CalibProcessor::checkLastFrame()
{
bool isFrameBad = false;
cv::Mat tmpCamMatrix;
const double badAngleThresh = 40;
if(!mCalibData->cameraMatrix.total()) {
tmpCamMatrix = cv::Mat::eye(3, 3, CV_64F);
tmpCamMatrix.at<double>(0,0) = 20000;
tmpCamMatrix.at<double>(1,1) = 20000;
tmpCamMatrix.at<double>(0,2) = mCalibData->imageSize.height/2;
tmpCamMatrix.at<double>(1,2) = mCalibData->imageSize.width/2;
}
else
mCalibData->cameraMatrix.copyTo(tmpCamMatrix);
if(mBoardType != chAruco) {
cv::Mat r, t, angles;
cv::solvePnP(mCalibData->objectPoints.back(), mCurrentImagePoints, tmpCamMatrix, mCalibData->distCoeffs, r, t);
RodriguesToEuler(r, angles, CALIB_DEGREES);
if(fabs(angles.at<double>(0)) > badAngleThresh || fabs(angles.at<double>(1)) > badAngleThresh) {
mCalibData->objectPoints.pop_back();
mCalibData->imagePoints.pop_back();
isFrameBad = true;
}
}
else {
cv::Mat r, t, angles;
std::vector<cv::Point3f> allObjPoints;
allObjPoints.reserve(mCurrentCharucoIds.total());
for(size_t i = 0; i < mCurrentCharucoIds.total(); i++) {
int pointID = mCurrentCharucoIds.at<int>((int)i);
CV_Assert(pointID >= 0 && pointID < (int)mCharucoBoard->chessboardCorners.size());
allObjPoints.push_back(mCharucoBoard->chessboardCorners[pointID]);
}
cv::solvePnP(allObjPoints, mCurrentCharucoCorners, tmpCamMatrix, mCalibData->distCoeffs, r, t);
RodriguesToEuler(r, angles, CALIB_DEGREES);
if(180.0 - fabs(angles.at<double>(0)) > badAngleThresh || fabs(angles.at<double>(1)) > badAngleThresh) {
isFrameBad = true;
mCalibData->allCharucoCorners.pop_back();
mCalibData->allCharucoIds.pop_back();
}
}
return isFrameBad;
}
CalibProcessor::CalibProcessor(cv::Ptr<calibrationData> data, captureParameters &capParams) :
mCalibData(data), mBoardType(capParams.board), mBoardSize(capParams.boardSize)
{
mCapuredFrames = 0;
mNeededFramesNum = capParams.calibrationStep;
mDelayBetweenCaptures = static_cast<int>(capParams.captureDelay * capParams.fps);
mMaxTemplateOffset = std::sqrt(std::pow(mCalibData->imageSize.height, 2) +
std::pow(mCalibData->imageSize.width, 2)) / 20.0;
mSquareSize = capParams.squareSize;
mTemplDist = capParams.templDst;
switch(mBoardType)
{
case chAruco:
mArucoDictionary = cv::aruco::getPredefinedDictionary(
cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
mCharucoBoard = cv::aruco::CharucoBoard::create(mBoardSize.width, mBoardSize.height, capParams.charucoSquareLenght,
capParams.charucoMarkerSize, mArucoDictionary);
break;
case AcirclesGrid:
mBlobDetectorPtr = cv::SimpleBlobDetector::create();
break;
case DoubleAcirclesGrid:
mBlobDetectorPtr = cv::SimpleBlobDetector::create(getDetectorParams());
break;
case Chessboard:
break;
}
}
cv::Mat CalibProcessor::processFrame(const cv::Mat &frame)
{
cv::Mat frameCopy;
frame.copyTo(frameCopy);
bool isTemplateFound = false;
mCurrentImagePoints.clear();
switch(mBoardType)
{
case Chessboard:
isTemplateFound = detectAndParseChessboard(frameCopy);
break;
case chAruco:
isTemplateFound = detectAndParseChAruco(frameCopy);
break;
case AcirclesGrid:
isTemplateFound = detectAndParseACircles(frameCopy);
break;
case DoubleAcirclesGrid:
isTemplateFound = detectAndParseDualACircles(frameCopy);
break;
}
if(mTemplateLocations.size() > mDelayBetweenCaptures)
mTemplateLocations.pop_back();
if(mTemplateLocations.size() == mDelayBetweenCaptures && isTemplateFound) {
if(cv::norm(mTemplateLocations.front() - mTemplateLocations.back()) < mMaxTemplateOffset) {
saveFrameData();
bool isFrameBad = checkLastFrame();
if (!isFrameBad) {
std::string displayMessage = cv::format("Frame # %d captured", std::max(mCalibData->imagePoints.size(),
mCalibData->allCharucoCorners.size()));
if(!showOverlayMessage(displayMessage))
showCaptureMessage(frame, displayMessage);
mCapuredFrames++;
}
else {
std::string displayMessage = "Frame rejected";
if(!showOverlayMessage(displayMessage))
showCaptureMessage(frame, displayMessage);
}
mTemplateLocations.clear();
mTemplateLocations.reserve(mDelayBetweenCaptures);
}
}
return frameCopy;
}
bool CalibProcessor::isProcessed() const
{
if(mCapuredFrames < mNeededFramesNum)
return false;
else
return true;
}
void CalibProcessor::resetState()
{
mCapuredFrames = 0;
mTemplateLocations.clear();
}
CalibProcessor::~CalibProcessor()
{
}
////////////////////////////////////////////
void ShowProcessor::drawBoard(cv::Mat &img, cv::InputArray points)
{
cv::Mat tmpView = cv::Mat::zeros(img.rows, img.cols, CV_8UC3);
std::vector<cv::Point2f> templateHull;
std::vector<cv::Point> poly;
cv::convexHull(points, templateHull);
poly.resize(templateHull.size());
for(size_t i=0; i<templateHull.size();i++)
poly[i] = cv::Point((int)(templateHull[i].x*mGridViewScale), (int)(templateHull[i].y*mGridViewScale));
cv::fillConvexPoly(tmpView, poly, cv::Scalar(0, 255, 0), cv::LINE_AA);
cv::addWeighted(tmpView, .2, img, 1, 0, img);
}
void ShowProcessor::drawGridPoints(const cv::Mat &frame)
{
if(mBoardType != chAruco)
for(std::vector<std::vector<cv::Point2f> >::iterator it = mCalibdata->imagePoints.begin(); it != mCalibdata->imagePoints.end(); ++it)
for(std::vector<cv::Point2f>::iterator pointIt = (*it).begin(); pointIt != (*it).end(); ++pointIt)
cv::circle(frame, *pointIt, POINT_SIZE, cv::Scalar(0, 255, 0), 1, cv::LINE_AA);
else
for(std::vector<cv::Mat>::iterator it = mCalibdata->allCharucoCorners.begin(); it != mCalibdata->allCharucoCorners.end(); ++it)
for(int i = 0; i < (*it).size[0]; i++)
cv::circle(frame, cv::Point((int)(*it).at<float>(i, 0), (int)(*it).at<float>(i, 1)),
POINT_SIZE, cv::Scalar(0, 255, 0), 1, cv::LINE_AA);
}
ShowProcessor::ShowProcessor(cv::Ptr<calibrationData> data, cv::Ptr<calibController> controller, TemplateType board) :
mCalibdata(data), mController(controller), mBoardType(board)
{
mNeedUndistort = true;
mVisMode = Grid;
mGridViewScale = 0.5;
mTextSize = VIDEO_TEXT_SIZE;
}
cv::Mat ShowProcessor::processFrame(const cv::Mat &frame)
{
if(mCalibdata->cameraMatrix.size[0] && mCalibdata->distCoeffs.size[0]) {
mTextSize = VIDEO_TEXT_SIZE * (double) frame.cols / IMAGE_MAX_WIDTH;
cv::Scalar textColor = cv::Scalar(0,0,255);
cv::Mat frameCopy;
if (mNeedUndistort && mController->getFramesNumberState()) {
if(mVisMode == Grid)
drawGridPoints(frame);
cv::remap(frame, frameCopy, mCalibdata->undistMap1, mCalibdata->undistMap2, cv::INTER_LINEAR);
int baseLine = 100;
cv::Size textSize = cv::getTextSize("Undistorted view", 1, mTextSize, 2, &baseLine);
cv::Point textOrigin(baseLine, frame.rows - (int)(2.5*textSize.height));
cv::putText(frameCopy, "Undistorted view", textOrigin, 1, mTextSize, textColor, 2, cv::LINE_AA);
}
else {
frame.copyTo(frameCopy);
if(mVisMode == Grid)
drawGridPoints(frameCopy);
}
std::string displayMessage;
if(mCalibdata->stdDeviations.at<double>(0) == 0)
displayMessage = cv::format("F = %d RMS = %.3f", (int)mCalibdata->cameraMatrix.at<double>(0,0), mCalibdata->totalAvgErr);
else
displayMessage = cv::format("Fx = %d Fy = %d RMS = %.3f", (int)mCalibdata->cameraMatrix.at<double>(0,0),
(int)mCalibdata->cameraMatrix.at<double>(1,1), mCalibdata->totalAvgErr);
if(mController->getRMSState() && mController->getFramesNumberState())
displayMessage.append(" OK");
int baseLine = 100;
cv::Size textSize = cv::getTextSize(displayMessage, 1, mTextSize - 1, 2, &baseLine);
cv::Point textOrigin = cv::Point(baseLine, 2*textSize.height);
cv::putText(frameCopy, displayMessage, textOrigin, 1, mTextSize - 1, textColor, 2, cv::LINE_AA);
if(mCalibdata->stdDeviations.at<double>(0) == 0)
displayMessage = cv::format("DF = %.2f", mCalibdata->stdDeviations.at<double>(1)*sigmaMult);
else
displayMessage = cv::format("DFx = %.2f DFy = %.2f", mCalibdata->stdDeviations.at<double>(0)*sigmaMult,
mCalibdata->stdDeviations.at<double>(1)*sigmaMult);
if(mController->getConfidenceIntrervalsState() && mController->getFramesNumberState())
displayMessage.append(" OK");
cv::putText(frameCopy, displayMessage, cv::Point(baseLine, 4*textSize.height), 1, mTextSize - 1, textColor, 2, cv::LINE_AA);
if(mController->getCommonCalibrationState()) {
displayMessage = cv::format("Calibration is done");
cv::putText(frameCopy, displayMessage, cv::Point(baseLine, 6*textSize.height), 1, mTextSize - 1, textColor, 2, cv::LINE_AA);
}
int calibFlags = mController->getNewFlags();
displayMessage = "";
if(!(calibFlags & cv::CALIB_FIX_ASPECT_RATIO))
displayMessage.append(cv::format("AR=%.3f ", mCalibdata->cameraMatrix.at<double>(0,0)/mCalibdata->cameraMatrix.at<double>(1,1)));
if(calibFlags & cv::CALIB_ZERO_TANGENT_DIST)
displayMessage.append("TD=0 ");
displayMessage.append(cv::format("K1=%.2f K2=%.2f K3=%.2f", mCalibdata->distCoeffs.at<double>(0), mCalibdata->distCoeffs.at<double>(1),
mCalibdata->distCoeffs.at<double>(4)));
cv::putText(frameCopy, displayMessage, cv::Point(baseLine, frameCopy.rows - (int)(1.5*textSize.height)),
1, mTextSize - 1, textColor, 2, cv::LINE_AA);
return frameCopy;
}
return frame;
}
bool ShowProcessor::isProcessed() const
{
return false;
}
void ShowProcessor::resetState()
{
}
void ShowProcessor::setVisualizationMode(visualisationMode mode)
{
mVisMode = mode;
}
void ShowProcessor::switchVisualizationMode()
{
if(mVisMode == Grid) {
mVisMode = Window;
updateBoardsView();
}
else {
mVisMode = Grid;
cv::destroyWindow(gridWindowName);
}
}
void ShowProcessor::clearBoardsView()
{
cv::imshow(gridWindowName, cv::Mat());
}
void ShowProcessor::updateBoardsView()
{
if(mVisMode == Window) {
cv::Size originSize = mCalibdata->imageSize;
cv::Mat altGridView = cv::Mat::zeros((int)(originSize.height*mGridViewScale), (int)(originSize.width*mGridViewScale), CV_8UC3);
if(mBoardType != chAruco)
for(std::vector<std::vector<cv::Point2f> >::iterator it = mCalibdata->imagePoints.begin(); it != mCalibdata->imagePoints.end(); ++it)
if(mBoardType != DoubleAcirclesGrid)
drawBoard(altGridView, *it);
else {
size_t pointsNum = (*it).size()/2;
std::vector<cv::Point2f> points(pointsNum);
std::copy((*it).begin(), (*it).begin() + pointsNum, points.begin());
drawBoard(altGridView, points);
std::copy((*it).begin() + pointsNum, (*it).begin() + 2*pointsNum, points.begin());
drawBoard(altGridView, points);
}
else
for(std::vector<cv::Mat>::iterator it = mCalibdata->allCharucoCorners.begin(); it != mCalibdata->allCharucoCorners.end(); ++it)
drawBoard(altGridView, *it);
cv::imshow(gridWindowName, altGridView);
}
}
void ShowProcessor::switchUndistort()
{
mNeedUndistort = !mNeedUndistort;
}
void ShowProcessor::setUndistort(bool isEnabled)
{
mNeedUndistort = isEnabled;
}
ShowProcessor::~ShowProcessor()
{
}

View File

@ -0,0 +1,95 @@
#ifndef FRAME_PROCESSOR_HPP
#define FRAME_PROCESSOR_HPP
#include <opencv2/core.hpp>
#include <opencv2/aruco/charuco.hpp>
#include <opencv2/calib3d.hpp>
#include "calibCommon.hpp"
#include "calibController.hpp"
namespace calib
{
class FrameProcessor
{
protected:
public:
virtual ~FrameProcessor();
virtual cv::Mat processFrame(const cv::Mat& frame) = 0;
virtual bool isProcessed() const = 0;
virtual void resetState() = 0;
};
class CalibProcessor : public FrameProcessor
{
protected:
cv::Ptr<calibrationData> mCalibData;
TemplateType mBoardType;
cv::Size mBoardSize;
std::vector<cv::Point2f> mTemplateLocations;
std::vector<cv::Point2f> mCurrentImagePoints;
cv::Mat mCurrentCharucoCorners;
cv::Mat mCurrentCharucoIds;
cv::Ptr<cv::SimpleBlobDetector> mBlobDetectorPtr;
cv::Ptr<cv::aruco::Dictionary> mArucoDictionary;
cv::Ptr<cv::aruco::CharucoBoard> mCharucoBoard;
int mNeededFramesNum;
unsigned mDelayBetweenCaptures;
int mCapuredFrames;
double mMaxTemplateOffset;
float mSquareSize;
float mTemplDist;
bool detectAndParseChessboard(const cv::Mat& frame);
bool detectAndParseChAruco(const cv::Mat& frame);
bool detectAndParseACircles(const cv::Mat& frame);
bool detectAndParseDualACircles(const cv::Mat& frame);
void saveFrameData();
void showCaptureMessage(const cv::Mat &frame, const std::string& message);
bool checkLastFrame();
public:
CalibProcessor(cv::Ptr<calibrationData> data, captureParameters& capParams);
virtual cv::Mat processFrame(const cv::Mat& frame);
virtual bool isProcessed() const;
virtual void resetState();
~CalibProcessor();
};
enum visualisationMode {Grid, Window};
class ShowProcessor : public FrameProcessor
{
protected:
cv::Ptr<calibrationData> mCalibdata;
cv::Ptr<calibController> mController;
TemplateType mBoardType;
visualisationMode mVisMode;
bool mNeedUndistort;
double mGridViewScale;
double mTextSize;
void drawBoard(cv::Mat& img, cv::InputArray points);
void drawGridPoints(const cv::Mat& frame);
public:
ShowProcessor(cv::Ptr<calibrationData> data, cv::Ptr<calibController> controller, TemplateType board);
virtual cv::Mat processFrame(const cv::Mat& frame);
virtual bool isProcessed() const;
virtual void resetState();
void setVisualizationMode(visualisationMode mode);
void switchVisualizationMode();
void clearBoardsView();
void updateBoardsView();
void switchUndistort();
void setUndistort(bool isEnabled);
~ShowProcessor();
};
}
#endif

View File

@ -0,0 +1,491 @@
#include "linalg.hpp"
#ifdef USE_LAPACK
typedef int integer;
#include <lapacke.h>
#include <cassert>
using namespace cv;
bool cvfork::solve(InputArray _src, const InputArray _src2arg, OutputArray _dst, int method )
{
bool result = true;
Mat src = _src.getMat(), _src2 = _src2arg.getMat();
int type = src.type();
bool is_normal = (method & DECOMP_NORMAL) != 0;
CV_Assert( type == _src2.type() && (type == CV_32F || type == CV_64F) );
method &= ~DECOMP_NORMAL;
CV_Assert( (method != DECOMP_LU && method != DECOMP_CHOLESKY) ||
is_normal || src.rows == src.cols );
double rcond=-1, s1=0, work1=0, *work=0, *s=0;
float frcond=-1, fs1=0, fwork1=0, *fwork=0, *fs=0;
integer m = src.rows, m_ = m, n = src.cols, mn = std::max(m,n),
nm = std::min(m, n), nb = _src2.cols, lwork=-1, liwork=0, iwork1=0,
lda = m, ldx = mn, info=0, rank=0, *iwork=0;
int elem_size = CV_ELEM_SIZE(type);
bool copy_rhs=false;
int buf_size=0;
AutoBuffer<uchar> buffer;
uchar* ptr;
char N[] = {'N', '\0'}, L[] = {'L', '\0'};
Mat src2 = _src2;
_dst.create( src.cols, src2.cols, src.type() );
Mat dst = _dst.getMat();
if( m <= n )
is_normal = false;
else if( is_normal )
m_ = n;
buf_size += (is_normal ? n*n : m*n)*elem_size;
if( m_ != n || nb > 1 || !dst.isContinuous() )
{
copy_rhs = true;
if( is_normal )
buf_size += n*nb*elem_size;
else
buf_size += mn*nb*elem_size;
}
if( method == DECOMP_SVD || method == DECOMP_EIG )
{
integer nlvl = cvRound(std::log(std::max(std::min(m_,n)/25., 1.))/CV_LOG2) + 1;
liwork = std::min(m_,n)*(3*std::max(nlvl,(integer)0) + 11);
if( type == CV_32F )
sgelsd_(&m_, &n, &nb, (float*)src.data, &lda, (float*)dst.data, &ldx,
&fs1, &frcond, &rank, &fwork1, &lwork, &iwork1, &info);
else
dgelsd_(&m_, &n, &nb, (double*)src.data, &lda, (double*)dst.data, &ldx,
&s1, &rcond, &rank, &work1, &lwork, &iwork1, &info );
buf_size += nm*elem_size + (liwork + 1)*sizeof(integer);
}
else if( method == DECOMP_QR )
{
if( type == CV_32F )
sgels_(N, &m_, &n, &nb, (float*)src.data, &lda,
(float*)dst.data, &ldx, &fwork1, &lwork, &info );
else
dgels_(N, &m_, &n, &nb, (double*)src.data, &lda,
(double*)dst.data, &ldx, &work1, &lwork, &info );
}
else if( method == DECOMP_LU )
{
buf_size += (n+1)*sizeof(integer);
}
else if( method == DECOMP_CHOLESKY )
;
else
CV_Error( Error::StsBadArg, "Unknown method" );
assert(info == 0);
lwork = cvRound(type == CV_32F ? (double)fwork1 : work1);
buf_size += lwork*elem_size;
buffer.allocate(buf_size);
ptr = (uchar*)buffer;
Mat at(n, m_, type, ptr);
ptr += n*m_*elem_size;
if( method == DECOMP_CHOLESKY || method == DECOMP_EIG )
src.copyTo(at);
else if( !is_normal )
transpose(src, at);
else
mulTransposed(src, at, true);
Mat xt;
if( !is_normal )
{
if( copy_rhs )
{
Mat temp(nb, mn, type, ptr);
ptr += nb*mn*elem_size;
Mat bt = temp.colRange(0, m);
xt = temp.colRange(0, n);
transpose(src2, bt);
}
else
{
src2.copyTo(dst);
xt = Mat(1, n, type, dst.data);
}
}
else
{
if( copy_rhs )
{
xt = Mat(nb, n, type, ptr);
ptr += nb*n*elem_size;
}
else
xt = Mat(1, n, type, dst.data);
// (a'*b)' = b'*a
gemm( src2, src, 1, Mat(), 0, xt, GEMM_1_T );
}
lda = (int)(at.step ? at.step/elem_size : at.cols);
ldx = (int)(xt.step ? xt.step/elem_size : (!is_normal && copy_rhs ? mn : n));
if( method == DECOMP_SVD || method == DECOMP_EIG )
{
if( type == CV_32F )
{
fs = (float*)ptr;
ptr += nm*elem_size;
fwork = (float*)ptr;
ptr += lwork*elem_size;
iwork = (integer*)alignPtr(ptr, sizeof(integer));
sgelsd_(&m_, &n, &nb, (float*)at.data, &lda, (float*)xt.data, &ldx,
fs, &frcond, &rank, fwork, &lwork, iwork, &info);
}
else
{
s = (double*)ptr;
ptr += nm*elem_size;
work = (double*)ptr;
ptr += lwork*elem_size;
iwork = (integer*)alignPtr(ptr, sizeof(integer));
dgelsd_(&m_, &n, &nb, (double*)at.data, &lda, (double*)xt.data, &ldx,
s, &rcond, &rank, work, &lwork, iwork, &info);
}
}
else if( method == DECOMP_QR )
{
if( type == CV_32F )
{
fwork = (float*)ptr;
sgels_(N, &m_, &n, &nb, (float*)at.data, &lda,
(float*)xt.data, &ldx, fwork, &lwork, &info);
}
else
{
work = (double*)ptr;
dgels_(N, &m_, &n, &nb, (double*)at.data, &lda,
(double*)xt.data, &ldx, work, &lwork, &info);
}
}
else if( method == DECOMP_CHOLESKY || (method == DECOMP_LU && is_normal) )
{
if( type == CV_32F )
{
spotrf_(L, &n, (float*)at.data, &lda, &info);
if(info==0)
spotrs_(L, &n, &nb, (float*)at.data, &lda, (float*)xt.data, &ldx, &info);
}
else
{
dpotrf_(L, &n, (double*)at.data, &lda, &info);
if(info==0)
dpotrs_(L, &n, &nb, (double*)at.data, &lda, (double*)xt.data, &ldx, &info);
}
}
else if( method == DECOMP_LU )
{
iwork = (integer*)alignPtr(ptr, sizeof(integer));
if( type == CV_32F )
sgesv_(&n, &nb, (float*)at.data, &lda, iwork, (float*)xt.data, &ldx, &info );
else
dgesv_(&n, &nb, (double*)at.data, &lda, iwork, (double*)xt.data, &ldx, &info );
}
else
assert(0);
result = info == 0;
if( !result )
dst = Scalar(0);
else if( xt.data != dst.data )
transpose( xt, dst );
return result;
}
static void _SVDcompute( const InputArray _aarr, OutputArray _w,
OutputArray _u, OutputArray _vt, int flags = 0)
{
Mat a = _aarr.getMat(), u, vt;
integer m = a.rows, n = a.cols, mn = std::max(m, n), nm = std::min(m, n);
int type = a.type(), elem_size = (int)a.elemSize();
bool compute_uv = _u.needed() || _vt.needed();
if( flags & SVD::NO_UV )
{
_u.release();
_vt.release();
compute_uv = false;
}
if( compute_uv )
{
_u.create( (int)m, (int)((flags & SVD::FULL_UV) ? m : nm), type );
_vt.create( (int)((flags & SVD::FULL_UV) ? n : nm), n, type );
u = _u.getMat();
vt = _vt.getMat();
}
_w.create(nm, 1, type, -1, true);
Mat _a = a, w = _w.getMat();
CV_Assert( w.isContinuous() );
int work_ofs=0, iwork_ofs=0, buf_size = 0;
bool temp_a = false;
double u1=0, v1=0, work1=0;
float uf1=0, vf1=0, workf1=0;
integer lda, ldu, ldv, lwork=-1, iwork1=0, info=0;
char mode[] = {compute_uv ? 'S' : 'N', '\0'};
if( m != n && compute_uv && (flags & SVD::FULL_UV) )
mode[0] = 'A';
if( !(flags & SVD::MODIFY_A) )
{
if( mode[0] == 'N' || mode[0] == 'A' )
temp_a = true;
else if( compute_uv && (a.size() == vt.size() || a.size() == u.size()) && mode[0] == 'S' )
mode[0] = 'O';
}
lda = a.cols;
ldv = ldu = mn;
if( type == CV_32F )
{
sgesdd_(mode, &n, &m, (float*)a.data, &lda, (float*)w.data,
&vf1, &ldv, &uf1, &ldu, &workf1, &lwork, &iwork1, &info );
lwork = cvRound(workf1);
}
else
{
dgesdd_(mode, &n, &m, (double*)a.data, &lda, (double*)w.data,
&v1, &ldv, &u1, &ldu, &work1, &lwork, &iwork1, &info );
lwork = cvRound(work1);
}
assert(info == 0);
if( temp_a )
{
buf_size += n*m*elem_size;
}
work_ofs = buf_size;
buf_size += lwork*elem_size;
buf_size = alignSize(buf_size, sizeof(integer));
iwork_ofs = buf_size;
buf_size += 8*nm*sizeof(integer);
AutoBuffer<uchar> buf(buf_size);
uchar* buffer = (uchar*)buf;
if( temp_a )
{
_a = Mat(a.rows, a.cols, type, buffer );
a.copyTo(_a);
}
if( !(flags & SVD::MODIFY_A) && !temp_a )
{
if( compute_uv && a.size() == vt.size() )
{
a.copyTo(vt);
_a = vt;
}
else if( compute_uv && a.size() == u.size() )
{
a.copyTo(u);
_a = u;
}
}
if( compute_uv )
{
ldv = (int)(vt.step ? vt.step/elem_size : vt.cols);
ldu = (int)(u.step ? u.step/elem_size : u.cols);
}
lda = (int)(_a.step ? _a.step/elem_size : _a.cols);
if( type == CV_32F )
{
sgesdd_(mode, &n, &m, _a.ptr<float>(), &lda, w.ptr<float>(),
vt.data ? vt.ptr<float>() : (float*)&v1, &ldv,
u.data ? u.ptr<float>() : (float*)&u1, &ldu,
(float*)(buffer + work_ofs), &lwork,
(integer*)(buffer + iwork_ofs), &info );
}
else
{
dgesdd_(mode, &n, &m, _a.ptr<double>(), &lda, w.ptr<double>(),
vt.data ? vt.ptr<double>() : &v1, &ldv,
u.data ? u.ptr<double>() : &u1, &ldu,
(double*)(buffer + work_ofs), &lwork,
(integer*)(buffer + iwork_ofs), &info );
}
CV_Assert(info >= 0);
if(info != 0)
{
if( u.data )
u = Scalar(0.);
if( vt.data )
vt = Scalar(0.);
w = Scalar(0.);
}
}
//////////////////////////////////////////////////////////
template<typename T1, typename T2, typename T3> static void
MatrAXPY( int m, int n, const T1* x, int dx,
const T2* a, int inca, T3* y, int dy )
{
int i, j;
for( i = 0; i < m; i++, x += dx, y += dy )
{
T2 s = a[i*inca];
for( j = 0; j <= n - 4; j += 4 )
{
T3 t0 = (T3)(y[j] + s*x[j]);
T3 t1 = (T3)(y[j+1] + s*x[j+1]);
y[j] = t0;
y[j+1] = t1;
t0 = (T3)(y[j+2] + s*x[j+2]);
t1 = (T3)(y[j+3] + s*x[j+3]);
y[j+2] = t0;
y[j+3] = t1;
}
for( ; j < n; j++ )
y[j] = (T3)(y[j] + s*x[j]);
}
}
template<typename T> static void
SVBkSb( int m, int n, const T* w, int incw,
const T* u, int ldu, int uT,
const T* v, int ldv, int vT,
const T* b, int ldb, int nb,
T* x, int ldx, double* buffer, T eps )
{
double threshold = 0;
int udelta0 = uT ? ldu : 1, udelta1 = uT ? 1 : ldu;
int vdelta0 = vT ? ldv : 1, vdelta1 = vT ? 1 : ldv;
int i, j, nm = std::min(m, n);
if( !b )
nb = m;
for( i = 0; i < n; i++ )
for( j = 0; j < nb; j++ )
x[i*ldx + j] = 0;
for( i = 0; i < nm; i++ )
threshold += w[i*incw];
threshold *= eps;
// v * inv(w) * uT * b
for( i = 0; i < nm; i++, u += udelta0, v += vdelta0 )
{
double wi = w[i*incw];
if( wi <= threshold )
continue;
wi = 1/wi;
if( nb == 1 )
{
double s = 0;
if( b )
for( j = 0; j < m; j++ )
s += u[j*udelta1]*b[j*ldb];
else
s = u[0];
s *= wi;
for( j = 0; j < n; j++ )
x[j*ldx] = (T)(x[j*ldx] + s*v[j*vdelta1]);
}
else
{
if( b )
{
for( j = 0; j < nb; j++ )
buffer[j] = 0;
MatrAXPY( m, nb, b, ldb, u, udelta1, buffer, 0 );
for( j = 0; j < nb; j++ )
buffer[j] *= wi;
}
else
{
for( j = 0; j < nb; j++ )
buffer[j] = u[j*udelta1]*wi;
}
MatrAXPY( n, nb, buffer, 0, v, vdelta1, x, ldx );
}
}
}
static void _backSubst( const InputArray _w, const InputArray _u, const InputArray _vt,
const InputArray _rhs, OutputArray _dst )
{
Mat w = _w.getMat(), u = _u.getMat(), vt = _vt.getMat(), rhs = _rhs.getMat();
int type = w.type(), esz = (int)w.elemSize();
int m = u.rows, n = vt.cols, nb = rhs.data ? rhs.cols : m;
AutoBuffer<double> buffer(nb);
CV_Assert( u.data && vt.data && w.data );
CV_Assert( rhs.data == 0 || (rhs.type() == type && rhs.rows == m) );
_dst.create( n, nb, type );
Mat dst = _dst.getMat();
if( type == CV_32F )
SVBkSb(m, n, (float*)w.data, 1, (float*)u.data, (int)(u.step/esz), false,
(float*)vt.data, (int)(vt.step/esz), true, (float*)rhs.data, (int)(rhs.step/esz),
nb, (float*)dst.data, (int)(dst.step/esz), buffer, 10*FLT_EPSILON );
else if( type == CV_64F )
SVBkSb(m, n, (double*)w.data, 1, (double*)u.data, (int)(u.step/esz), false,
(double*)vt.data, (int)(vt.step/esz), true, (double*)rhs.data, (int)(rhs.step/esz),
nb, (double*)dst.data, (int)(dst.step/esz), buffer, 2*DBL_EPSILON );
else
CV_Error( Error::StsUnsupportedFormat, "" );
}
///////////////////////////////////////////
#define Sf( y, x ) ((float*)(srcdata + y*srcstep))[x]
#define Sd( y, x ) ((double*)(srcdata + y*srcstep))[x]
#define Df( y, x ) ((float*)(dstdata + y*dststep))[x]
#define Dd( y, x ) ((double*)(dstdata + y*dststep))[x]
double cvfork::invert( InputArray _src, OutputArray _dst, int method )
{
Mat src = _src.getMat();
int type = src.type();
CV_Assert(type == CV_32F || type == CV_64F);
size_t esz = CV_ELEM_SIZE(type);
int m = src.rows, n = src.cols;
if( method == DECOMP_SVD )
{
int nm = std::min(m, n);
AutoBuffer<uchar> _buf((m*nm + nm + nm*n)*esz + sizeof(double));
uchar* buf = alignPtr((uchar*)_buf, (int)esz);
Mat u(m, nm, type, buf);
Mat w(nm, 1, type, u.ptr() + m*nm*esz);
Mat vt(nm, n, type, w.ptr() + nm*esz);
_SVDcompute(src, w, u, vt);
_backSubst(w, u, vt, Mat(), _dst);
return type == CV_32F ?
(w.ptr<float>()[0] >= FLT_EPSILON ?
w.ptr<float>()[n-1]/w.ptr<float>()[0] : 0) :
(w.ptr<double>()[0] >= DBL_EPSILON ?
w.ptr<double>()[n-1]/w.ptr<double>()[0] : 0);
}
return 0;
}
#endif //USE_LAPACK

View File

@ -0,0 +1,13 @@
#ifndef LINALG_HPP
#define LINALG_HPP
#include <opencv2/core.hpp>
namespace cvfork {
double invert( cv::InputArray _src, cv::OutputArray _dst, int method );
bool solve(cv::InputArray _src, cv::InputArray _src2arg, cv::OutputArray _dst, int method );
}
#endif

View File

@ -0,0 +1,210 @@
#include <opencv2/core.hpp>
#include <opencv2/calib3d.hpp>
#include <opencv2/aruco/charuco.hpp>
#include <opencv2/cvconfig.h>
#include <opencv2/highgui.hpp>
#include <string>
#include <vector>
#include <stdexcept>
#include <algorithm>
#include <iostream>
#include "calibCommon.hpp"
#include "calibPipeline.hpp"
#include "frameProcessor.hpp"
#include "cvCalibrationFork.hpp"
#include "calibController.hpp"
#include "parametersController.hpp"
#include "rotationConverters.hpp"
using namespace calib;
const std::string keys =
"{v | | Input from video file }"
"{ci | 0 | Default camera id }"
"{flip | false | Vertical flip of input frames }"
"{t | circles | Template for calibration (circles, chessboard, dualCircles, chAruco) }"
"{sz | 16.3 | Distance between two nearest centers of circles or squares on calibration board}"
"{dst | 295 | Distance between white and black parts of daulCircles template}"
"{w | | Width of template (in corners or circles)}"
"{h | | Height of template (in corners or circles)}"
"{of | cameraParameters.xml | Output file name}"
"{ft | true | Auto tuning of calibration flags}"
"{vis | grid | Captured boards visualisation (grid, window)}"
"{d | 0.8 | Min delay between captures}"
"{pf | defaultConfig.xml| Advanced application parameters}"
"{help | | Print help}";
bool calib::showOverlayMessage(const std::string& message)
{
#ifdef HAVE_QT
cv::displayOverlay(mainWindowName, message, OVERLAY_DELAY);
return true;
#else
std::cout << message << std::endl;
return false;
#endif
}
static void deleteButton(int state, void* data)
{
state++; //to avoid gcc warnings
(static_cast<cv::Ptr<calibDataController>*>(data))->get()->deleteLastFrame();
calib::showOverlayMessage("Last frame deleted");
}
static void deleteAllButton(int state, void* data)
{
state++;
(static_cast<cv::Ptr<calibDataController>*>(data))->get()->deleteAllData();
calib::showOverlayMessage("All frames deleted");
}
static void saveCurrentParamsButton(int state, void* data)
{
state++;
if((static_cast<cv::Ptr<calibDataController>*>(data))->get()->saveCurrentCameraParameters())
calib::showOverlayMessage("Calibration parameters saved");
}
#ifdef HAVE_QT
static void switchVisualizationModeButton(int state, void* data)
{
state++;
ShowProcessor* processor = static_cast<ShowProcessor*>(((cv::Ptr<FrameProcessor>*)data)->get());
processor->switchVisualizationMode();
}
static void undistortButton(int state, void* data)
{
ShowProcessor* processor = static_cast<ShowProcessor*>(((cv::Ptr<FrameProcessor>*)data)->get());
processor->setUndistort(static_cast<bool>(state));
calib::showOverlayMessage(std::string("Undistort is ") +
(static_cast<bool>(state) ? std::string("on") : std::string("off")));
}
#endif //HAVE_QT
int main(int argc, char** argv)
{
cv::CommandLineParser parser(argc, argv, keys);
if(parser.has("help")) {
parser.printMessage();
return 0;
}
std::cout << consoleHelp << std::endl;
parametersController paramsController;
if(!paramsController.loadFromParser(parser))
return 0;
captureParameters capParams = paramsController.getCaptureParameters();
internalParameters intParams = paramsController.getInternalParameters();
cv::TermCriteria solverTermCrit = cv::TermCriteria(cv::TermCriteria::COUNT+cv::TermCriteria::EPS,
intParams.solverMaxIters, intParams.solverEps);
cv::Ptr<calibrationData> globalData(new calibrationData);
if(!parser.has("v")) globalData->imageSize = capParams.cameraResolution;
int calibrationFlags = 0;
if(intParams.fastSolving) calibrationFlags |= CALIB_USE_QR;
cv::Ptr<calibController> controller(new calibController(globalData, calibrationFlags,
parser.get<bool>("ft"), capParams.minFramesNum));
cv::Ptr<calibDataController> dataController(new calibDataController(globalData, capParams.maxFramesNum,
intParams.filterAlpha));
dataController->setParametersFileName(parser.get<std::string>("of"));
cv::Ptr<FrameProcessor> capProcessor, showProcessor;
capProcessor = cv::Ptr<FrameProcessor>(new CalibProcessor(globalData, capParams));
showProcessor = cv::Ptr<FrameProcessor>(new ShowProcessor(globalData, controller, capParams.board));
if(parser.get<std::string>("vis").find("window") == 0) {
static_cast<ShowProcessor*>(showProcessor.get())->setVisualizationMode(Window);
cv::namedWindow(gridWindowName);
cv::moveWindow(gridWindowName, 1280, 500);
}
cv::Ptr<CalibPipeline> pipeline(new CalibPipeline(capParams));
std::vector<cv::Ptr<FrameProcessor> > processors;
processors.push_back(capProcessor);
processors.push_back(showProcessor);
cv::namedWindow(mainWindowName);
cv::moveWindow(mainWindowName, 10, 10);
#ifdef HAVE_QT
cv::createButton("Delete last frame", deleteButton, &dataController, cv::QT_PUSH_BUTTON);
cv::createButton("Delete all frames", deleteAllButton, &dataController, cv::QT_PUSH_BUTTON);
cv::createButton("Undistort", undistortButton, &showProcessor, cv::QT_CHECKBOX, false);
cv::createButton("Save current parameters", saveCurrentParamsButton, &dataController, cv::QT_PUSH_BUTTON);
cv::createButton("Switch visualisation mode", switchVisualizationModeButton, &showProcessor, cv::QT_PUSH_BUTTON);
#endif //HAVE_QT
try {
bool pipelineFinished = false;
while(!pipelineFinished)
{
PipelineExitStatus exitStatus = pipeline->start(processors);
if (exitStatus == Finished) {
if(controller->getCommonCalibrationState())
saveCurrentParamsButton(0, &dataController);
pipelineFinished = true;
continue;
}
else if (exitStatus == Calibrate) {
dataController->rememberCurrentParameters();
globalData->imageSize = pipeline->getImageSize();
calibrationFlags = controller->getNewFlags();
if(capParams.board != chAruco) {
globalData->totalAvgErr =
cvfork::calibrateCamera(globalData->objectPoints, globalData->imagePoints,
globalData->imageSize, globalData->cameraMatrix,
globalData->distCoeffs, cv::noArray(), cv::noArray(),
globalData->stdDeviations, globalData->perViewErrors,
calibrationFlags, solverTermCrit);
}
else {
cv::Ptr<cv::aruco::Dictionary> dictionary =
cv::aruco::getPredefinedDictionary(cv::aruco::PREDEFINED_DICTIONARY_NAME(capParams.charucoDictName));
cv::Ptr<cv::aruco::CharucoBoard> charucoboard =
cv::aruco::CharucoBoard::create(capParams.boardSize.width, capParams.boardSize.height,
capParams.charucoSquareLenght, capParams.charucoMarkerSize, dictionary);
globalData->totalAvgErr =
cvfork::calibrateCameraCharuco(globalData->allCharucoCorners, globalData->allCharucoIds,
charucoboard, globalData->imageSize,
globalData->cameraMatrix, globalData->distCoeffs,
cv::noArray(), cv::noArray(), globalData->stdDeviations,
globalData->perViewErrors, calibrationFlags, solverTermCrit);
}
dataController->updateUndistortMap();
dataController->printParametersToConsole(std::cout);
controller->updateState();
for(int j = 0; j < capParams.calibrationStep; j++)
dataController->filterFrames();
static_cast<ShowProcessor*>(showProcessor.get())->updateBoardsView();
}
else if (exitStatus == DeleteLastFrame) {
deleteButton(0, &dataController);
static_cast<ShowProcessor*>(showProcessor.get())->updateBoardsView();
}
else if (exitStatus == DeleteAllFrames) {
deleteAllButton(0, &dataController);
static_cast<ShowProcessor*>(showProcessor.get())->updateBoardsView();
}
else if (exitStatus == SaveCurrentData) {
saveCurrentParamsButton(0, &dataController);
}
else if (exitStatus == SwitchUndistort)
static_cast<ShowProcessor*>(showProcessor.get())->switchUndistort();
else if (exitStatus == SwitchVisualisation)
static_cast<ShowProcessor*>(showProcessor.get())->switchVisualizationMode();
for (std::vector<cv::Ptr<FrameProcessor> >::iterator it = processors.begin(); it != processors.end(); ++it)
(*it)->resetState();
}
}
catch (std::runtime_error exp) {
std::cout << exp.what() << std::endl;
}
return 0;
}

View File

@ -0,0 +1,138 @@
#include "parametersController.hpp"
#include <iostream>
template <typename T>
static bool readFromNode(cv::FileNode node, T& value)
{
if(!node.isNone()) {
node >> value;
return true;
}
else
return false;
}
static bool checkAssertion(bool value, const std::string& msg)
{
if(!value)
std::cerr << "Error: " << msg << std::endl;
return value;
}
bool calib::parametersController::loadFromFile(const std::string &inputFileName)
{
cv::FileStorage reader;
reader.open(inputFileName, cv::FileStorage::READ);
if(!reader.isOpened()) {
std::cerr << "Warning: Unable to open " << inputFileName <<
" Applicatioin stated with default advanced parameters" << std::endl;
return true;
}
readFromNode(reader["charuco_dict"], mCapParams.charucoDictName);
readFromNode(reader["charuco_square_lenght"], mCapParams.charucoSquareLenght);
readFromNode(reader["charuco_marker_size"], mCapParams.charucoMarkerSize);
readFromNode(reader["camera_resolution"], mCapParams.cameraResolution);
readFromNode(reader["calibration_step"], mCapParams.calibrationStep);
readFromNode(reader["max_frames_num"], mCapParams.maxFramesNum);
readFromNode(reader["min_frames_num"], mCapParams.minFramesNum);
readFromNode(reader["solver_eps"], mInternalParameters.solverEps);
readFromNode(reader["solver_max_iters"], mInternalParameters.solverMaxIters);
readFromNode(reader["fast_solver"], mInternalParameters.fastSolving);
readFromNode(reader["frame_filter_conv_param"], mInternalParameters.filterAlpha);
bool retValue =
checkAssertion(mCapParams.charucoDictName >= 0, "Dict name must be >= 0") &&
checkAssertion(mCapParams.charucoMarkerSize > 0, "Marker size must be positive") &&
checkAssertion(mCapParams.charucoSquareLenght > 0, "Square size must be positive") &&
checkAssertion(mCapParams.minFramesNum > 1, "Minimal number of frames for calibration < 1") &&
checkAssertion(mCapParams.calibrationStep > 0, "Calibration step must be positive") &&
checkAssertion(mCapParams.maxFramesNum > mCapParams.minFramesNum, "maxFramesNum < minFramesNum") &&
checkAssertion(mInternalParameters.solverEps > 0, "Solver precision must be positive") &&
checkAssertion(mInternalParameters.solverMaxIters > 0, "Max solver iterations number must be positive") &&
checkAssertion(mInternalParameters.filterAlpha >=0 && mInternalParameters.filterAlpha <=1 ,
"Frame filter convolution parameter must be in [0,1] interval") &&
checkAssertion(mCapParams.cameraResolution.width > 0 && mCapParams.cameraResolution.height > 0,
"Wrong camera resolution values");
reader.release();
return retValue;
}
calib::parametersController::parametersController()
{
}
calib::captureParameters calib::parametersController::getCaptureParameters() const
{
return mCapParams;
}
calib::internalParameters calib::parametersController::getInternalParameters() const
{
return mInternalParameters;
}
bool calib::parametersController::loadFromParser(cv::CommandLineParser &parser)
{
mCapParams.flipVertical = parser.get<bool>("flip");
mCapParams.captureDelay = parser.get<float>("d");
mCapParams.squareSize = parser.get<float>("sz");
mCapParams.templDst = parser.get<float>("dst");
if(!checkAssertion(mCapParams.squareSize > 0, "Distance between corners or circles must be positive"))
return false;
if(!checkAssertion(mCapParams.templDst > 0, "Distance betwen parts of dual template must be positive"))
return false;
if (parser.has("v")) {
mCapParams.source = File;
mCapParams.videoFileName = parser.get<std::string>("v");
}
else {
mCapParams.source = Camera;
mCapParams.camID = parser.get<int>("ci");
}
std::string templateType = parser.get<std::string>("t");
if(templateType.find("circles", 0) == 0) {
mCapParams.board = AcirclesGrid;
mCapParams.boardSize = cv::Size(4, 11);
}
else if(templateType.find("chessboard", 0) == 0) {
mCapParams.board = Chessboard;
mCapParams.boardSize = cv::Size(7, 7);
}
else if(templateType.find("dualcircles", 0) == 0) {
mCapParams.board = DoubleAcirclesGrid;
mCapParams.boardSize = cv::Size(4, 11);
}
else if(templateType.find("charuco", 0) == 0) {
mCapParams.board = chAruco;
mCapParams.boardSize = cv::Size(6, 8);
mCapParams.charucoDictName = 0;
mCapParams.charucoSquareLenght = 200;
mCapParams.charucoMarkerSize = 100;
}
else {
std::cerr << "Wrong template name\n";
return false;
}
if(parser.has("w") && parser.has("h")) {
mCapParams.boardSize = cv::Size(parser.get<int>("w"), parser.get<int>("h"));
if(!checkAssertion(mCapParams.boardSize.width > 0 || mCapParams.boardSize.height > 0,
"Board size must be positive"))
return false;
}
if(!checkAssertion(parser.get<std::string>("of").find(".xml") > 0,
"Wrong output file name: correct format is [name].xml"))
return false;
loadFromFile(parser.get<std::string>("pf"));
return true;
}

View File

@ -0,0 +1,29 @@
#ifndef PARAMETERS_CONTROLLER_HPP
#define PARAMETERS_CONTROLLER_HPP
#include <string>
#include <opencv2/core.hpp>
#include "calibCommon.hpp"
namespace calib {
class parametersController
{
protected:
captureParameters mCapParams;
internalParameters mInternalParameters;
bool loadFromFile(const std::string& inputFileName);
public:
parametersController();
parametersController(cv::Ptr<captureParameters> params);
captureParameters getCaptureParameters() const;
internalParameters getInternalParameters() const;
bool loadFromParser(cv::CommandLineParser& parser);
};
}
#endif

View File

@ -0,0 +1,121 @@
#include "rotationConverters.hpp"
#include <cmath>
#include <opencv2/calib3d.hpp>
#include <opencv2/core.hpp>
#define CALIB_PI 3.14159265358979323846
#define CALIB_PI_2 1.57079632679489661923
void calib::Euler(const cv::Mat& src, cv::Mat& dst, int argType)
{
if((src.rows == 3) && (src.cols == 3))
{
//convert rotaion matrix to 3 angles (pitch, yaw, roll)
dst = cv::Mat(3, 1, CV_64F);
double pitch, yaw, roll;
if(src.at<double>(0,2) < -0.998)
{
pitch = -atan2(src.at<double>(1,0), src.at<double>(1,1));
yaw = -CALIB_PI_2;
roll = 0.;
}
else if(src.at<double>(0,2) > 0.998)
{
pitch = atan2(src.at<double>(1,0), src.at<double>(1,1));
yaw = CALIB_PI_2;
roll = 0.;
}
else
{
pitch = atan2(-src.at<double>(1,2), src.at<double>(2,2));
yaw = asin(src.at<double>(0,2));
roll = atan2(-src.at<double>(0,1), src.at<double>(0,0));
}
if(argType == CALIB_DEGREES)
{
pitch *= 180./CALIB_PI;
yaw *= 180./CALIB_PI;
roll *= 180./CALIB_PI;
}
else if(argType != CALIB_RADIANS)
CV_Error(cv::Error::StsBadFlag, "Invalid argument type");
dst.at<double>(0,0) = pitch;
dst.at<double>(1,0) = yaw;
dst.at<double>(2,0) = roll;
}
else if( (src.cols == 1 && src.rows == 3) ||
(src.cols == 3 && src.rows == 1 ) )
{
//convert vector which contains 3 angles (pitch, yaw, roll) to rotaion matrix
double pitch, yaw, roll;
if(src.cols == 1 && src.rows == 3)
{
pitch = src.at<double>(0,0);
yaw = src.at<double>(1,0);
roll = src.at<double>(2,0);
}
else{
pitch = src.at<double>(0,0);
yaw = src.at<double>(0,1);
roll = src.at<double>(0,2);
}
if(argType == CALIB_DEGREES)
{
pitch *= CALIB_PI / 180.;
yaw *= CALIB_PI / 180.;
roll *= CALIB_PI / 180.;
}
else if(argType != CALIB_RADIANS)
CV_Error(cv::Error::StsBadFlag, "Invalid argument type");
dst = cv::Mat(3, 3, CV_64F);
cv::Mat M(3, 3, CV_64F);
cv::Mat i = cv::Mat::eye(3, 3, CV_64F);
i.copyTo(dst);
i.copyTo(M);
double* pR = dst.ptr<double>();
pR[4] = cos(pitch);
pR[7] = sin(pitch);
pR[8] = pR[4];
pR[5] = -pR[7];
double* pM = M.ptr<double>();
pM[0] = cos(yaw);
pM[2] = sin(yaw);
pM[8] = pM[0];
pM[6] = -pM[2];
dst *= M;
i.copyTo(M);
pM[0] = cos(roll);
pM[3] = sin(roll);
pM[4] = pM[0];
pM[1] = -pM[3];
dst *= M;
}
else
CV_Error(cv::Error::StsBadFlag, "Input matrix must be 1x3, 3x1 or 3x3" );
}
void calib::RodriguesToEuler(const cv::Mat& src, cv::Mat& dst, int argType)
{
CV_Assert((src.cols == 1 && src.rows == 3) || (src.cols == 3 && src.rows == 1));
cv::Mat R;
cv::Rodrigues(src, R);
Euler(R, dst, argType);
}
void calib::EulerToRodrigues(const cv::Mat& src, cv::Mat& dst, int argType)
{
CV_Assert((src.cols == 1 && src.rows == 3) || (src.cols == 3 && src.rows == 1));
cv::Mat R;
Euler(src, R, argType);
cv::Rodrigues(R, dst);
}

View File

@ -0,0 +1,16 @@
#ifndef RAOTATION_CONVERTERS_HPP
#define RAOTATION_CONVERTERS_HPP
#include <opencv2/core.hpp>
namespace calib
{
#define CALIB_RADIANS 0
#define CALIB_DEGREES 1
void Euler(const cv::Mat& src, cv::Mat& dst, int argType = CALIB_RADIANS);
void RodriguesToEuler(const cv::Mat& src, cv::Mat& dst, int argType = CALIB_RADIANS);
void EulerToRodrigues(const cv::Mat& src, cv::Mat& dst, int argType = CALIB_RADIANS);
}
#endif

View File

@ -23,7 +23,6 @@ set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
INSTALL_NAME_DIR lib
OUTPUT_NAME "opencv_traincascade")
if(ENABLE_SOLUTION_FOLDERS)

View File

@ -190,6 +190,7 @@ bool CvCascadeClassifier::train( const string _cascadeDirName,
cascadeParams.printAttrs();
stageParams->printAttrs();
featureParams->printAttrs();
cout << "Number of unique features given windowSize [" << _cascadeParams.winSize.width << "," << _cascadeParams.winSize.height << "] : " << featureEvaluator->getNumFeatures() << "" << endl;
int startNumStages = (int)stageClassifiers.size();
if ( startNumStages > 1 )
@ -289,7 +290,7 @@ int CvCascadeClassifier::predict( int sampleIdx )
{
CV_DbgAssert( sampleIdx < numPos + numNeg );
for (vector< Ptr<CvCascadeBoost> >::iterator it = stageClassifiers.begin();
it != stageClassifiers.end(); it++ )
it != stageClassifiers.end();++it )
{
if ( (*it)->predict( sampleIdx ) == 0.f )
return 0;
@ -364,7 +365,7 @@ void CvCascadeClassifier::writeStages( FileStorage &fs, const Mat& featureMap )
int i = 0;
fs << CC_STAGES << "[";
for( vector< Ptr<CvCascadeBoost> >::const_iterator it = stageClassifiers.begin();
it != stageClassifiers.end(); it++, i++ )
it != stageClassifiers.end();++it, ++i )
{
sprintf( cmnt, "stage %d", i );
cvWriteComment( fs.fs, cmnt, 0 );
@ -556,7 +557,7 @@ void CvCascadeClassifier::getUsedFeaturesIdxMap( Mat& featureMap )
featureMap.setTo(Scalar(-1));
for( vector< Ptr<CvCascadeBoost> >::const_iterator it = stageClassifiers.begin();
it != stageClassifiers.end(); it++ )
it != stageClassifiers.end();++it )
(*it)->markUsedFeaturesInMap( featureMap );
for( int fi = 0, idx = 0; fi < varCount; fi++ )

View File

@ -28,7 +28,7 @@ CvCascadeImageReader::NegReader::NegReader()
bool CvCascadeImageReader::NegReader::create( const string _filename, Size _winSize )
{
string dirname, str;
string str;
std::ifstream file(_filename.c_str());
if ( !file.is_open() )
return false;

View File

@ -0,0 +1,37 @@
SET(OPENCV_VISUALISATION_DEPS opencv_core opencv_highgui opencv_imgproc opencv_videoio opencv_imgcodecs)
ocv_check_dependencies(${OPENCV_VISUALISATION_DEPS})
if(NOT OCV_DEPENDENCIES_FOUND)
return()
endif()
project(visualisation)
set(the_target opencv_visualisation)
ocv_target_include_directories(${the_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${OpenCV_SOURCE_DIR}/include/opencv")
ocv_target_include_modules_recurse(${the_target} ${OPENCV_VISUALISATION_DEPS})
file(GLOB SRCS *.cpp)
set(visualisation_files ${SRCS})
ocv_add_executable(${the_target} ${visualisation_files})
ocv_target_link_libraries(${the_target} ${OPENCV_VISUALISATION_DEPS})
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
INSTALL_NAME_DIR lib
OUTPUT_NAME "opencv_visualisation")
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "applications")
endif()
if(INSTALL_CREATE_DISTRIB)
if(BUILD_SHARED_LIBS)
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
endif()
else()
install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
endif()

View File

@ -0,0 +1,362 @@
////////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
////////////////////////////////////////////////////////////////////////////////////////
/*****************************************************************************************************
Software for visualising cascade classifier models trained by OpenCV and to get a better
understanding of the used features.
USAGE:
./opencv_visualisation --model=<model.xml> --image=<ref.png> --data=<video output folder>
LIMITS
- Use an absolute path for the output folder to ensure the tool works
- Only handles cascade classifier models
- Handles stumps only for the moment
- Needs a valid training/test sample window with the original model dimensions, passed as `ref.png`
- Can handle HAAR and LBP features
Created by: Puttemans Steven - April 2016
*****************************************************************************************************/
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/videoio.hpp>
#include <fstream>
#include <iostream>
using namespace std;
using namespace cv;
struct rect_data{
int x;
int y;
int w;
int h;
float weight;
};
int main( int argc, const char** argv )
{
CommandLineParser parser(argc, argv,
"{ help h usage ? | | show this message }"
"{ image i | | (required) path to reference image }"
"{ model m | | (required) path to cascade xml file }"
"{ data d | | (optional) path to video output folder }"
);
// Read in the input arguments
if (parser.has("help")){
parser.printMessage();
return 0;
}
string model(parser.get<string>("model"));
string output_folder(parser.get<string>("data"));
string image_ref = (parser.get<string>("image"));
if (model.empty() || image_ref.empty()){
parser.printMessage();
return -1;
}
// Value for timing
// You can increase this to have a better visualisation during the generation
int timing = 1;
// Value for cols of storing elements
int cols_prefered = 5;
// Open the XML model
FileStorage fs;
bool model_ok = fs.open(model, FileStorage::READ);
if (!model_ok){
cerr << "the cascade file '" << model << "' could not be loaded." << endl;
return -1;
}
// Get a the required information
// First decide which feature type we are using
FileNode cascade = fs["cascade"];
string feature_type = cascade["featureType"];
bool haar = false, lbp = false;
if (feature_type.compare("HAAR") == 0){
haar = true;
}
if (feature_type.compare("LBP") == 0){
lbp = true;
}
if ( feature_type.compare("HAAR") != 0 && feature_type.compare("LBP")){
cerr << "The model is not an HAAR or LBP feature based model!" << endl;
cerr << "Please select a model that can be visualized by the software." << endl;
return -1;
}
// We make a visualisation mask - which increases the window to make it at least a bit more visible
int resize_factor = 10;
int resize_storage_factor = 10;
Mat reference_image = imread(image_ref, IMREAD_GRAYSCALE );
if (reference_image.empty()){
cerr << "the reference image '" << image_ref << "'' could not be loaded." << endl;
return -1;
}
Mat visualization;
resize(reference_image, visualization, Size(reference_image.cols * resize_factor, reference_image.rows * resize_factor));
// First recover for each stage the number of weak features and their index
// Important since it is NOT sequential when using LBP features
vector< vector<int> > stage_features;
FileNode stages = cascade["stages"];
FileNodeIterator it_stages = stages.begin(), it_stages_end = stages.end();
int idx = 0;
for( ; it_stages != it_stages_end; it_stages++, idx++ ){
vector<int> current_feature_indexes;
FileNode weak_classifiers = (*it_stages)["weakClassifiers"];
FileNodeIterator it_weak = weak_classifiers.begin(), it_weak_end = weak_classifiers.end();
vector<int> values;
for(int idy = 0; it_weak != it_weak_end; it_weak++, idy++ ){
(*it_weak)["internalNodes"] >> values;
current_feature_indexes.push_back( (int)values[2] );
}
stage_features.push_back(current_feature_indexes);
}
// If the output option has been chosen than we will store a combined image plane for
// each stage, containing all weak classifiers for that stage.
bool draw_planes = false;
stringstream output_video;
output_video << output_folder << "model_visualization.avi";
VideoWriter result_video;
if( output_folder.compare("") != 0 ){
draw_planes = true;
result_video.open(output_video.str(), VideoWriter::fourcc('X','V','I','D'), 15, Size(reference_image.cols * resize_factor, reference_image.rows * resize_factor), false);
}
if(haar){
// Grab the corresponding features dimensions and weights
FileNode features = cascade["features"];
vector< vector< rect_data > > feature_data;
FileNodeIterator it_features = features.begin(), it_features_end = features.end();
for(int idf = 0; it_features != it_features_end; it_features++, idf++ ){
vector< rect_data > current_feature_rectangles;
FileNode rectangles = (*it_features)["rects"];
int nrects = (int)rectangles.size();
for(int k = 0; k < nrects; k++){
rect_data current_data;
FileNode single_rect = rectangles[k];
current_data.x = (int)single_rect[0];
current_data.y = (int)single_rect[1];
current_data.w = (int)single_rect[2];
current_data.h = (int)single_rect[3];
current_data.weight = (float)single_rect[4];
current_feature_rectangles.push_back(current_data);
}
feature_data.push_back(current_feature_rectangles);
}
// Loop over each possible feature on its index, visualise on the mask and wait a bit,
// then continue to the next feature.
// If visualisations should be stored then do the in between calculations
Mat image_plane;
Mat metadata = Mat::zeros(150, 1000, CV_8UC1);
vector< rect_data > current_rects;
for(int sid = 0; sid < (int)stage_features.size(); sid ++){
if(draw_planes){
int features_nmbr = (int)stage_features[sid].size();
int cols = cols_prefered;
int rows = features_nmbr / cols;
if( (features_nmbr % cols) > 0){
rows++;
}
image_plane = Mat::zeros(reference_image.rows * resize_storage_factor * rows, reference_image.cols * resize_storage_factor * cols, CV_8UC1);
}
for(int fid = 0; fid < (int)stage_features[sid].size(); fid++){
stringstream meta1, meta2;
meta1 << "Stage " << sid << " / Feature " << fid;
meta2 << "Rectangles: ";
Mat temp_window = visualization.clone();
Mat temp_metadata = metadata.clone();
int current_feature_index = stage_features[sid][fid];
current_rects = feature_data[current_feature_index];
Mat single_feature = reference_image.clone();
resize(single_feature, single_feature, Size(), resize_storage_factor, resize_storage_factor);
for(int i = 0; i < (int)current_rects.size(); i++){
rect_data local = current_rects[i];
if(draw_planes){
if(local.weight >= 0){
rectangle(single_feature, Rect(local.x * resize_storage_factor, local.y * resize_storage_factor, local.w * resize_storage_factor, local.h * resize_storage_factor), Scalar(0), FILLED);
}else{
rectangle(single_feature, Rect(local.x * resize_storage_factor, local.y * resize_storage_factor, local.w * resize_storage_factor, local.h * resize_storage_factor), Scalar(255), FILLED);
}
}
Rect part(local.x * resize_factor, local.y * resize_factor, local.w * resize_factor, local.h * resize_factor);
meta2 << part << " (w " << local.weight << ") ";
if(local.weight >= 0){
rectangle(temp_window, part, Scalar(0), FILLED);
}else{
rectangle(temp_window, part, Scalar(255), FILLED);
}
}
imshow("features", temp_window);
putText(temp_window, meta1.str(), Point(15,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
result_video.write(temp_window);
// Copy the feature image if needed
if(draw_planes){
single_feature.copyTo(image_plane(Rect(0 + (fid%cols_prefered)*single_feature.cols, 0 + (fid/cols_prefered) * single_feature.rows, single_feature.cols, single_feature.rows)));
}
putText(temp_metadata, meta1.str(), Point(15,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
putText(temp_metadata, meta2.str(), Point(15,40), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
imshow("metadata", temp_metadata);
waitKey(timing);
}
//Store the stage image if needed
if(draw_planes){
stringstream save_location;
save_location << output_folder << "stage_" << sid << ".png";
imwrite(save_location.str(), image_plane);
}
}
}
if(lbp){
// Grab the corresponding features dimensions and weights
FileNode features = cascade["features"];
vector<Rect> feature_data;
FileNodeIterator it_features = features.begin(), it_features_end = features.end();
for(int idf = 0; it_features != it_features_end; it_features++, idf++ ){
FileNode rectangle = (*it_features)["rect"];
Rect current_feature ((int)rectangle[0], (int)rectangle[1], (int)rectangle[2], (int)rectangle[3]);
feature_data.push_back(current_feature);
}
// Loop over each possible feature on its index, visualise on the mask and wait a bit,
// then continue to the next feature.
Mat image_plane;
Mat metadata = Mat::zeros(150, 1000, CV_8UC1);
for(int sid = 0; sid < (int)stage_features.size(); sid ++){
if(draw_planes){
int features_nmbr = (int)stage_features[sid].size();
int cols = cols_prefered;
int rows = features_nmbr / cols;
if( (features_nmbr % cols) > 0){
rows++;
}
image_plane = Mat::zeros(reference_image.rows * resize_storage_factor * rows, reference_image.cols * resize_storage_factor * cols, CV_8UC1);
}
for(int fid = 0; fid < (int)stage_features[sid].size(); fid++){
stringstream meta1, meta2;
meta1 << "Stage " << sid << " / Feature " << fid;
meta2 << "Rectangle: ";
Mat temp_window = visualization.clone();
Mat temp_metadata = metadata.clone();
int current_feature_index = stage_features[sid][fid];
Rect current_rect = feature_data[current_feature_index];
Mat single_feature = reference_image.clone();
resize(single_feature, single_feature, Size(), resize_storage_factor, resize_storage_factor);
// VISUALISATION
// The rectangle is the top left one of a 3x3 block LBP constructor
Rect resized(current_rect.x * resize_factor, current_rect.y * resize_factor, current_rect.width * resize_factor, current_rect.height * resize_factor);
meta2 << resized;
// Top left
rectangle(temp_window, resized, Scalar(255), 1);
// Top middle
rectangle(temp_window, Rect(resized.x + resized.width, resized.y, resized.width, resized.height), Scalar(255), 1);
// Top right
rectangle(temp_window, Rect(resized.x + 2*resized.width, resized.y, resized.width, resized.height), Scalar(255), 1);
// Middle left
rectangle(temp_window, Rect(resized.x, resized.y + resized.height, resized.width, resized.height), Scalar(255), 1);
// Middle middle
rectangle(temp_window, Rect(resized.x + resized.width, resized.y + resized.height, resized.width, resized.height), Scalar(255), FILLED);
// Middle right
rectangle(temp_window, Rect(resized.x + 2*resized.width, resized.y + resized.height, resized.width, resized.height), Scalar(255), 1);
// Bottom left
rectangle(temp_window, Rect(resized.x, resized.y + 2*resized.height, resized.width, resized.height), Scalar(255), 1);
// Bottom middle
rectangle(temp_window, Rect(resized.x + resized.width, resized.y + 2*resized.height, resized.width, resized.height), Scalar(255), 1);
// Bottom right
rectangle(temp_window, Rect(resized.x + 2*resized.width, resized.y + 2*resized.height, resized.width, resized.height), Scalar(255), 1);
if(draw_planes){
Rect resized_inner(current_rect.x * resize_storage_factor, current_rect.y * resize_storage_factor, current_rect.width * resize_storage_factor, current_rect.height * resize_storage_factor);
// Top left
rectangle(single_feature, resized_inner, Scalar(255), 1);
// Top middle
rectangle(single_feature, Rect(resized_inner.x + resized_inner.width, resized_inner.y, resized_inner.width, resized_inner.height), Scalar(255), 1);
// Top right
rectangle(single_feature, Rect(resized_inner.x + 2*resized_inner.width, resized_inner.y, resized_inner.width, resized_inner.height), Scalar(255), 1);
// Middle left
rectangle(single_feature, Rect(resized_inner.x, resized_inner.y + resized_inner.height, resized_inner.width, resized_inner.height), Scalar(255), 1);
// Middle middle
rectangle(single_feature, Rect(resized_inner.x + resized_inner.width, resized_inner.y + resized_inner.height, resized_inner.width, resized_inner.height), Scalar(255), FILLED);
// Middle right
rectangle(single_feature, Rect(resized_inner.x + 2*resized_inner.width, resized_inner.y + resized_inner.height, resized_inner.width, resized_inner.height), Scalar(255), 1);
// Bottom left
rectangle(single_feature, Rect(resized_inner.x, resized_inner.y + 2*resized_inner.height, resized_inner.width, resized_inner.height), Scalar(255), 1);
// Bottom middle
rectangle(single_feature, Rect(resized_inner.x + resized_inner.width, resized_inner.y + 2*resized_inner.height, resized_inner.width, resized_inner.height), Scalar(255), 1);
// Bottom right
rectangle(single_feature, Rect(resized_inner.x + 2*resized_inner.width, resized_inner.y + 2*resized_inner.height, resized_inner.width, resized_inner.height), Scalar(255), 1);
single_feature.copyTo(image_plane(Rect(0 + (fid%cols_prefered)*single_feature.cols, 0 + (fid/cols_prefered) * single_feature.rows, single_feature.cols, single_feature.rows)));
}
putText(temp_metadata, meta1.str(), Point(15,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
putText(temp_metadata, meta2.str(), Point(15,40), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
imshow("metadata", temp_metadata);
imshow("features", temp_window);
putText(temp_window, meta1.str(), Point(15,15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255));
result_video.write(temp_window);
waitKey(timing);
}
//Store the stage image if needed
if(draw_planes){
stringstream save_location;
save_location << output_folder << "stage_" << sid << ".png";
imwrite(save_location.str(), image_plane);
}
}
}
return 0;
}

View File

@ -287,7 +287,7 @@ macro(add_android_project target path)
set(android_proj_NATIVE_DEPS ${android_proj_NATIVE_DEPS} android)
endif()
add_library(${JNI_LIB_NAME} MODULE ${android_proj_jni_files})
add_library(${JNI_LIB_NAME} SHARED ${android_proj_jni_files})
ocv_target_include_modules_recurse(${JNI_LIB_NAME} ${android_proj_NATIVE_DEPS})
ocv_target_include_directories(${JNI_LIB_NAME} "${path}/jni")
ocv_target_link_libraries(${JNI_LIB_NAME} ${OPENCV_LINKER_LIBS} ${android_proj_NATIVE_DEPS})

View File

@ -33,12 +33,17 @@ if(CUDA_FOUND)
if(WIN32)
find_cuda_helper_libs(nvcuvenc)
endif()
set(HAVE_NVCUVID 1)
if(CUDA_nvcuvid_LIBRARY)
set(HAVE_NVCUVID 1)
endif()
if(CUDA_nvcuvenc_LIBRARY)
set(HAVE_NVCUVENC 1)
endif()
endif()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
set(_generations "Fermi" "Kepler")
set(_generations "Fermi" "Kepler" "Maxwell" "Pascal")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND _generations "Auto")
endif()
@ -58,13 +63,13 @@ if(CUDA_FOUND)
set(__cuda_arch_ptx "")
if(CUDA_GENERATION STREQUAL "Fermi")
set(__cuda_arch_bin "2.0 2.1(2.0)")
set(__cuda_arch_bin "2.0")
elseif(CUDA_GENERATION STREQUAL "Kepler")
if(${CUDA_VERSION} VERSION_LESS "5.0")
set(__cuda_arch_bin "3.0")
else()
set(__cuda_arch_bin "3.0 3.5")
endif()
set(__cuda_arch_bin "3.0 3.5")
elseif(CUDA_GENERATION STREQUAL "Maxwell")
set(__cuda_arch_bin "5.0")
elseif(CUDA_GENERATION STREQUAL "Pascal")
set(__cuda_arch_bin "6.0")
elseif(CUDA_GENERATION STREQUAL "Auto")
execute_process( COMMAND "${CUDA_NVCC_EXECUTABLE}" "${OpenCV_SOURCE_DIR}/cmake/checks/OpenCVDetectCudaArch.cu" "--run"
WORKING_DIRECTORY "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/"
@ -86,14 +91,12 @@ if(CUDA_FOUND)
set(__cuda_arch_bin "5.3")
set(__cuda_arch_ptx "")
else()
if(${CUDA_VERSION} VERSION_LESS "5.0")
set(__cuda_arch_bin "1.1 1.2 1.3 2.0 2.1(2.0) 3.0")
elseif(${CUDA_VERSION} VERSION_GREATER "6.5")
set(__cuda_arch_bin "2.0 2.1(2.0) 3.0 3.5")
if(${CUDA_VERSION} VERSION_LESS "8.0")
set(__cuda_arch_bin "2.0 3.0 3.5 5.0")
else()
set(__cuda_arch_bin "1.1 1.2 1.3 2.0 2.1(2.0) 3.0 3.5")
set(__cuda_arch_bin "2.0 3.0 3.5 5.0 6.0")
endif()
set(__cuda_arch_ptx "3.0")
set(__cuda_arch_ptx "")
endif()
endif()
@ -133,6 +136,7 @@ if(CUDA_FOUND)
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
endif()
endforeach()
set(NVCC_FLAGS_EXTRA ${NVCC_FLAGS_EXTRA} -D_FORCE_INLINES)
# Tell NVCC to add PTX intermediate code for the specified architectures
string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_NO_POINTS}")

View File

@ -1,21 +1,21 @@
set(OPENCL_FOUND ON CACHE BOOL "OpenCL library is found")
if(APPLE)
set(OPENCL_FOUND YES)
set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library")
set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory")
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
set(HAVE_OPENCL_STATIC ON)
set(OPENCL_INCLUDE_DIR "" CACHE PATH "OpenCL include directory")
else(APPLE)
set(OPENCL_FOUND YES)
set(HAVE_OPENCL_STATIC OFF)
set(OPENCL_INCLUDE_DIR "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/1.2")
set(OPENCL_LIBRARY "" CACHE STRING "OpenCL library")
set(OPENCL_INCLUDE_DIR "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/1.2" CACHE PATH "OpenCL include directory")
endif(APPLE)
if(WINRT)
set(OPENCL_FOUND NO)
set(HAVE_OPENCL_STATIC OFF)
endif(WINRT)
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
if(OPENCL_FOUND)
if(OPENCL_LIBRARY)
set(HAVE_OPENCL_STATIC ON)
set(OPENCL_LIBRARIES "${OPENCL_LIBRARY}")
else()
set(HAVE_OPENCL_STATIC OFF)
endif()
if(NOT HAVE_OPENCL_STATIC)
try_compile(__VALID_OPENCL
"${OpenCV_BINARY_DIR}"
@ -29,20 +29,12 @@ if(OPENCL_FOUND)
endif()
endif()
if(NOT WINRT)
set(HAVE_OPENCL 1)
endif()
set(HAVE_OPENCL 1)
if(WITH_OPENCL_SVM)
set(HAVE_OPENCL_SVM 1)
endif()
if(HAVE_OPENCL_STATIC)
set(OPENCL_LIBRARIES "${OPENCL_LIBRARY}")
else()
unset(OPENCL_LIBRARIES)
endif()
set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
if(WITH_OPENCLAMDFFT)

Some files were not shown because too many files have changed in this diff Show More