275 lines
9.7 KiB
C++
275 lines
9.7 KiB
C++
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 * License Agreement
 * For Open Source Computer Vision Library
 * (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the names of the copyright holders nor the names of the contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
|
|
|
|
#include <cfloat>
|
|
#include <cmath>
|
|
|
|
#include "common.hpp"
|
|
|
|
namespace CAROTENE_NS {
|
|
|
|
#ifdef CAROTENE_NEON
|
|
|
|
namespace {

/*
 * FASTATAN2CONST(scale)
 *
 * Declares, in the scope where it is expanded, every constant that
 * FASTATAN2SCALAR and FASTATAN2VECTOR refer to by name:
 *
 *   P1, P3, P5, P7      coefficients of the odd polynomial
 *                       P1*c + P3*c^3 + P5*c^5 + P7*c^7, each already multiplied
 *                       by (180 / M_PI) * scale;
 *   A_90, A_180, A_360  90, 180 and 360 multiplied by scale;
 *   eps                 (float)DBL_EPSILON in all four lanes;
 *   _90, _180, _360     A_90, A_180, A_360 in all four lanes;
 *   z                   0.0f in all four lanes;
 *   p1, p3, p5, p7      P1, P3, P5, P7 in all four lanes.
 *
 * `scale` is substituted textually and is NOT parenthesised: an argument such
 * as `256.0f / 360.0f` becomes part of the surrounding product, e.g.
 * `90.f * 256.0f / 360.0f`, and is evaluated left to right with it.
 * The expansion ends with its own ';', so it is used without a trailing one.
 */
#define FASTATAN2CONST(scale)                                                    \
    const f32 P1 = (f32)( 0.9997878412794807  * (180.0 / M_PI) * scale);         \
    const f32 P3 = (f32)(-0.3258083974640975  * (180.0 / M_PI) * scale);         \
    const f32 P5 = (f32)( 0.1555786518463281  * (180.0 / M_PI) * scale);         \
    const f32 P7 = (f32)(-0.04432655554792128 * (180.0 / M_PI) * scale);         \
    const f32 A_90  = (f32)(90.f * scale);                                       \
    const f32 A_180 = (f32)(180.f * scale);                                      \
    const f32 A_360 = (f32)(360.f * scale);                                      \
    const float32x4_t eps  = vdupq_n_f32((float)DBL_EPSILON);                    \
    const float32x4_t _90  = vdupq_n_f32(A_90);                                  \
    const float32x4_t _180 = vdupq_n_f32(A_180);                                 \
    const float32x4_t _360 = vdupq_n_f32(A_360);                                 \
    const float32x4_t z    = vdupq_n_f32(0.0f);                                  \
    const float32x4_t p1   = vdupq_n_f32(P1);                                    \
    const float32x4_t p3   = vdupq_n_f32(P3);                                    \
    const float32x4_t p5   = vdupq_n_f32(P5);                                    \
    const float32x4_t p7   = vdupq_n_f32(P7);

/*
 * FASTATAN2SCALAR(y, x, a)
 *
 * Scalar polynomial approximation of the angle of the vector (x, y); the
 * result is stored in `a`. Requires the names declared by FASTATAN2CONST.
 *
 * The smaller of |x|, |y| is divided by the larger one (plus DBL_EPSILON, so
 * the divisor is never zero) and the polynomial is evaluated on that ratio.
 * When |y| is the larger one the polynomial value is subtracted from A_90.
 * The result is then reflected to A_180 - a for x < 0 and afterwards to
 * A_360 - a for y < 0.
 *
 * `x` and `y` are evaluated more than once; pass expressions without side
 * effects. The expansion is a braced block and is used without a trailing ';'.
 */
#define FASTATAN2SCALAR(y, x, a)                                                             \
    {                                                                                        \
        const f32 atanAbsX = std::abs(x);                                                    \
        const f32 atanAbsY = std::abs(y);                                                    \
        /* written as a negated >= so that unordered (NaN) operands take this branch */      \
        if (!(atanAbsX >= atanAbsY))                                                         \
        {                                                                                    \
            const f32 atanRatio = atanAbsX / (atanAbsY + (float)DBL_EPSILON);                \
            const f32 atanRatioSq = atanRatio * atanRatio;                                   \
            a = A_90 - (((P7 * atanRatioSq + P5) * atanRatioSq + P3) * atanRatioSq + P1) * atanRatio; \
        }                                                                                    \
        else                                                                                 \
        {                                                                                    \
            const f32 atanRatio = atanAbsY / (atanAbsX + (float)DBL_EPSILON);                \
            const f32 atanRatioSq = atanRatio * atanRatio;                                   \
            a = (((P7 * atanRatioSq + P5) * atanRatioSq + P3) * atanRatioSq + P1) * atanRatio; \
        }                                                                                    \
        if (x < 0)                                                                           \
            a = A_180 - a;                                                                   \
        if (y < 0)                                                                           \
            a = A_360 - a;                                                                   \
    }

/*
 * FASTATAN2VECTOR(v_y, v_x, a)
 *
 * Four-lane counterpart of FASTATAN2SCALAR operating on float32x4_t values;
 * the result is stored in `a`. Requires the names declared by FASTATAN2CONST.
 *
 * The ratio min(|x|,|y|) / (max(|x|,|y|) + eps) is formed by multiplying with
 * internal::vrecpq_f32(...) instead of dividing, the polynomial is evaluated
 * in Horner form, and the three per-lane selections mirror the scalar
 * branches: _90 - a where |x| >= |y| does not hold, _180 - a where x < 0,
 * _360 - a where y < 0.
 *
 * `v_x` and `v_y` are evaluated more than once. The expansion is a braced
 * block and is used without a trailing ';'.
 */
#define FASTATAN2VECTOR(v_y, v_x, a)                                                         \
    {                                                                                        \
        const float32x4_t atanAbsX = vabsq_f32(v_x);                                         \
        const float32x4_t atanAbsY = vabsq_f32(v_y);                                         \
        const float32x4_t atanLo = vminq_f32(atanAbsX, atanAbsY);                            \
        const float32x4_t atanHi = vmaxq_f32(atanAbsX, atanAbsY);                            \
        const float32x4_t atanRatio =                                                        \
            vmulq_f32(atanLo, internal::vrecpq_f32(vaddq_f32(atanHi, eps)));                 \
        const float32x4_t atanRatioSq = vmulq_f32(atanRatio, atanRatio);                     \
                                                                                             \
        float32x4_t atanPoly = vmulq_f32(atanRatioSq, p7);                                   \
        atanPoly = vmulq_f32(vaddq_f32(atanPoly, p5), atanRatioSq);                          \
        atanPoly = vmulq_f32(vaddq_f32(atanPoly, p3), atanRatioSq);                          \
        atanPoly = vmulq_f32(vaddq_f32(atanPoly, p1), atanRatio);                            \
                                                                                             \
        const float32x4_t atanFirstQuadrant =                                                \
            vbslq_f32(vcgeq_f32(atanAbsX, atanAbsY), atanPoly, vsubq_f32(_90, atanPoly));    \
        const float32x4_t atanUpperHalf =                                                    \
            vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, atanFirstQuadrant), atanFirstQuadrant); \
        a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, atanUpperHalf), atanUpperHalf);     \
    }

} // namespace
|
|
|
|
#endif
|
|
|
|
void phase(const Size2D &size,
|
|
const s16 * src0Base, ptrdiff_t src0Stride,
|
|
const s16 * src1Base, ptrdiff_t src1Stride,
|
|
u8 * dstBase, ptrdiff_t dstStride)
|
|
{
|
|
internal::assertSupportedConfiguration();
|
|
#ifdef CAROTENE_NEON
|
|
FASTATAN2CONST(256.0f / 360.0f)
|
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
|
|
|
float32x4_t v_05 = vdupq_n_f32(0.5f);
|
|
|
|
for (size_t i = 0; i < size.height; ++i)
|
|
{
|
|
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
|
const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
|
|
u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
|
size_t j = 0;
|
|
|
|
for (; j < roiw16; j += 16)
|
|
{
|
|
internal::prefetch(src0 + j);
|
|
internal::prefetch(src1 + j);
|
|
|
|
int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
|
|
int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
|
|
|
|
// 0
|
|
float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
|
|
float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
|
|
float32x4_t v_dst32f0;
|
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
|
|
|
|
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
|
|
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
|
|
float32x4_t v_dst32f1;
|
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
|
|
|
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
|
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
|
|
|
// 1
|
|
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
|
|
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
|
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
|
|
|
|
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
|
|
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
|
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
|
|
|
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
|
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
|
|
|
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
|
|
vmovn_u16(v_dst16s1)));
|
|
}
|
|
for (; j < roiw8; j += 8)
|
|
{
|
|
int16x8_t v_src0 = vld1q_s16(src0 + j);
|
|
int16x8_t v_src1 = vld1q_s16(src1 + j);
|
|
|
|
float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
|
|
float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
|
|
float32x4_t v_dst32f0;
|
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)
|
|
|
|
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
|
|
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
|
|
float32x4_t v_dst32f1;
|
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
|
|
|
uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
|
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
|
|
|
vst1_u8(dst + j, vmovn_u16(v_dst));
|
|
}
|
|
|
|
for (; j < size.width; j++)
|
|
{
|
|
f32 x = src0[j], y = src1[j];
|
|
f32 a;
|
|
FASTATAN2SCALAR(y, x, a)
|
|
dst[j] = (u8)(s32)floor(a + 0.5f);
|
|
}
|
|
}
|
|
#else
|
|
(void)size;
|
|
(void)src0Base;
|
|
(void)src0Stride;
|
|
(void)src1Base;
|
|
(void)src1Stride;
|
|
(void)dstBase;
|
|
(void)dstStride;
|
|
#endif
|
|
}
|
|
|
|
void phase(const Size2D &size,
|
|
const f32 * src0Base, ptrdiff_t src0Stride,
|
|
const f32 * src1Base, ptrdiff_t src1Stride,
|
|
f32 * dstBase, ptrdiff_t dstStride,
|
|
f32 scale)
|
|
{
|
|
internal::assertSupportedConfiguration();
|
|
#ifdef CAROTENE_NEON
|
|
FASTATAN2CONST(scale)
|
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
|
|
|
for (size_t i = 0; i < size.height; ++i)
|
|
{
|
|
const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
|
const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
|
|
f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
|
|
size_t j = 0;
|
|
|
|
for (; j < roiw8; j += 8)
|
|
{
|
|
internal::prefetch(src0 + j);
|
|
internal::prefetch(src1 + j);
|
|
|
|
float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4);
|
|
float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4);
|
|
|
|
float32x4_t v_dst32f;
|
|
// 0
|
|
FASTATAN2VECTOR(v_src10, v_src00, v_dst32f)
|
|
vst1q_f32(dst + j, v_dst32f);
|
|
// 1
|
|
FASTATAN2VECTOR(v_src11, v_src01, v_dst32f)
|
|
vst1q_f32(dst + j + 4, v_dst32f);
|
|
}
|
|
if(j + 4 <= size.width)
|
|
{
|
|
float32x4_t v_src0 = vld1q_f32(src0 + j);
|
|
float32x4_t v_src1 = vld1q_f32(src1 + j);
|
|
|
|
float32x4_t v_dst32f;
|
|
FASTATAN2VECTOR(v_src1, v_src0, v_dst32f)
|
|
vst1q_f32(dst + j, v_dst32f);
|
|
j += 4;
|
|
}
|
|
|
|
for (; j < size.width; j++)
|
|
{
|
|
f32 a;
|
|
FASTATAN2SCALAR(src1[j], src0[j], a)
|
|
dst[j] = a;
|
|
}
|
|
}
|
|
#else
|
|
(void)size;
|
|
(void)src0Base;
|
|
(void)src0Stride;
|
|
(void)src1Base;
|
|
(void)src1Stride;
|
|
(void)dstBase;
|
|
(void)dstStride;
|
|
(void)scale;
|
|
#endif
|
|
}
|
|
|
|
} // namespace CAROTENE_NS
|