Merge branch '2.4'

Andrey Kamaev
2013-03-21 20:59:18 +04:00
276 changed files with 11834 additions and 5170 deletions

View File

@@ -3,7 +3,7 @@ if(ANDROID OR IOS)
endif()
set(the_description "GPU-accelerated Computer Vision")
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy)
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

View File

@@ -5,109 +5,6 @@ Feature Detection and Description
gpu::SURF_GPU
-------------
.. ocv:class:: gpu::SURF_GPU
Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
class SURF_GPU
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_GPU();
//! the full constructor taking all the necessary parameters
explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
void uploadKeypoints(const vector<KeyPoint>& keypoints,
GpuMat& keypointsGPU);
//! download keypoints from device to host memory
void downloadKeypoints(const GpuMat& keypointsGPU,
vector<KeyPoint>& keypoints);
//! download descriptors from device to host memory
void downloadDescriptors(const GpuMat& descriptorsGPU,
vector<float>& descriptors);
void operator()(const GpuMat& img, const GpuMat& mask,
GpuMat& keypoints);
void operator()(const GpuMat& img, const GpuMat& mask,
GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false,
bool calcOrientation = true);
void operator()(const GpuMat& img, const GpuMat& mask,
std::vector<KeyPoint>& keypoints);
void operator()(const GpuMat& img, const GpuMat& mask,
std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false,
bool calcOrientation = true);
void operator()(const GpuMat& img, const GpuMat& mask,
std::vector<KeyPoint>& keypoints,
std::vector<float>& descriptors,
bool useProvidedKeypoints = false,
bool calcOrientation = true);
void releaseMemory();
// SURF parameters
double hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = keypointsRatio * img.size().area()
float keypointsRatio;
GpuMat sum, mask1, maskSum, intBuffer;
GpuMat det, trace;
GpuMat maxPosBuffer;
};
The class ``SURF_GPU`` implements the Speeded Up Robust Features (SURF) descriptor. By default, a fast multi-scale Hessian keypoint detector is used to find the keypoints, but descriptors can also be computed for user-specified keypoints. Only 8-bit grayscale images are supported.
The class ``SURF_GPU`` can store results in GPU and CPU memory. It provides functions to convert results between the CPU and GPU representations ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as that of ``SURF``. GPU results are stored in ``GpuMat``. The ``keypoints`` matrix is an :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
* ``keypoints.ptr<float>(X_ROW)[i]`` contains the x coordinate of the i-th feature.
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains the y coordinate of the i-th feature.
* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the Laplacian sign of the i-th feature.
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contains the orientation of the i-th feature.
* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
The ``descriptors`` matrix is an :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
The class ``SURF_GPU`` uses internal buffers and provides access to them. All buffers can be safely released between function calls.
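A minimal usage sketch (assuming the 2.4-style headers ``opencv2/gpu/gpu.hpp`` and ``opencv2/highgui/highgui.hpp`` are included; the input file name is illustrative): ::

    cv::Mat img = cv::imread("aloe.png", cv::IMREAD_GRAYSCALE);  // 8-bit grayscale only
    cv::gpu::GpuMat d_img(img), d_keypoints, d_descriptors;

    cv::gpu::SURF_GPU surf(400.0);                               // hessianThreshold = 400
    surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);  // detect keypoints and compute descriptors

    // convert the GPU results to the CPU SURF format
    std::vector<cv::KeyPoint> keypoints;
    std::vector<float> descriptors;
    surf.downloadKeypoints(d_keypoints, keypoints);
    surf.downloadDescriptors(d_descriptors, descriptors);

    surf.releaseMemory();                                        // inner buffers can be released between calls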
.. seealso:: :ocv:class:`SURF`
gpu::FAST_GPU
-------------
.. ocv:class:: gpu::FAST_GPU

View File

@@ -579,76 +579,6 @@ Releases all inner buffer's memory.
gpu::VIBE_GPU
-------------
.. ocv:class:: gpu::VIBE_GPU
Class used for background/foreground segmentation. ::
class VIBE_GPU
{
public:
explicit VIBE_GPU(unsigned long rngSeed = 1234567);
void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());
void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());
void release();
...
};
The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel that does not fit this model is deemed to be foreground. The class implements the algorithm described in [VIBE2011]_.
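A minimal usage sketch (the video file name is illustrative): ::

    cv::VideoCapture cap("768x576.avi");
    cv::Mat frame;
    cap >> frame;

    cv::gpu::GpuMat d_frame(frame), d_fgmask;
    cv::gpu::VIBE_GPU vibe;
    vibe.initialize(d_frame);        // build the initial background model from the first frame

    while (cap.read(frame))
    {
        d_frame.upload(frame);
        vibe(d_frame, d_fgmask);     // update the model and produce the 8-bit foreground mask
    }

    vibe.release();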
gpu::VIBE_GPU::VIBE_GPU
-----------------------
The constructor.
.. ocv:function:: gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed = 1234567)
:param rngSeed: Seed value used to initialize the random sequence.
Default constructor sets all parameters to default values.
gpu::VIBE_GPU::initialize
-------------------------
Initializes the background model and allocates all inner buffers.
.. ocv:function:: void gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null())
:param firstFrame: First frame from video sequence.
:param stream: Stream for the asynchronous version.
gpu::VIBE_GPU::operator()
-------------------------
Updates the background model and returns the foreground mask.
.. ocv:function:: void gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null())
:param frame: Next video frame.
:param fgmask: The output foreground mask as an 8-bit binary image.
:param stream: Stream for the asynchronous version.
gpu::VIBE_GPU::release
----------------------
Releases all inner buffers' memory.
.. ocv:function:: void gpu::VIBE_GPU::release()
gpu::GMG_GPU
------------
.. ocv:class:: gpu::GMG_GPU
@@ -1209,5 +1139,4 @@ Parse next video frame. Implementation must call this method after new frame was
.. [MOG2001] P. KaewTraKulPong and R. Bowden. *An improved adaptive background mixture model for real-time tracking with shadow detection*. Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001
.. [MOG2004] Z. Zivkovic. *Improved adaptive Gaussian mixture model for background subtraction*. International Conference on Pattern Recognition, UK, August, 2004
.. [ShadowDetect2003] Prati, Mikic, Trivedi and Cucchiara. *Detecting Moving Shadows...*. IEEE PAMI, 2003
.. [VIBE2011] O. Barnich and M. Van Droogenbroeck. *ViBe: A universal background subtraction algorithm for video sequences*. IEEE Transactions on Image Processing, 20(6):1709-1724, June 2011
.. [GMG2012] A. Godbehere, A. Matsukawa and K. Goldberg. *Visual Tracking of Human Visitors under Variable-Lighting Conditions for a Responsive Audio Art Installation*. American Control Conference, Montreal, June 2012

View File

@@ -491,6 +491,26 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat&
//! converts image from one color space to another
CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
enum
{
// Bayer Demosaicing (Malvar, He, and Cutler)
COLOR_BayerBG2BGR_MHT = 256,
COLOR_BayerGB2BGR_MHT = 257,
COLOR_BayerRG2BGR_MHT = 258,
COLOR_BayerGR2BGR_MHT = 259,
COLOR_BayerBG2RGB_MHT = COLOR_BayerRG2BGR_MHT,
COLOR_BayerGB2RGB_MHT = COLOR_BayerGR2BGR_MHT,
COLOR_BayerRG2RGB_MHT = COLOR_BayerBG2BGR_MHT,
COLOR_BayerGR2RGB_MHT = COLOR_BayerGB2BGR_MHT,
COLOR_BayerBG2GRAY_MHT = 260,
COLOR_BayerGB2GRAY_MHT = 261,
COLOR_BayerRG2GRAY_MHT = 262,
COLOR_BayerGR2GRAY_MHT = 263
};
CV_EXPORTS void demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn = -1, Stream& stream = Stream::Null());
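//
// A hedged usage sketch (not part of the header itself): demosaicing a Bayer-pattern image
// with one of the Malvar-He-Cutler codes declared above. The BayerBG layout is only an
// example; the plain cv::COLOR_Bayer*2BGR / *2GRAY codes are also accepted by demosaicing.
//
//    cv::gpu::GpuMat d_bayer;    // CV_8UC1 Bayer-pattern input (assumed)
//    cv::gpu::GpuMat d_bgr;
//    cv::gpu::demosaicing(d_bayer, d_bgr, cv::gpu::COLOR_BayerBG2BGR_MHT);
//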
//! swap channels
//! dstOrder - Integer array describing how channel values are permuted. The n-th entry
//! of the array contains the number of the channel that is stored in the n-th channel of
@@ -894,9 +914,11 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels
//! Calculates histogram for 8u one channel image
//! Output hist will have one row, 256 cols and CV_32SC1 type.
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
//! normalizes the grayscale image brightness and contrast by normalizing its histogram
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
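//
// A hedged usage sketch (not part of the header itself): reusing the explicit histogram and
// buffer overloads across frames avoids per-call allocations; the variable names are illustrative.
//
//    cv::gpu::GpuMat d_src;                   // CV_8UC1 input frame
//    cv::gpu::GpuMat d_dst, d_hist, d_buf;
//    cv::gpu::Stream stream;
//
//    cv::gpu::calcHist(d_src, d_hist, d_buf, stream);             // d_hist: 1 x 256, CV_32SC1
//    cv::gpu::equalizeHist(d_src, d_dst, d_hist, d_buf, stream);  // reuses d_hist and d_buf
//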
//////////////////////////////// StereoBM_GPU ////////////////////////////////
@@ -1386,82 +1408,6 @@ private:
friend class CascadeClassifier_GPU_LBP;
};
////////////////////////////////// SURF //////////////////////////////////////////
class CV_EXPORTS SURF_GPU
{
public:
enum KeypointLayout
{
X_ROW = 0,
Y_ROW,
LAPLACIAN_ROW,
OCTAVE_ROW,
SIZE_ROW,
ANGLE_ROW,
HESSIAN_ROW,
ROWS_COUNT
};
//! the default constructor
SURF_GPU();
//! the full constructor taking all the necessary parameters
explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
//! returns the descriptor size in float's (64 or 128)
int descriptorSize() const;
//! upload host keypoints to device memory
static void uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
//! download keypoints from device to host memory
static void downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints);
//! download descriptors from device to host memory
static void downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors);
//! finds the keypoints using fast hessian detector used in SURF
//! supports CV_8UC1 images
//! keypoints will have nFeature cols and 6 rows
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
//! finds the keypoints and computes their descriptors.
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints = false);
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
bool useProvidedKeypoints = false);
void releaseMemory();
// SURF parameters
double hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
float keypointsRatio;
GpuMat sum, mask1, maskSum, intBuffer;
GpuMat det, trace;
GpuMat maxPosBuffer;
};
////////////////////////////////// FAST //////////////////////////////////////////
class CV_EXPORTS FAST_GPU
@@ -2129,41 +2075,6 @@ private:
GpuMat bgmodelUsedModes_; //keep track of number of modes per pixel
};
/*!
* The class implements the following algorithm:
* "ViBe: A universal background subtraction algorithm for video sequences"
* O. Barnich and M. Van Droogenbroeck
* IEEE Transactions on Image Processing, 20(6):1709-1724, June 2011
*/
class CV_EXPORTS VIBE_GPU
{
public:
//! the default constructor
explicit VIBE_GPU(unsigned long rngSeed = 1234567);
//! re-initialization method
void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());
//! the update operator
void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());
//! releases all inner buffers
void release();
int nbSamples; // number of samples per pixel
int reqMatches; // #_min
int radius; // R
int subsamplingFactor; // amount of random subsampling
private:
Size frameSize_;
unsigned long rngSeed_;
GpuMat randStates_;
GpuMat samples_;
};
/**
* Background Subtractor module. Takes a series of images and returns a sequence of mask (8UC1)
* images of the same size, where 255 indicates Foreground and 0 represents Background.

View File

@@ -0,0 +1,910 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2010-2013, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* Neither the name of NVIDIA Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
#include "common.hpp"
/*
This header file contains inline functions that implement intra-word SIMD
operations that are hardware-accelerated on sm_3x (Kepler) GPUs. Efficient
emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
to make the code portable across all GPUs supported by CUDA. The following
functions are currently implemented:
vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b
vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b
vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b|
vavg2(a,b) per-halfword unsigned average: (a + b) / 2
vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0
vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0
vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0
vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0
vmax2(a,b) per-halfword unsigned maximum: max(a, b)
vmin2(a,b) per-halfword unsigned minimum: min(a, b)
vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b
vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b
vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b|
vavg4(a,b) per-byte unsigned average: (a + b) / 2
vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0
vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0
vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
vsetne4(a,b) per-byte unsigned comparison: a != b ? 1 : 0
vcmpne4(a,b) per-byte unsigned comparison: a != b ? 0xff : 0
vmax4(a,b) per-byte unsigned maximum: max(a, b)
vmin4(a,b) per-byte unsigned minimum: min(a, b)
*/
namespace cv { namespace gpu { namespace device
{
// 2
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = a ^ b; // sum bits
r = a + b; // actual sum
s = s ^ r; // determine carry-ins for each bit position
s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
r = r - s; // subtract out carry-out from low word
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = a ^ b; // difference bits
r = a - b; // actual difference
s = s ^ r; // determine borrow-ins for each bit position
s = s & 0x00010000; // borrow to high word
r = r + s; // compensate for borrow from low word
#endif
return r;
}
static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u, v;
s = a & 0x0000ffff; // extract low halfword
r = b & 0x0000ffff; // extract low halfword
u = ::max(r, s); // maximum of low halfwords
v = ::min(r, s); // minimum of low halfwords
s = a & 0xffff0000; // extract high halfword
r = b & 0xffff0000; // extract high halfword
t = ::max(r, s); // maximum of high halfwords
s = ::min(r, s); // minimum of high halfwords
r = u | t; // maximum of both halfwords
s = v | s; // minimum of both halfwords
r = r - s; // |a - b| = max(a,b) - min(a,b);
#endif
return r;
}
static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
{
unsigned int r, s;
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
s = a ^ b;
r = a & b;
s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
s = s >> 1;
s = r + s;
return s;
}
static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
unsigned int s;
s = a ^ b;
r = a | b;
s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
s = s >> 1;
r = r - s;
#endif
return r;
}
static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r & ~c; // msb = 1, if r was 0x0000
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vseteq2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r & ~c; // msb = 1, if r was 0x0000
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetge2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80008000; // msbs = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetgt2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80008000; // msbs = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetle2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetlt2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80008000; // msb = carry-outs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r | c; // msb = 1, if r was not 0x0000
c = c & 0x80008000; // extract msbs
r = c >> 15; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetne2(a, b);
c = r << 16; // convert bool
r = c - r; // into mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x0000 if a == b
c = r | 0x80008000; // set msbs, to catch carry out
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
c = r | c; // msb = 1, if r was not 0x0000
c = c & 0x80008000; // extract msbs
r = c >> 15; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u;
r = a & 0x0000ffff; // extract low halfword
s = b & 0x0000ffff; // extract low halfword
t = ::max(r, s); // maximum of low halfwords
r = a & 0xffff0000; // extract high halfword
s = b & 0xffff0000; // extract high halfword
u = ::max(r, s); // maximum of high halfwords
r = t | u; // combine halfword maximums
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t, u;
r = a & 0x0000ffff; // extract low halfword
s = b & 0x0000ffff; // extract low halfword
t = ::min(r, s); // minimum of low halfwords
r = a & 0xffff0000; // extract high halfword
s = b & 0xffff0000; // extract high halfword
u = ::min(r, s); // minimum of high halfwords
r = t | u; // combine halfword minimums
#endif
return r;
}
// 4
static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t;
s = a ^ b; // sum bits
r = a & 0x7f7f7f7f; // clear msbs
t = b & 0x7f7f7f7f; // clear msbs
s = s & 0x80808080; // msb sum bits
r = r + t; // add without msbs, record carry-out in msbs
r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out
#endif /* __CUDA_ARCH__ >= 300 */
return r;
}
static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s, t;
s = a ^ ~b; // inverted sum bits
r = a | 0x80808080; // set msbs
t = b & 0x7f7f7f7f; // clear msbs
s = s & 0x80808080; // inverted msb sum bits
r = r - t; // subtract w/o msbs, record inverted borrows in msb
r = r ^ s; // combine inverted msb sum bits and borrows
#endif
return r;
}
static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
{
unsigned int r, s;
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
s = a ^ b;
r = a & b;
s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
s = s >> 1;
s = r + s;
return s;
}
static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
unsigned int c;
c = a ^ b;
r = a | b;
c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
c = c >> 1;
r = r - c;
#endif
return r;
}
static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
r = r ^ c; // extract msbs, msb = 1 if r < 0x80
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r & ~c; // msb = 1, if r was 0x00
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
{
unsigned int r, t;
#if __CUDA_ARCH__ >= 300
r = vseteq4(a, b);
t = r << 8; // convert bool
r = t - r; // to mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
t = a ^ b; // 0x00 if a == b
r = t | 0x80808080; // set msbs, to catch carry out
t = t ^ r; // extract msbs, msb = 1 if t < 0x80
r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
r = t & ~r; // msb = 1, if t was 0x00
t = r >> 7; // build mask
t = r - t; // from
r = t | r; // msbs
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetle4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
c = c & 0x80808080; // msbs = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetlt4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(a));
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
c = c & 0x80808080; // msbs = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
{
unsigned int r, s;
#if __CUDA_ARCH__ >= 300
r = vsetge4(a, b);
s = r << 8; // convert bool
r = s - r; // to mask
#else
asm ("not.b32 %0,%0;" : "+r"(b));
r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
r = r & 0x80808080; // msb = carry-outs
s = r >> 7; // build mask
s = r - s; // from
r = s | r; // msbs
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int c;
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetgt4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
asm("not.b32 %0, %0;" : "+r"(b));
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
c = c & 0x80808080; // msb = carry-outs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
unsigned int c;
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r | c; // msb = 1, if r was not 0x00
c = c & 0x80808080; // extract msbs
r = c >> 7; // convert to bool
#endif
return r;
}
static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
{
unsigned int r, c;
#if __CUDA_ARCH__ >= 300
r = vsetne4(a, b);
c = r << 8; // convert bool
r = c - r; // to mask
#else
// inspired by Alan Mycroft's null-byte detection algorithm:
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
r = a ^ b; // 0x00 if a == b
c = r | 0x80808080; // set msbs, to catch carry out
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
c = r | c; // msb = 1, if r was not 0x00
c = c & 0x80808080; // extract msbs
r = c >> 7; // convert
r = c - r; // msbs to
r = c | r; // mask
#endif
return r;
}
static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(a, b); // mask = 0xff if a >= b
r = a ^ b; //
s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
r = s ^ r; // select a when b >= a, else select b => min(a,b)
r = s - r; // |a - b| = max(a,b) - min(a,b);
#endif
return r;
}
static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(a, b); // mask = 0xff if a >= b
r = a & s; // select a when b >= a
s = b & ~s; // select b when b < a
r = r | s; // combine byte selections
#endif
return r; // byte-wise unsigned maximum
}
static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
{
unsigned int r = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
unsigned int s;
s = vcmpge4(b, a); // mask = 0xff if a >= b
r = a & s; // select a when b >= a
s = b & ~s; // select b when b < a
r = r | s; // combine byte selections
#endif
return r;
}
}}}
#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
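A minimal sketch of using these helpers from a kernel (the kernel name, launch configuration, and include path are illustrative); each 32-bit word packs four unsigned bytes, so one thread processes four pixels per load:

#include "simd_functions.hpp"   // adjust the path to the local include layout

using namespace cv::gpu::device;

// Per-byte absolute difference of two byte-packed images, four pixels per 32-bit word.
__global__ void absDiff4Kernel(const unsigned int* a, const unsigned int* b,
                               unsigned int* dst, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = vabsdiff4(a[i], b[i]);   // |a - b| per byte, no cross-byte carries
}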

View File

@@ -7,7 +7,7 @@
// copy or use the software.
//
//
// License Agreement
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.

View File

@@ -2,6 +2,7 @@
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// StereoBM
@@ -12,7 +13,7 @@ DEF_PARAM_TEST_1(ImagePair, pair_string);
PERF_TEST_P(ImagePair, Calib3D_StereoBM,
Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
declare.time(5.0);
declare.time(300.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
@@ -53,7 +54,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM,
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
{
declare.time(10.0);
declare.time(300.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0));
ASSERT_FALSE(imgLeft.empty());
@@ -87,7 +88,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP,
Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
{
declare.time(10.0);
declare.time(300.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());

View File

@@ -1748,7 +1748,10 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm,
const int normType = GET_PARAM(2);
cv::Mat src(size, depth);
declare.in(src, WARMUP_RNG);
if (depth == CV_8U)
cv::randu(src, 0, 254);
else
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
@@ -1923,7 +1926,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMax,
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
declare.in(src, WARMUP_RNG);
if (depth == CV_8U)
cv::randu(src, 0, 254);
else
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
@@ -1958,7 +1964,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
declare.in(src, WARMUP_RNG);
if (depth == CV_8U)
cv::randu(src, 0, 254);
else
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{

View File

@@ -2,6 +2,7 @@
using namespace std;
using namespace testing;
using namespace perf;
#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p)
@@ -63,7 +64,7 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
Values(21),
Values(5)))
{
declare.time(60.0);
declare.time(600.0);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);

View File

@@ -2,105 +2,7 @@
using namespace std;
using namespace testing;
struct KeypointIdxCompare
{
std::vector<cv::KeyPoint>* keypoints;
explicit KeypointIdxCompare(std::vector<cv::KeyPoint>* _keypoints) : keypoints(_keypoints) {}
bool operator ()(size_t i1, size_t i2) const
{
cv::KeyPoint kp1 = (*keypoints)[i1];
cv::KeyPoint kp2 = (*keypoints)[i2];
if (kp1.pt.x != kp2.pt.x)
return kp1.pt.x < kp2.pt.x;
if (kp1.pt.y != kp2.pt.y)
return kp1.pt.y < kp2.pt.y;
if (kp1.response != kp2.response)
return kp1.response < kp2.response;
return kp1.octave < kp2.octave;
}
};
static void sortKeyPoints(std::vector<cv::KeyPoint>& keypoints, cv::InputOutputArray _descriptors = cv::noArray())
{
std::vector<size_t> indexies(keypoints.size());
for (size_t i = 0; i < indexies.size(); ++i)
indexies[i] = i;
std::sort(indexies.begin(), indexies.end(), KeypointIdxCompare(&keypoints));
std::vector<cv::KeyPoint> new_keypoints;
cv::Mat new_descriptors;
new_keypoints.resize(keypoints.size());
cv::Mat descriptors;
if (_descriptors.needed())
{
descriptors = _descriptors.getMat();
new_descriptors.create(descriptors.size(), descriptors.type());
}
for (size_t i = 0; i < indexies.size(); ++i)
{
size_t new_idx = indexies[i];
new_keypoints[i] = keypoints[new_idx];
if (!new_descriptors.empty())
descriptors.row((int) new_idx).copyTo(new_descriptors.row((int) i));
}
keypoints.swap(new_keypoints);
if (_descriptors.needed())
new_descriptors.copyTo(_descriptors);
}
//////////////////////////////////////////////////////////////////////
// SURF
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, Features2D_SURF,
Values<string>("gpu/perf/aloe.png"))
{
declare.time(50.0);
const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::SURF_GPU d_surf;
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
TEST_CYCLE() d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
std::vector<cv::KeyPoint> gpu_keypoints;
d_surf.downloadKeypoints(d_keypoints, gpu_keypoints);
cv::Mat gpu_descriptors(d_descriptors);
sortKeyPoints(gpu_keypoints, gpu_descriptors);
SANITY_CHECK_KEYPOINTS(gpu_keypoints);
SANITY_CHECK(gpu_descriptors, 1e-3);
}
else
{
cv::SURF surf;
std::vector<cv::KeyPoint> cpu_keypoints;
cv::Mat cpu_descriptors;
TEST_CYCLE() surf(img, cv::noArray(), cpu_keypoints, cpu_descriptors);
SANITY_CHECK_KEYPOINTS(cpu_keypoints);
SANITY_CHECK(cpu_descriptors);
}
}
using namespace perf;
//////////////////////////////////////////////////////////////////////
// FAST
@@ -153,6 +55,8 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB,
Combine(Values<string>("gpu/perf/aloe.png"),
Values(4000)))
{
declare.time(300.0);
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());

View File

@@ -2,6 +2,7 @@
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// Blur

View File

@@ -632,7 +632,7 @@ DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, ImgProc_MeanShiftFiltering,
Values<string>("gpu/meanshift/cones.png"))
{
declare.time(15.0);
declare.time(300.0);
const cv::Mat img = readImage(GetParam());
ASSERT_FALSE(img.empty());
@@ -668,7 +668,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftFiltering,
PERF_TEST_P(Image, ImgProc_MeanShiftProc,
Values<string>("gpu/meanshift/cones.png"))
{
declare.time(5.0);
declare.time(300.0);
const cv::Mat img = readImage(GetParam());
ASSERT_FALSE(img.empty());
@@ -702,7 +702,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftProc,
PERF_TEST_P(Image, ImgProc_MeanShiftSegmentation,
Values<string>("gpu/meanshift/cones.png"))
{
declare.time(5.0);
declare.time(300.0);
const cv::Mat img = readImage(GetParam());
ASSERT_FALSE(img.empty());
@@ -830,6 +830,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate8U,
GPU_CHANNELS_1_3_4,
ALL_TEMPLATE_METHODS))
{
declare.time(300.0);
const cv::Size size = GET_PARAM(0);
const cv::Size templ_size = GET_PARAM(1);
const int cn = GET_PARAM(2);
@@ -868,6 +870,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate32F,
GPU_CHANNELS_1_3_4,
Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))))
{
declare.time(300.0);
const cv::Size size = GET_PARAM(0);
const cv::Size templ_size = GET_PARAM(1);
const int cn = GET_PARAM(2);
@@ -1034,7 +1038,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerHarris,
TEST_CYCLE() cv::gpu::cornerHarris(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, k, borderMode);
GPU_SANITY_CHECK(dst);
GPU_SANITY_CHECK(dst, 1e-4);
}
else
{
@@ -1077,7 +1081,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerMinEigenVal,
TEST_CYCLE() cv::gpu::cornerMinEigenVal(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, borderMode);
GPU_SANITY_CHECK(dst);
GPU_SANITY_CHECK(dst, 1e-4);
}
else
{
@@ -1341,7 +1345,12 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer,
Values(CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR))))
CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
CvtColorInfo(1, 1, cv::COLOR_BayerBG2GRAY),
CvtColorInfo(1, 1, cv::COLOR_BayerGB2GRAY),
CvtColorInfo(1, 1, cv::COLOR_BayerRG2GRAY),
CvtColorInfo(1, 1, cv::COLOR_BayerGR2GRAY))))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@@ -1369,6 +1378,50 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer,
}
}
CV_ENUM(DemosaicingCode,
cv::COLOR_BayerBG2BGR, cv::COLOR_BayerGB2BGR, cv::COLOR_BayerRG2BGR, cv::COLOR_BayerGR2BGR,
cv::COLOR_BayerBG2GRAY, cv::COLOR_BayerGB2GRAY, cv::COLOR_BayerRG2GRAY, cv::COLOR_BayerGR2GRAY,
cv::gpu::COLOR_BayerBG2BGR_MHT, cv::gpu::COLOR_BayerGB2BGR_MHT, cv::gpu::COLOR_BayerRG2BGR_MHT, cv::gpu::COLOR_BayerGR2BGR_MHT,
cv::gpu::COLOR_BayerBG2GRAY_MHT, cv::gpu::COLOR_BayerGB2GRAY_MHT, cv::gpu::COLOR_BayerRG2GRAY_MHT, cv::gpu::COLOR_BayerGR2GRAY_MHT)
DEF_PARAM_TEST(Sz_Code, cv::Size, DemosaicingCode);
PERF_TEST_P(Sz_Code, ImgProc_Demosaicing,
Combine(GPU_TYPICAL_MAT_SIZES,
ValuesIn(DemosaicingCode::all())))
{
const cv::Size size = GET_PARAM(0);
const int code = GET_PARAM(1);
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::demosaicing(d_src, dst, code);
GPU_SANITY_CHECK(dst);
}
else
{
if (code >= cv::COLOR_COLORCVT_MAX)
{
FAIL_NO_CPU();
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::cvtColor(src, dst, code);
CPU_SANITY_CHECK(dst);
}
}
}
//////////////////////////////////////////////////////////////////////
// SwapChannels

View File

@@ -2,6 +2,7 @@
using namespace std;
using namespace testing;
using namespace perf;
DEF_PARAM_TEST_1(Image, string);

View File

@@ -1,70 +1,5 @@
#include "perf_precomp.hpp"
static void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"), fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"), fflush(stdout);
# endif
#elif defined linux
# if defined _LP64
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"), fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"), fflush(stdout);
# endif
#elif defined __APPLE__
# if defined _LP64
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"), fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"), fflush(stdout);
# endif
#endif
}
static void printCudaInfo()
{
printOsInfo();
#ifndef HAVE_CUDA
printf("[----------]\n[ GPU INFO ] \tOpenCV was built without CUDA support.\n[----------]\n"), fflush(stdout);
#else
int driver;
cudaDriverGetVersion(&driver);
printf("[----------]\n"), fflush(stdout);
printf("[ GPU INFO ] \tCUDA Driver version: %d.\n", driver), fflush(stdout);
printf("[ GPU INFO ] \tCUDA Runtime version: %d.\n", CUDART_VERSION), fflush(stdout);
printf("[----------]\n"), fflush(stdout);
printf("[----------]\n"), fflush(stdout);
printf("[ GPU INFO ] \tGPU module was compiled for the following GPU archs.\n"), fflush(stdout);
printf("[ BIN ] \t%s.\n", CUDA_ARCH_BIN), fflush(stdout);
printf("[ PTX ] \t%s.\n", CUDA_ARCH_PTX), fflush(stdout);
printf("[----------]\n"), fflush(stdout);
printf("[----------]\n"), fflush(stdout);
int deviceCount = cv::gpu::getCudaEnabledDeviceCount();
printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount), fflush(stdout);
printf("[----------]\n"), fflush(stdout);
for (int i = 0; i < deviceCount; ++i)
{
cv::gpu::DeviceInfo info(i);
printf("[----------]\n"), fflush(stdout);
printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()), fflush(stdout);
printf("[ ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout);
printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()), fflush(stdout);
printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout);
printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)), fflush(stdout);
if (!info.isCompatible())
printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
printf("[----------]\n"), fflush(stdout);
}
#endif
}
using namespace perf;
CV_PERF_TEST_MAIN(gpu, printCudaInfo())

View File

@@ -2,6 +2,7 @@
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// SetTo

View File

@@ -2,6 +2,7 @@
using namespace std;
using namespace testing;
using namespace perf;
///////////////////////////////////////////////////////////////
// HOG
@@ -18,6 +19,8 @@ PERF_TEST_P(Image, ObjDetect_HOG,
"gpu/caltech/image_00000527_0.png",
"gpu/caltech/image_00000574_0.png"))
{
declare.time(300.0);
const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());

View File

@@ -19,6 +19,7 @@
#endif
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
@@ -26,12 +27,9 @@
#include "opencv2/calib3d.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/video.hpp"
#include "opencv2/nonfree.hpp"
#include "opencv2/legacy.hpp"
#include "opencv2/photo.hpp"
#include "utility.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif

View File

@@ -4,6 +4,18 @@ using namespace std;
using namespace testing;
using namespace perf;
#if defined(HAVE_XINE) || \
defined(HAVE_GSTREAMER) || \
defined(HAVE_QUICKTIME) || \
defined(HAVE_AVFOUNDATION) || \
defined(HAVE_FFMPEG) || \
defined(WIN32) /* assume that we have ffmpeg */
# define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
#else
# define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
#endif
namespace cv
{
template<> void Ptr<CvBGStatModel>::delete_obj()
@@ -142,7 +154,7 @@ PERF_TEST_P(Image_MinDistance, Video_GoodFeaturesToTrack,
PERF_TEST_P(ImagePair, Video_BroxOpticalFlow,
Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
{
declare.time(10);
declare.time(300);
cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
@@ -372,8 +384,8 @@ PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1,
TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v);
GPU_SANITY_CHECK(u, 1e-4);
GPU_SANITY_CHECK(v, 1e-4);
GPU_SANITY_CHECK(u, 1e-2);
GPU_SANITY_CHECK(v, 1e-2);
}
else
{
@@ -482,6 +494,8 @@ PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM,
//////////////////////////////////////////////////////
// FGDStatModel
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
DEF_PARAM_TEST_1(Video, string);
PERF_TEST_P(Video, Video_FGDStatModel,
@@ -548,9 +562,13 @@ PERF_TEST_P(Video, Video_FGDStatModel,
}
}
#endif
//////////////////////////////////////////////////////
// MOG
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
DEF_PARAM_TEST(Video_Cn_LearningRate, string, MatCn, double);
PERF_TEST_P(Video_Cn_LearningRate, Video_MOG,
@@ -643,9 +661,13 @@ PERF_TEST_P(Video_Cn_LearningRate, Video_MOG,
}
}
#endif
//////////////////////////////////////////////////////
// MOG2
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
DEF_PARAM_TEST(Video_Cn, string, int);
PERF_TEST_P(Video_Cn, Video_MOG2,
@@ -740,9 +762,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2,
}
}
#endif
//////////////////////////////////////////////////////
// MOG2GetBackgroundImage
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage,
Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
GPU_CHANNELS_1_3_4))
@@ -818,74 +844,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage,
}
}
//////////////////////////////////////////////////////
// VIBE
PERF_TEST_P(Video_Cn, Video_VIBE,
Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
GPU_CHANNELS_1_3_4))
{
const string inputFile = perf::TestBase::getDataPath(GET_PARAM(0));
const int cn = GET_PARAM(1);
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_frame(frame);
cv::gpu::VIBE_GPU vibe;
cv::gpu::GpuMat foreground;
vibe(d_frame, foreground);
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
d_frame.upload(frame);
startTimer(); next();
vibe(d_frame, foreground);
stopTimer();
}
GPU_SANITY_CHECK(foreground);
}
else
{
FAIL_NO_CPU();
}
}
#endif
//////////////////////////////////////////////////////
// GMG
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
DEF_PARAM_TEST(Video_Cn_MaxFeatures, string, MatCn, int);
PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG,
@@ -993,11 +958,13 @@ PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG,
}
}
#ifdef HAVE_NVCUVID
#endif
//////////////////////////////////////////////////////
// VideoReader
#if defined(HAVE_NVCUVID) && BUILD_WITH_VIDEO_INPUT_SUPPORT
PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
{
declare.time(20);
@@ -1028,10 +995,12 @@ PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video
}
}
#endif
//////////////////////////////////////////////////////
// VideoWriter
#ifdef WIN32
#if defined(HAVE_NVCUVID) && defined(WIN32)
PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
{
@@ -1089,6 +1058,4 @@ PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video
SANITY_CHECK(frame);
}
#endif // WIN32
#endif // HAVE_NVCUVID
#endif

View File

@@ -1,184 +0,0 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
Mat readImage(const string& fileName, int flags)
{
return imread(perf::TestBase::getDataPath(fileName), flags);
}
void PrintTo(const CvtColorInfo& info, ostream* os)
{
static const char* str[] =
{
"BGR2BGRA",
"BGRA2BGR",
"BGR2RGBA",
"RGBA2BGR",
"BGR2RGB",
"BGRA2RGBA",
"BGR2GRAY",
"RGB2GRAY",
"GRAY2BGR",
"GRAY2BGRA",
"BGRA2GRAY",
"RGBA2GRAY",
"BGR2BGR565",
"RGB2BGR565",
"BGR5652BGR",
"BGR5652RGB",
"BGRA2BGR565",
"RGBA2BGR565",
"BGR5652BGRA",
"BGR5652RGBA",
"GRAY2BGR565",
"BGR5652GRAY",
"BGR2BGR555",
"RGB2BGR555",
"BGR5552BGR",
"BGR5552RGB",
"BGRA2BGR555",
"RGBA2BGR555",
"BGR5552BGRA",
"BGR5552RGBA",
"GRAY2BGR555",
"BGR5552GRAY",
"BGR2XYZ",
"RGB2XYZ",
"XYZ2BGR",
"XYZ2RGB",
"BGR2YCrCb",
"RGB2YCrCb",
"YCrCb2BGR",
"YCrCb2RGB",
"BGR2HSV",
"RGB2HSV",
"",
"",
"BGR2Lab",
"RGB2Lab",
"BayerBG2BGR",
"BayerGB2BGR",
"BayerRG2BGR",
"BayerGR2BGR",
"BGR2Luv",
"RGB2Luv",
"BGR2HLS",
"RGB2HLS",
"HSV2BGR",
"HSV2RGB",
"Lab2BGR",
"Lab2RGB",
"Luv2BGR",
"Luv2RGB",
"HLS2BGR",
"HLS2RGB",
"BayerBG2BGR_VNG",
"BayerGB2BGR_VNG",
"BayerRG2BGR_VNG",
"BayerGR2BGR_VNG",
"BGR2HSV_FULL",
"RGB2HSV_FULL",
"BGR2HLS_FULL",
"RGB2HLS_FULL",
"HSV2BGR_FULL",
"HSV2RGB_FULL",
"HLS2BGR_FULL",
"HLS2RGB_FULL",
"LBGR2Lab",
"LRGB2Lab",
"LBGR2Luv",
"LRGB2Luv",
"Lab2LBGR",
"Lab2LRGB",
"Luv2LBGR",
"Luv2LRGB",
"BGR2YUV",
"RGB2YUV",
"YUV2BGR",
"YUV2RGB",
"BayerBG2GRAY",
"BayerGB2GRAY",
"BayerRG2GRAY",
"BayerGR2GRAY",
//YUV 4:2:0 formats family
"YUV2RGB_NV12",
"YUV2BGR_NV12",
"YUV2RGB_NV21",
"YUV2BGR_NV21",
"YUV2RGBA_NV12",
"YUV2BGRA_NV12",
"YUV2RGBA_NV21",
"YUV2BGRA_NV21",
"YUV2RGB_YV12",
"YUV2BGR_YV12",
"YUV2RGB_IYUV",
"YUV2BGR_IYUV",
"YUV2RGBA_YV12",
"YUV2BGRA_YV12",
"YUV2RGBA_IYUV",
"YUV2BGRA_IYUV",
"YUV2GRAY_420",
//YUV 4:2:2 formats family
"YUV2RGB_UYVY",
"YUV2BGR_UYVY",
"YUV2RGB_VYUY",
"YUV2BGR_VYUY",
"YUV2RGBA_UYVY",
"YUV2BGRA_UYVY",
"YUV2RGBA_VYUY",
"YUV2BGRA_VYUY",
"YUV2RGB_YUY2",
"YUV2BGR_YUY2",
"YUV2RGB_YVYU",
"YUV2BGR_YVYU",
"YUV2RGBA_YUY2",
"YUV2BGRA_YUY2",
"YUV2RGBA_YVYU",
"YUV2BGRA_YVYU",
"YUV2GRAY_UYVY",
"YUV2GRAY_YUY2",
// alpha premultiplication
"RGBA2mRGBA",
"mRGBA2RGBA",
"COLORCVT_MAX"
};
*os << str[info.code];
}

View File

@@ -1,63 +0,0 @@
#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
#define __OPENCV_PERF_GPU_UTILITY_HPP__
#include "opencv2/core.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/ts/ts_perf.hpp"
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
using perf::MatType;
using perf::MatDepth;
CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all())
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX)
enum { Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4 };
CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA)
#define GPU_CHANNELS_1_3_4 testing::Values(MatCn(Gray), MatCn(BGR), MatCn(BGRA))
#define GPU_CHANNELS_1_3 testing::Values(MatCn(Gray), MatCn(BGR))
struct CvtColorInfo
{
int scn;
int dcn;
int code;
CvtColorInfo() {}
explicit CvtColorInfo(int scn_, int dcn_, int code_) : scn(scn_), dcn(dcn_), code(code_) {}
};
void PrintTo(const CvtColorInfo& info, std::ostream* os);
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
DEF_PARAM_TEST_1(Sz, cv::Size);
typedef perf::Size_MatType Sz_Type;
DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth);
DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn);
#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p)
#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
#define GPU_SANITY_CHECK(mat, ...) \
do{ \
cv::Mat gpu_##mat(mat); \
SANITY_CHECK(gpu_##mat, ## __VA_ARGS__); \
} while(0)
#define CPU_SANITY_CHECK(mat, ...) \
do{ \
cv::Mat cpu_##mat(mat); \
SANITY_CHECK(cpu_##mat, ## __VA_ARGS__); \
} while(0)
#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
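A minimal usage sketch of the sanity-check macros from this (now removed) header; equivalent macros are presumably provided by opencv2/ts/gpu_perf.hpp after this patch. The matrix name and tolerance are illustrative:
// Hypothetical perf-test fragment: download the GPU result and register it
// for regression checking with a relative tolerance.
cv::gpu::GpuMat d_dst;
// ... produce d_dst with the GPU code under test, timed via TEST_CYCLE() ...
GPU_SANITY_CHECK(d_dst, 1e-3);
// A CPU-only branch would call CPU_SANITY_CHECK(dst) on a cv::Mat instead.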

View File

@@ -8,69 +8,19 @@
#include "opencv2/video.hpp"
#include "opencv2/legacy.hpp"
#include "opencv2/ts.hpp"
static void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout);
# endif
#elif defined linux
# if defined _LP64
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout);
# endif
#elif defined __APPLE__
# if defined _LP64
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout);
# endif
#endif
}
static void printCudaInfo()
{
const int deviceCount = cv::gpu::getCudaEnabledDeviceCount();
printf("[----------]\n"); fflush(stdout);
printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout);
printf("[----------]\n"); fflush(stdout);
for (int i = 0; i < deviceCount; ++i)
{
cv::gpu::DeviceInfo info(i);
printf("[----------]\n"); fflush(stdout);
printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout);
printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout);
printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout);
printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout);
printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout);
if (!info.isCompatible())
printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
printf("[----------]\n"); fflush(stdout);
}
}
#include "opencv2/ts/gpu_perf.hpp"
int main(int argc, char* argv[])
{
printOsInfo();
printCudaInfo();
perf::printCudaInfo();
perf::Regression::Init("nv_perf_test");
perf::Regression::Init("gpu_perf4au");
perf::TestBase::Init(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
//////////////////////////////////////////////////////////
// HoughLinesP

View File

@@ -318,40 +318,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
class LevelsInit
{
public:
Npp32s pLevels[256];
const Npp32s* pLevels3[3];
int nValues3[3];
const int cn = src.channels();
#if (CUDA_VERSION > 4020)
GpuMat d_pLevels;
#endif
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.channels() == 1 || lut.channels() == cn );
CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
LevelsInit()
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
}
};
static LevelsInit lvls;
int cn = src.channels();
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);
CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous());
dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));
dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
NppiSize sz;
sz.height = src.rows;
@@ -360,19 +334,34 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
Mat nppLut;
lut.convertTo(nppLut, CV_32S);
cudaStream_t stream = StreamAccessor::getStream(s);
int nValues3[] = {256, 256, 256};
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
const Npp32s* pLevels3[3];
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
GpuMat d_pLevels;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) );
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif
}
else
@@ -409,7 +398,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
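For context, a hedged host-side usage sketch of cv::gpu::LUT as refactored in this file; the source image and the inverting lookup table are assumptions for illustration:
// Hypothetical usage: apply a 256-entry lookup table to an 8-bit GPU image.
cv::Mat lut(1, 256, CV_8UC1);
for (int i = 0; i < 256; ++i)
    lut.at<uchar>(0, i) = cv::saturate_cast<uchar>(255 - i); // invert
cv::gpu::GpuMat d_src(hostImage);   // hostImage: CV_8UC1 or CV_8UC3 (assumed)
cv::gpu::GpuMat d_dst;
cv::gpu::LUT(d_src, lut, d_dst);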

View File

@@ -1,137 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::initialize(const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::release() {}
#else
namespace cv { namespace gpu { namespace device
{
namespace vibe
{
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
}
}}}
namespace
{
const int defaultNbSamples = 20;
const int defaultReqMatches = 2;
const int defaultRadius = 20;
const int defaultSubsamplingFactor = 16;
}
cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed) :
frameSize_(0, 0), rngSeed_(rngSeed)
{
nbSamples = defaultNbSamples;
reqMatches = defaultReqMatches;
radius = defaultRadius;
subsamplingFactor = defaultSubsamplingFactor;
}
void cv::gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& s)
{
using namespace cv::gpu::device::vibe;
CV_Assert(firstFrame.type() == CV_8UC1 || firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
cudaStream_t stream = StreamAccessor::getStream(s);
loadConstants(nbSamples, reqMatches, radius, subsamplingFactor);
frameSize_ = firstFrame.size();
if (randStates_.size() != frameSize_)
{
cv::RNG rng(rngSeed_);
cv::Mat h_randStates(frameSize_, CV_8UC4);
rng.fill(h_randStates, cv::RNG::UNIFORM, 0, 255);
randStates_.upload(h_randStates);
}
int ch = firstFrame.channels();
int sample_ch = ch == 1 ? 1 : 4;
samples_.create(nbSamples * frameSize_.height, frameSize_.width, CV_8UC(sample_ch));
init_gpu(firstFrame, ch, samples_, randStates_, stream);
}
void cv::gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& s)
{
using namespace cv::gpu::device::vibe;
CV_Assert(frame.depth() == CV_8U);
int ch = frame.channels();
int sample_ch = ch == 1 ? 1 : 4;
if (frame.size() != frameSize_ || sample_ch != samples_.channels())
initialize(frame);
fgmask.create(frameSize_, CV_8UC1);
update_gpu(frame, ch, fgmask, samples_, randStates_, StreamAccessor::getStream(s));
}
void cv::gpu::VIBE_GPU::release()
{
frameSize_ = Size(0, 0);
randStates_.release();
samples_.release();
}
#endif

View File

@@ -48,6 +48,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); }
void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
@@ -62,6 +63,9 @@ namespace cv { namespace gpu {
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}
}}
@@ -1620,26 +1624,56 @@ namespace
funcs[src.depth()][dcn - 1](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
}
void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, false, false, stream);
}
void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, false, true, stream);
}
void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, true, false, stream);
}
void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
{
bayer_to_bgr(src, dst, dcn, true, true, stream);
}
void bayer_to_gray(const GpuMat& src, GpuMat& dst, bool blue_last, bool start_with_green, Stream& stream)
{
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
static const func_t funcs[3] =
{
Bayer2BGR_8u_gpu<1>,
0,
Bayer2BGR_16u_gpu<1>,
};
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
CV_Assert(src.rows > 2 && src.cols > 2);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
funcs[src.depth()](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
}
void bayerBG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, false, false, stream);
}
void bayerGB_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, false, true, stream);
}
void bayerRG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, true, false, stream);
}
void bayerGR_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
{
bayer_to_gray(src, dst, true, true, stream);
}
}
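A short usage sketch of the Bayer-to-gray path wired up above (the input is assumed to be a CV_8UC1 GpuMat holding a raw Bayer mosaic):
// Hypothetical illustration of the new CV_BayerBG2GRAY branch.
cv::gpu::GpuMat d_bayer;   // CV_8UC1 raw mosaic (assumed to be filled elsewhere)
cv::gpu::GpuMat d_gray;
cv::gpu::cvtColor(d_bayer, d_gray, CV_BayerBG2GRAY);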
void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
@@ -1756,10 +1790,10 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
yuv_to_bgr, // CV_YUV2BGR = 84
yuv_to_rgb, // CV_YUV2RGB = 85
0, // CV_BayerBG2GRAY = 86
0, // CV_BayerGB2GRAY = 87
0, // CV_BayerRG2GRAY = 88
0, // CV_BayerGR2GRAY = 89
bayerBG_to_gray, // CV_BayerBG2GRAY = 86
bayerGB_to_gray, // CV_BayerGB2GRAY = 87
bayerRG_to_gray, // CV_BayerRG2GRAY = 88
bayerGR_to_gray, // CV_BayerGR2GRAY = 89
//YUV 4:2:0 formats family
0, // CV_YUV2RGB_NV12 = 90,
@@ -1825,6 +1859,74 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
func(src, dst, dcn, stream);
}
void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
{
const int depth = src.depth();
CV_Assert( src.channels() == 1 );
switch (code)
{
case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
bayer_to_gray(src, dst, code == CV_BayerBG2GRAY || code == CV_BayerGB2GRAY, code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY, stream);
break;
case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
bayer_to_bgr(src, dst, dcn, code == CV_BayerBG2BGR || code == CV_BayerGB2BGR, code == CV_BayerGB2BGR || code == CV_BayerGR2BGR, stream);
break;
case COLOR_BayerBG2BGR_MHT: case COLOR_BayerGB2BGR_MHT: case COLOR_BayerRG2BGR_MHT: case COLOR_BayerGR2BGR_MHT:
{
if (dcn <= 0)
dcn = 3;
CV_Assert( depth == CV_8U );
CV_Assert( dcn == 3 || dcn == 4 );
dst.create(src.size(), CV_MAKETYPE(depth, dcn));
dst.setTo(Scalar::all(0));
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
if (dcn == 3)
device::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
else
device::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
break;
}
case COLOR_BayerBG2GRAY_MHT: case COLOR_BayerGB2GRAY_MHT: case COLOR_BayerRG2GRAY_MHT: case COLOR_BayerGR2GRAY_MHT:
{
CV_Assert( depth == CV_8U );
dst.create(src.size(), CV_MAKETYPE(depth, 1));
dst.setTo(Scalar::all(0));
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);
const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
device::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
break;
}
default:
CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" );
}
}
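A hedged usage sketch of the new cv::gpu::demosaicing entry point with one of the Malvar-He-Cutler (MHT) codes introduced by this patch; the input is assumed to be an 8-bit Bayer mosaic:
// Hypothetical call: edge-aware demosaicing into a 3-channel BGR image.
cv::gpu::GpuMat d_bayer;   // CV_8UC1 (assumed)
cv::gpu::GpuMat d_bgr;
cv::gpu::demosaicing(d_bayer, d_bgr, cv::gpu::COLOR_BayerBG2BGR_MHT, 3);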
void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
{
CV_Assert(image.type() == CV_8UC4);

View File

@@ -1,258 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device
{
namespace vibe
{
__constant__ int c_nbSamples;
__constant__ int c_reqMatches;
__constant__ int c_radius;
__constant__ int c_subsamplingFactor;
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor)
{
cudaSafeCall( cudaMemcpyToSymbol(c_nbSamples, &nbSamples, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_reqMatches, &reqMatches, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_radius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_subsamplingFactor, &subsamplingFactor, sizeof(int)) );
}
__device__ __forceinline__ uint nextRand(uint& state)
{
const unsigned int CV_RNG_COEFF = 4164903690U;
state = state * CV_RNG_COEFF + (state >> 16);
return state;
}
__constant__ int c_xoff[9] = {-1, 0, 1, -1, 1, -1, 0, 1, 0};
__constant__ int c_yoff[9] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};
__device__ __forceinline__ int2 chooseRandomNeighbor(int x, int y, uint& randState, int count = 8)
{
int idx = nextRand(randState) % count;
return make_int2(x + c_xoff[idx], y + c_yoff[idx]);
}
__device__ __forceinline__ uchar cvt(uchar val)
{
return val;
}
__device__ __forceinline__ uchar4 cvt(const uchar3& val)
{
return make_uchar4(val.x, val.y, val.z, 0);
}
__device__ __forceinline__ uchar4 cvt(const uchar4& val)
{
return val;
}
template <typename SrcT, typename SampleT>
__global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
uint localState = randStates(y, x);
for (int k = 0; k < c_nbSamples; ++k)
{
int2 np = chooseRandomNeighbor(x, y, localState, 9);
np.x = ::max(0, ::min(np.x, frame.cols - 1));
np.y = ::max(0, ::min(np.y, frame.rows - 1));
SrcT pix = frame(np.y, np.x);
samples(k * frame.rows + y, x) = cvt(pix);
}
randStates(y, x) = localState;
}
template <typename SrcT, typename SampleT>
void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );
init<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
static const func_t funcs[] =
{
0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
};
funcs[cn](frame, samples, randStates, stream);
}
__device__ __forceinline__ int calcDist(uchar a, uchar b)
{
return ::abs(a - b);
}
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar4& b)
{
return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
}
__device__ __forceinline__ int calcDist(const uchar4& a, const uchar4& b)
{
return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
}
template <typename SrcT, typename SampleT>
__global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
uint localState = randStates(y, x);
SrcT imgPix = frame(y, x);
// comparison with the model
int count = 0;
for (int k = 0; (count < c_reqMatches) && (k < c_nbSamples); ++k)
{
SampleT samplePix = samples(k * frame.rows + y, x);
int distance = calcDist(imgPix, samplePix);
if (distance < c_radius)
++count;
}
// pixel classification according to reqMatches
fgmask(y, x) = (uchar) (-(count < c_reqMatches));
if (count >= c_reqMatches)
{
// the pixel belongs to the background
// gets a random number between 0 and subsamplingFactor-1
int randomNumber = nextRand(localState) % c_subsamplingFactor;
// update of the current pixel model
if (randomNumber == 0)
{
// random subsampling
int k = nextRand(localState) % c_nbSamples;
samples(k * frame.rows + y, x) = cvt(imgPix);
}
// update of a neighboring pixel model
randomNumber = nextRand(localState) % c_subsamplingFactor;
if (randomNumber == 0)
{
// random subsampling
// chooses a neighboring pixel randomly
int2 np = chooseRandomNeighbor(x, y, localState);
np.x = ::max(0, ::min(np.x, frame.cols - 1));
np.y = ::max(0, ::min(np.y, frame.rows - 1));
// chooses the value to be replaced randomly
int k = nextRand(localState) % c_nbSamples;
samples(k * frame.rows + np.y, np.x) = cvt(imgPix);
}
}
randStates(y, x) = localState;
}
template <typename SrcT, typename SampleT>
void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );
update<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
static const func_t funcs[] =
{
0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
};
funcs[cn](frame, fgmask, samples, randStates, stream);
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -42,42 +42,38 @@
#if !defined CUDA_DISABLER
#include <opencv2/gpu/device/common.hpp>
#include <opencv2/gpu/device/vec_traits.hpp>
#include <opencv2/gpu/device/vec_math.hpp>
#include <opencv2/gpu/device/limits.hpp>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/color.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu {
namespace device
namespace cv { namespace gpu { namespace device
{
template <typename T> struct Bayer2BGR;
template <> struct Bayer2BGR<uchar>
{
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
uchar3 res0;
uchar3 res1;
uchar3 res2;
uchar3 res3;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= dst.rows || (s_x << 2) >= dst.cols)
return;
s_y = ::min(::max(s_y, 1), dst.rows - 2);
uchar4 patch[3][3];
patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
D res0 = VecTraits<D>::all(numeric_limits<uchar>::max());
D res1 = VecTraits<D>::all(numeric_limits<uchar>::max());
D res2 = VecTraits<D>::all(numeric_limits<uchar>::max());
D res3 = VecTraits<D>::all(numeric_limits<uchar>::max());
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
if ((s_y & 1) ^ start_with_green)
{
@@ -181,45 +177,69 @@ namespace cv { namespace gpu {
res3.z = t7;
}
}
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = res0;
if (d_x + 1 < dst.cols)
dst(d_y, d_x + 1) = res1;
if (d_x + 2 < dst.cols)
dst(d_y, d_x + 2) = res2;
if (d_x + 3 < dst.cols)
dst(d_y, d_x + 3) = res3;
}
};
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
{
typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
{
return pix;
}
template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
{
return make_uchar4(pix.x, pix.y, pix.z, 255);
}
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 2) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<uchar> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
if (d_x + 2 < src.cols)
dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
if (d_x + 3 < src.cols)
dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
}
template <> struct Bayer2BGR<ushort>
{
ushort3 res0;
ushort3 res1;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= dst.rows || (s_x << 1) >= dst.cols)
return;
s_y = ::min(::max(s_y, 1), dst.rows - 2);
ushort2 patch[3][3];
patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
D res0 = VecTraits<D>::all(numeric_limits<ushort>::max());
D res1 = VecTraits<D>::all(numeric_limits<ushort>::max());
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
if ((s_y & 1) ^ start_with_green)
{
@@ -279,53 +299,246 @@ namespace cv { namespace gpu {
res1.z = t3;
}
}
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = res0;
if (d_x + 1 < dst.cols)
dst(d_y, d_x + 1) = res1;
}
};
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, 4 * block.x), divUp(dst.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, 2 * block.x), divUp(dst.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
{
typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
{
return pix;
}
template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
{
return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
}
}}
#endif /* CUDA_DISABLER */
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 1) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<ushort> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
}
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
//////////////////////////////////////////////////////////////
// Bayer Demosaicing (Malvar, He, and Cutler)
//
// by Morgan McGuire, Williams College
// http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
{
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
int4 xCoord;
xCoord.x = center.x - 2;
xCoord.y = center.x - 1;
xCoord.z = center.x + 1;
xCoord.w = center.x + 2;
int4 yCoord;
yCoord.x = center.y - 2;
yCoord.y = center.y - 1;
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
PATTERN.y = kCy * C;
PATTERN.z = kCz * C;
PATTERN.w = PATTERN.z;
float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
// There are five filter patterns (identity, cross, checker,
// theta, phi). Precompute the terms from all of them and then
// use swizzles to assign to color channels.
//
// Channel Matches
// x cross (e.g., EE G)
// y checker (e.g., EE B)
// z theta (e.g., EO R)
// w phi (e.g., EO B)
#define A value.x // A0 + A1
#define B value.y // B0 + B1
#define E value.z // E0 + E1
#define F value.w // F0 + F1
float3 temp;
// PATTERN.yzw += (kD.yz * D).xyy;
temp.x = kDy * D;
temp.y = kDz * D;
PATTERN.y += temp.x;
PATTERN.z += temp.y;
PATTERN.w += temp.y;
// PATTERN += (kA.xyz * A).xyzx;
temp.x = kAx * A;
temp.y = kAy * A;
temp.z = kAz * A;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.z;
PATTERN.w += temp.x;
// PATTERN += (kE.xyw * E).xyxz;
temp.x = kEx * E;
temp.y = kEy * E;
temp.z = kEw * E;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.x;
PATTERN.w += temp.z;
// PATTERN.xw += kB.xw * B;
PATTERN.x += kBx * B;
PATTERN.w += kBw * B;
// PATTERN.xz += kF.xz * F;
PATTERN.x += kFx * F;
PATTERN.z += kFz * F;
// Determine which of four types of pixels we are on.
int2 alternate;
alternate.x = (x + firstRed.x) % 2;
alternate.y = (y + firstRed.y) % 2;
// in BGR sequence;
uchar3 pixelColor =
(alternate.y == 0) ?
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
dst(y, x) = toDst<DstType>(pixelColor);
}
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}}}
#endif /* CUDA_DISABLER */

View File

@@ -48,6 +48,7 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/simd_functions.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
@@ -154,170 +155,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VAdd4;
template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
struct VAdd4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vadd4(a, b);
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<uint, uint>& other) {}
};
template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<int, uint>& other) {}
};
template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<uint, int>& other) {}
};
template <> struct VAdd4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4<int, int>& other) {}
__device__ __forceinline__ VAdd4(const VAdd4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VAdd2;
template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
struct VAdd2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vadd2(a, b);
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<uint, uint>& other) {}
};
template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<uint, int>& other) {}
};
template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<int, uint>& other) {}
};
template <> struct VAdd2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2<int, int>& other) {}
__device__ __forceinline__ VAdd2(const VAdd2& other) {}
};
////////////////////////////////////
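The vadd4/vadd2 helpers used above come from the newly included simd_functions.hpp. As a rough scalar reference for the semantics assumed here (an illustration, not the device implementation): vadd4 is a lane-wise saturated add of four packed unsigned bytes, and vadd2 the analogous operation on two packed 16-bit halves.
// Hypothetical host-side reference for the assumed per-byte behaviour.
static inline unsigned int vadd4_ref(unsigned int a, unsigned int b)
{
    unsigned int r = 0;
    for (int i = 0; i < 4; ++i)
    {
        unsigned int s = ((a >> (8 * i)) & 0xFFu) + ((b >> (8 * i)) & 0xFFu);
        if (s > 255u) s = 255u;        // saturate each byte lane
        r |= s << (8 * i);
    }
    return r;
}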
@@ -336,13 +195,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T, typename D>
void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
@@ -543,170 +390,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VSub4;
template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
struct VSub4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vsub4(a, b);
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<uint, uint>& other) {}
};
template <> struct VSub4<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<int, uint>& other) {}
};
template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<uint, int>& other) {}
};
template <> struct VSub4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4<int, int>& other) {}
__device__ __forceinline__ VSub4(const VSub4& other) {}
};
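// A minimal host-side reference sketch (illustrative, not from the OpenCV sources) of the
// per-byte saturating subtraction that the vsub4() helper replacing the inline-PTX
// specializations above is expected to perform for the unsigned variant; the lane loop and
// the name refVSub4 are assumptions made for clarity.
static unsigned int refVSub4(unsigned int a, unsigned int b)
{
    unsigned int res = 0;
    for (int k = 0; k < 4; ++k)                              // four 8-bit lanes per 32-bit word
    {
        const unsigned int la = (a >> (8 * k)) & 0xFFu;
        const unsigned int lb = (b >> (8 * k)) & 0xFFu;
        const unsigned int ld = (la >= lb) ? (la - lb) : 0u; // clamp at 0, as the .sat PTX above does
        res |= ld << (8 * k);
    }
    return res;
}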
////////////////////////////////////
template <typename T, typename D> struct VSub2;
template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
struct VSub2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vsub2(a, b);
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<uint, uint>& other) {}
};
template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
{
__device__ __forceinline__ int operator ()(uint a, uint b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<uint, int>& other) {}
};
template <> struct VSub2<int, uint> : binary_function<int, int, uint>
{
__device__ __forceinline__ uint operator ()(int a, int b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<int, uint>& other) {}
};
template <> struct VSub2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2<int, int>& other) {}
__device__ __forceinline__ VSub2(const VSub2& other) {}
};
////////////////////////////////////
@@ -725,13 +430,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T, typename D>
void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
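// Note: the *_v4 / *_v2 wrappers above operate on whole 32-bit words, packing four uchar or
// two ushort pixels per element; the host side is presumably expected to take this path only
// for continuous, word-aligned matrices and to fall back to the per-element subMat<T, D>
// below otherwise (an inference from the naming, not shown in this hunk).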
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
@@ -1496,90 +1189,28 @@ namespace arithm
namespace arithm
{
template <typename T, typename D> struct VAbsDiff4;
template <> struct VAbsDiff4<uint, uint> : binary_function<uint, uint, uint>
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vabsdiff4(a, b);
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4<uint, uint>& other) {}
};
template <> struct VAbsDiff4<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4<int, int>& other) {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
};
////////////////////////////////////
template <typename T, typename D> struct VAbsDiff2;
template <> struct VAbsDiff2<uint, uint> : binary_function<uint, uint, uint>
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vabsdiff2(a, b);
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2<uint, uint>& other) {}
};
template <> struct VAbsDiff2<int, int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2<int, int>& other) {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
};
////////////////////////////////////
@@ -1611,13 +1242,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T>
void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T, T>(), WithOutMask(), stream);
transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
}
template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T, T>(), WithOutMask(), stream);
transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
}
template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
@@ -1877,6 +1500,49 @@ namespace arithm
namespace arithm
{
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
__device__ __forceinline__ VCmpEq4() {}
__device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}
__device__ __forceinline__ VCmpNe4() {}
__device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}
__device__ __forceinline__ VCmpLt4() {}
__device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}
__device__ __forceinline__ VCmpLe4() {}
__device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
};
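// An illustrative sketch (assumption, not OpenCV code) of the per-byte mask the vcmpXX4()
// helpers are expected to return: 0xFF in every byte lane where the predicate holds and 0x00
// elsewhere, so a single 32-bit operation compares four uchar pixels and directly yields the
// 255/0 output that the 8-bit compare produces.
static unsigned int refVCmpEq4(unsigned int a, unsigned int b)
{
    unsigned int res = 0;
    for (int k = 0; k < 4; ++k)
    {
        const unsigned int la = (a >> (8 * k)) & 0xFFu;
        const unsigned int lb = (b >> (8 * k)) & 0xFFu;
        if (la == lb)
            res |= 0xFFu << (8 * k);                         // full-byte mask for a matching lane
    }
    return res;
}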
////////////////////////////////////
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
@@ -1890,6 +1556,21 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
@@ -1897,6 +1578,23 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
@@ -2303,44 +2001,11 @@ namespace arithm
namespace arithm
{
template <typename T> struct VMin4;
template <> struct VMin4<uint> : binary_function<uint, uint, uint>
struct VMin4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMin4() {}
__device__ __forceinline__ VMin4(const VMin4& other) {}
};
template <> struct VMin4<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmin4(a, b);
}
__device__ __forceinline__ VMin4() {}
@@ -2349,40 +2014,11 @@ namespace arithm
////////////////////////////////////
template <typename T> struct VMin2;
template <> struct VMin2<uint> : binary_function<uint, uint, uint>
struct VMin2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMin2() {}
__device__ __forceinline__ VMin2(const VMin2& other) {}
};
template <> struct VMin2<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmin2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmin.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmin.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmin2(a, b);
}
__device__ __forceinline__ VMin2() {}
@@ -2392,13 +2028,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::VMin4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T> struct TransformFunctorTraits< arithm::VMin2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -2415,14 +2051,14 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin4<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin2<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2430,12 +2066,6 @@ namespace arithm
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
template void vmin4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmin2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -2463,44 +2093,11 @@ namespace arithm
namespace arithm
{
template <typename T> struct VMax4;
template <> struct VMax4<uint> : binary_function<uint, uint, uint>
struct VMax4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMax4() {}
__device__ __forceinline__ VMax4(const VMax4& other) {}
};
template <> struct VMax4<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmax4(a, b);
}
__device__ __forceinline__ VMax4() {}
@@ -2509,40 +2106,11 @@ namespace arithm
////////////////////////////////////
template <typename T> struct VMax2;
template <> struct VMax2<uint> : binary_function<uint, uint, uint>
struct VMax2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
uint res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
}
__device__ __forceinline__ VMax2() {}
__device__ __forceinline__ VMax2(const VMax2& other) {}
};
template <> struct VMax2<int> : binary_function<int, int, int>
{
__device__ __forceinline__ int operator ()(int a, int b) const
{
int res = 0;
#if __CUDA_ARCH__ >= 300
asm("vmax2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#elif __CUDA_ARCH__ >= 200
asm("vmax.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
asm("vmax.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#endif
return res;
return vmax2(a, b);
}
__device__ __forceinline__ VMax2() {}
@@ -2552,13 +2120,13 @@ namespace arithm
namespace cv { namespace gpu { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::VMax4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
////////////////////////////////////
template <typename T> struct TransformFunctorTraits< arithm::VMax2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
@@ -2575,14 +2143,14 @@ namespace cv { namespace gpu { namespace device
namespace arithm
{
template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax4<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
}
template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax2<T>(), WithOutMask(), stream);
transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2590,12 +2158,6 @@ namespace arithm
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
template void vmax4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void vmax2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);


@@ -1,934 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
{
namespace surf
{
////////////////////////////////////////////////////////////////////////
// Global parameters
// The maximum number of features (before subpixel interpolation) that memory is reserved for.
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
// The image size.
__constant__ int c_img_rows;
__constant__ int c_img_cols;
// The number of layers.
__constant__ int c_nOctaveLayers;
// The hessian threshold.
__constant__ float c_hessianThreshold;
// The current octave.
__constant__ int c_octave;
// The current layer size.
__constant__ int c_layer_rows;
__constant__ int c_layer_cols;
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
{
cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
}
void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
{
cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
}
////////////////////////////////////////////////////////////////////////
// Integral image texture
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
void bindImgTex(PtrStepSzb img)
{
bindTexture(&imgTex, img);
}
size_t bindSumTex(PtrStepSz<uint> sum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
return offset / sizeof(uint);
}
size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
return offset / sizeof(uint);
}
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
typedef double real_t;
#else
typedef float real_t;
#endif
float ratio = (float)newSize / oldSize;
real_t d = 0;
#pragma unroll
for (int k = 0; k < N; ++k)
{
int dx1 = __float2int_rn(ratio * src[k][0]);
int dy1 = __float2int_rn(ratio * src[k][1]);
int dx2 = __float2int_rn(ratio * src[k][2]);
int dy2 = __float2int_rn(ratio * src[k][3]);
real_t t = 0;
t += tex2D(sumTex, x + dx1, y + dy1);
t -= tex2D(sumTex, x + dx1, y + dy2);
t -= tex2D(sumTex, x + dx2, y + dy1);
t += tex2D(sumTex, x + dx2, y + dy2);
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
return (float)d;
}
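// Note: the four tex2D fetches above use the standard integral-image identity - for an
// integral image S, the pixel sum inside the rectangle with corners (x+dx1, y+dy1) and
// (x+dx2, y+dy2) is S(x2,y2) - S(x1,y2) - S(x2,y1) + S(x1,y1) - so every weighted Haar box
// costs four reads regardless of its size; dividing by the box area normalizes the response
// across filter scales.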
////////////////////////////////////////////////////////////////////////
// Hessian
__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
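// Note: each row of the c_DX / c_DY / c_DXY patterns above is {x1, y1, x2, y2, weight},
// i.e. the corners and weight of one box inside the canonical 9x9 filter, exactly as consumed
// by icvCalcHaarPatternSum() (dx1 = src[k][0], dy1 = src[k][1], dx2 = src[k][2],
// dy2 = src[k][3], weight = src[k][4]).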
__host__ __device__ __forceinline__ int calcSize(int octave, int layer)
{
/* Wavelet size at first layer of first octave. */
const int HAAR_SIZE0 = 9;
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
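// Worked examples of the formula above, derived from HAAR_SIZE0 = 9 and HAAR_SIZE_INC = 6
// (illustrative only):
//   calcSize(0, 0) = (9 + 6*0) << 0 =  9   // first layer of the first octave
//   calcSize(0, 2) = (9 + 6*2) << 0 = 21
//   calcSize(1, 2) = (9 + 6*2) << 1 = 42   // the same layer one octave up doubles the filter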
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
{
// Determine the indices
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
const int blockIdx_y = blockIdx.y % gridDim_y;
const int blockIdx_z = blockIdx.y / gridDim_y;
const int j = threadIdx.x + blockIdx.x * blockDim.x;
const int i = threadIdx.y + blockIdx_y * blockDim.y;
const int layer = blockIdx_z;
const int size = calcSize(c_octave, layer);
const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
// Ignore pixels where some of the kernel is outside the image
const int margin = (size >> 1) >> c_octave;
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
}
}
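// Note: the 0.81f factor above is the squared 0.9 weight from the original SURF paper,
// which compensates for the box-filter approximation of the Hessian:
//   det(H_approx) = dx * dy - (0.9 * dxy)^2,  and 0.9^2 = 0.81.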
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayers)
{
const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(max_samples_j, threads.x);
grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// NONMAX
__constant__ float c_DM[5] = {0, 0, 9, 9, 1};
struct WithMask
{
static __device__ bool check(int sum_i, int sum_j, int size)
{
float ratio = (float)size / 9.0f;
float d = 0;
int dx1 = __float2int_rn(ratio * c_DM[0]);
int dy1 = __float2int_rn(ratio * c_DM[1]);
int dx2 = __float2int_rn(ratio * c_DM[2]);
int dy2 = __float2int_rn(ratio * c_DM[3]);
float t = 0;
t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5f);
}
};
template <typename Mask>
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
unsigned int* maxCounter)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
extern __shared__ float N9[];
// Determine the indices into the layered hessian (det) buffer.
const int gridDim_y = gridDim.y / c_nOctaveLayers;
const int blockIdx_y = blockIdx.y % gridDim_y;
const int blockIdx_z = blockIdx.y / gridDim_y;
const int layer = blockIdx_z + 1;
const int size = calcSize(c_octave, layer);
// Ignore pixels without a 3x3x3 neighbourhood in the layer above
const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
// Compute this thread's offsets into the shared-memory N9 buffer.
const int zoff = blockDim.x * blockDim.y;
const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
N9[localLin ] = det.ptr(c_layer_rows * (layer ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
__syncthreads();
if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
{
float val0 = N9[localLin];
if (val0 > c_hessianThreshold)
{
// Coordinates for the start of the wavelet in the sum image. There
// is some integer division involved, so don't try to simplify this
// (cancel out sampleStep) without checking the result is the same
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (Mask::check(sum_i, sum_j, size))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
&& val0 > N9[localLin - blockDim.x - zoff]
&& val0 > N9[localLin + 1 - blockDim.x - zoff]
&& val0 > N9[localLin - 1 - zoff]
&& val0 > N9[localLin - zoff]
&& val0 > N9[localLin + 1 - zoff]
&& val0 > N9[localLin - 1 + blockDim.x - zoff]
&& val0 > N9[localLin + blockDim.x - zoff]
&& val0 > N9[localLin + 1 + blockDim.x - zoff]
&& val0 > N9[localLin - 1 - blockDim.x]
&& val0 > N9[localLin - blockDim.x]
&& val0 > N9[localLin + 1 - blockDim.x]
&& val0 > N9[localLin - 1 ]
&& val0 > N9[localLin + 1 ]
&& val0 > N9[localLin - 1 + blockDim.x]
&& val0 > N9[localLin + blockDim.x]
&& val0 > N9[localLin + 1 + blockDim.x]
&& val0 > N9[localLin - 1 - blockDim.x + zoff]
&& val0 > N9[localLin - blockDim.x + zoff]
&& val0 > N9[localLin + 1 - blockDim.x + zoff]
&& val0 > N9[localLin - 1 + zoff]
&& val0 > N9[localLin + zoff]
&& val0 > N9[localLin + 1 + zoff]
&& val0 > N9[localLin - 1 + blockDim.x + zoff]
&& val0 > N9[localLin + blockDim.x + zoff]
&& val0 > N9[localLin + 1 + blockDim.x + zoff]
;
if(condmax)
{
unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
if (ind < c_max_candidates)
{
const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
}
}
}
}
}
#endif
}
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
if (use_mask)
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
else
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
const int4 maxPos = maxPosBuffer[blockIdx.x];
const int j = maxPos.x - 1 + threadIdx.x;
const int i = maxPos.y - 1 + threadIdx.y;
const int layer = maxPos.z - 1 + threadIdx.z;
__shared__ float N9[3][3][3];
N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{
__shared__ float dD[3];
//dx
dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
//dy
dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
//ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
__shared__ float H[3][3];
//dxx
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
//dxy
H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
//dxs
H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
//dyx = dxy
H[1][0] = H[0][1];
//dyy
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
//dys
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
//dsx = dxs
H[2][0] = H[0][2];
//dsy = dys
H[2][1] = H[1][2];
//dss
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
__shared__ float x[3];
if (solve3x3(H, dD, x))
{
if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
{
// if the step is within the interpolation region, perform it
const int size = calcSize(c_octave, maxPos.z);
const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
const float center_i = sum_i + (float)(size - 1) / 2;
const float center_j = sum_j + (float)(size - 1) / 2;
const float px = center_j + x[0] * (1 << c_octave);
const float py = center_i + x[1] * (1 << c_octave);
const int ds = size - calcSize(c_octave, maxPos.z - 1);
const float psize = roundf(size + x[2] * ds);
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = psize * 1.2f / 9.0f;
/* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s.
The gradient wavelet size is kept even so that the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
// proceed only if grad_wav_size fits inside the integral image
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
{
// Get a new feature index.
unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
if (ind < c_max_features)
{
featureX[ind] = px;
featureY[ind] = py;
featureLaplacian[ind] = maxPos.w;
featureOctave[ind] = c_octave;
featureSize[ind] = psize;
featureHessian[ind] = N9[1][1][1];
}
} // grad_wav_size check
} // If the subpixel interpolation worked
}
} // If this is thread 0.
#endif
}
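// Note: the 3x3 system solved above is the usual quadratic sub-pixel refinement. dD holds the
// negated central-difference gradient of det(H) in (x, y, scale) and H its local Hessian, so
// solve3x3(H, dD, x) yields the Newton offset toward the extremum of the interpolated
// response; the offset is accepted only while every component stays within one sample
// (|x[i]| <= 1), i.e. inside the interpolation region.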
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter)
{
dim3 threads;
threads.x = 3;
threads.y = 3;
threads.z = 3;
dim3 grid;
grid.x = maxCounter;
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureOctave, featureSize, featureHessian, featureCounter);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// Orientation
#define ORI_SEARCH_INC 5
#define ORI_WIN 60
#define ORI_SAMPLES 113
__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{
__shared__ float s_X[128];
__shared__ float s_Y[128];
__shared__ float s_angle[128];
__shared__ float s_sumx[32 * 4];
__shared__ float s_sumy[32 * 4];
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
/* To find the dominant orientation, the gradients in x and y are
sampled in a circle of radius 6s using wavelets of size 4s.
The gradient wavelet size is kept even so that the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
// bail out if grad_wav_size does not fit inside the integral image
if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
return;
// Calc X, Y, angle and store it to shared memory
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float X = 0.0f, Y = 0.0f, angle = 0.0f;
if (tid < ORI_SAMPLES)
{
const float margin = (float)(grad_wav_size - 1) / 2.0f;
const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
angle = atan2f(Y, X);
if (angle < 0)
angle += 2.0f * CV_PI_F;
angle *= 180.0f / CV_PI_F;
}
}
s_X[tid] = X;
s_Y[tid] = Y;
s_angle[tid] = angle;
__syncthreads();
float bestx = 0, besty = 0, best_mod = 0;
#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f;
int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx = s_X[threadIdx.x];
sumy = s_Y[threadIdx.x];
}
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 32];
sumy += s_Y[threadIdx.x + 32];
}
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64];
}
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[threadIdx.x + 96];
sumy += s_Y[threadIdx.x + 96];
}
plus<float> op;
device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
{
best_mod = temp_mod;
bestx = sumx;
besty = sumy;
}
__syncthreads();
}
if (threadIdx.x == 0)
{
s_X[threadIdx.y] = bestx;
s_Y[threadIdx.y] = besty;
s_angle[threadIdx.y] = best_mod;
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
{
int bestIdx = 0;
if (s_angle[1] > s_angle[bestIdx])
bestIdx = 1;
if (s_angle[2] > s_angle[bestIdx])
bestIdx = 2;
if (s_angle[3] > s_angle[bestIdx])
bestIdx = 3;
float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI_F;
kp_dir *= 180.0f / CV_PI_F;
kp_dir = 360.0f - kp_dir;
if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
kp_dir = 0.f;
featureDir[blockIdx.x] = kp_dir;
}
}
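// Note: the loop above implements SURF's sliding orientation window. Wavelet responses are
// accumulated for every 5-degree step (ORI_SEARCH_INC) of a 60-degree window (ORI_WIN); the
// (sumx, sumy) vector with the largest squared magnitude wins across the four threadIdx.y
// lanes, and its atan2, mapped to [0, 360) degrees, becomes featureDir.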
#undef ORI_SEARCH_INC
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
{
dim3 threads;
threads.x = 32;
threads.y = 4;
dim3 grid;
grid.x = nFeatures;
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// Descriptors
#define PATCH_SZ 20
__constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
{
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};
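// Note (descriptive comment, inferred from the indexing below): c_DW is used as
// c_DW[yInd * PATCH_SZ + xInd] with xInd, yInd running 0..19, so it appears to be
// the 20x20 Gaussian weighting table applied to the descriptor samples; the values
// are symmetric about the patch centre, as expected for such a window.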
struct WinReader
{
typedef uchar elem_type;
__device__ __forceinline__ uchar operator ()(int i, int j) const
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return tex2D(imgTex, pixel_x, pixel_y);
}
float centerX;
float centerY;
float win_offset;
float cos_dir;
float sin_dir;
int width;
int height;
};
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy)
{
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
dx = dy = 0.0f;
WinReader win;
win.centerX = featureX[blockIdx.x];
win.centerY = featureY[blockIdx.x];
// The sampling intervals and wavelet sizes for selecting an orientation
// and building the keypoint descriptor are defined relative to 's'
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
// Extract a window of pixels around the keypoint of size 20s
const int win_size = (int)((PATCH_SZ + 1) * s);
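// Illustrative check (assuming PATCH_SZ is 20): a keypoint with featureSize = 18
// gives s = 18 * 1.2 / 9 = 2.4, so win_size = (int)((20 + 1) * 2.4) = 50 pixels,
// i.e. the sampled window is roughly 20s wide, as described above.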
win.width = win.height = win_size;
// Nearest neighbour version (faster)
win.win_offset = -(win_size - 1.0f) / 2.0f;
float descriptor_dir = 360.0f - featureDir[blockIdx.x];
if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
descriptor_dir = 0.f;
descriptor_dir *= CV_PI_F / 180.0f;
sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int xLoadInd = tid % (PATCH_SZ + 1);
const int yLoadInd = tid / (PATCH_SZ + 1);
if (yLoadInd < (PATCH_SZ + 1))
{
if (s > 1)
{
AreaFilter<WinReader> filter(win, s, s);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
}
else
{
LinearFilter<WinReader> filter(win);
s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
}
}
__syncthreads();
const int xPatchInd = threadIdx.x % 5;
const int yPatchInd = threadIdx.x / 5;
if (yPatchInd < 5)
{
const int xBlockInd = threadIdx.y % 4;
const int yBlockInd = threadIdx.y / 4;
const int xInd = xBlockInd * 5 + xPatchInd;
const int yInd = yBlockInd * 5 + yPatchInd;
const float dw = c_DW[yInd * PATCH_SZ + xInd];
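// The four samples around (yInd, xInd) act as a 2x2 Haar pair: dx is the
// (right - left) column difference and dy the (bottom - top) row difference,
// each weighted by the Gaussian coefficient dw.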
dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw;
dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw;
}
}
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
float dxabs = ::fabsf(dx);
float dyabs = ::fabsf(dy);
plus<float> op;
reduce<32>(sRow, dx, threadIdx.x, op);
reduce<32>(sRow, dy, threadIdx.x, op);
reduce<32>(sRow, dxabs, threadIdx.x, op);
reduce<32>(sRow, dyabs, threadIdx.x, op);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;
// write dx, dy, |dx|, |dy|
if (threadIdx.x == 0)
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
}
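// Layout note (inferred from the <<<nFeatures, dim3(32, 16)>>> launch below): each
// block handles one keypoint, threadIdx.y indexes the 16 sub-regions and threadIdx.x
// the samples within a sub-region, so the reductions yield (sum dx, sum dy, sum |dx|,
// sum |dy|) per sub-region, i.e. 16 * 4 = 64 descriptor values.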
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
plus<float> op;
float d1 = 0.0f;
float d2 = 0.0f;
float abs1 = 0.0f;
float abs2 = 0.0f;
if (dy >= 0)
{
d1 = dx;
abs1 = ::fabsf(dx);
}
else
{
d2 = dx;
abs2 = ::fabsf(dx);
}
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (threadIdx.x == 0)
descriptors_block[0] = make_float4(d1, abs1, d2, abs2);
if (dx >= 0)
{
d1 = dy;
abs1 = ::fabsf(dy);
d2 = 0.0f;
abs2 = 0.0f;
}
else
{
d1 = 0.0f;
abs1 = 0.0f;
d2 = dy;
abs2 = ::fabsf(dy);
}
reduce<32>(sRow, d1, threadIdx.x, op);
reduce<32>(sRow, d2, threadIdx.x, op);
reduce<32>(sRow, abs1, threadIdx.x, op);
reduce<32>(sRow, abs2, threadIdx.x, op);
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (threadIdx.x == 0)
descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
}
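// The extended descriptor keeps the same 16 sub-regions but splits each sum by the
// sign of the other derivative (dx sums split on dy >= 0 / dy < 0 and vice versa),
// giving 8 values per sub-region and 16 * 8 = 128 values in total.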
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
__shared__ float smem[BLOCK_DIM_X];
__shared__ float s_len;
// no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x);
// read in the unnormalized descriptor values (squared)
const float val = descriptor_base[threadIdx.x];
float len = val * val;
reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());
if (threadIdx.x == 0)
s_len = ::sqrtf(len);
__syncthreads();
// normalize and store in output
descriptor_base[threadIdx.x] = val / s_len;
}
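// In effect each descriptor row d is replaced by d / ||d||_2, so the stored SURF
// descriptors are unit-length vectors.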
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
} // namespace surf
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -263,11 +263,8 @@ namespace
namespace arithm
{
template <typename T, typename D>
void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -345,62 +342,6 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vadd4<unsigned int, unsigned int>,
vadd4<unsigned int, int>,
0,
0
},
{
vadd4<int, unsigned int>,
vadd4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vadd2<unsigned int, unsigned int>,
vadd2<unsigned int, int>
},
{
0,
0,
vadd2<int, unsigned int>,
vadd2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@@ -426,7 +367,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -434,31 +375,27 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -606,11 +543,8 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
namespace arithm
{
template <typename T, typename D>
void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void subMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void subMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -688,62 +622,6 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vsub4<unsigned int, unsigned int>,
vsub4<unsigned int, int>,
0,
0
},
{
vsub4<int, unsigned int>,
vsub4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vsub2<unsigned int, unsigned int>,
vsub2<unsigned int, int>
},
{
0,
0,
vsub2<int, unsigned int>,
vsub2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@@ -769,7 +647,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -777,31 +655,27 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -1585,11 +1459,8 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
namespace arithm
{
template <typename T>
void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void absDiffMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void absDiffMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1610,20 +1481,6 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
absDiffMat<float>,
absDiffMat<double>
};
static const func_t vfuncs4[] =
{
vabsDiff4<unsigned int>,
vabsDiff4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vabsDiff2<unsigned int>,
vabsDiff2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -1645,7 +1502,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -1653,31 +1510,27 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -1940,6 +1793,11 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1962,6 +1820,12 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
{cmpMatEq<double> , cmpMatNe<double> , cmpMatLt<double> , cmpMatLe<double> }
};
typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
static const func_v4_t funcs_v4[] =
{
cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -1997,6 +1861,27 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
const int vcols = src1_.cols >> 2;
funcs_v4[code](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
}
const func_t func = funcs[depth][code];
func(src1_, src2_, dst_, stream);
@@ -2532,13 +2417,13 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
namespace arithm
{
template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void minMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void maxMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void maxMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
@@ -2558,20 +2443,6 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
minMat<float>,
minMat<double>
};
static const func_t vfuncs4[] =
{
vmin4<unsigned int>,
vmin4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmin2<unsigned int>,
vmin2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -2593,7 +2464,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2601,31 +2472,27 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -2655,20 +2522,6 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
maxMat<float>,
maxMat<double>
};
static const func_t vfuncs4[] =
{
vmax4<unsigned int>,
vmax4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmax2<unsigned int>,
vmax2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -2690,7 +2543,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2698,31 +2551,27 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}

View File

@@ -1,418 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or bpied warranties, including, but not limited to, the bpied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }
cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float, bool) { throw_nogpu(); }
int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
void cv::gpu::SURF_GPU::uploadKeypoints(const std::vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, std::vector<float>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, std::vector<float>&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
{
namespace surf
{
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
void bindImgTex(PtrStepSzb img);
size_t bindSumTex(PtrStepSz<unsigned int> sum);
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayer);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter);
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}
using namespace ::cv::gpu::device::surf;
namespace
{
int calcSize(int octave, int layer)
{
/* Wavelet size at first layer of first octave. */
const int HAAR_SIZE0 = 9;
/* Wavelet size increment between layers. This should be an even number,
such that the wavelet sizes in an octave are either all even or all odd.
This ensures that when looking for the neighbours of a sample, the layers
above and below are aligned correctly. */
const int HAAR_SIZE_INC = 6;
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
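// For example (illustrative only): calcSize(0, 0) == 9, the 9x9 base filter;
// calcSize(0, 1) == 15; and calcSize(1, 2) == (9 + 6 * 2) << 1 == 42, so for a
// given layer the filter size doubles from one octave to the next.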
class SURF_GPU_Invoker
{
public:
SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty())
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
const int min_size = calcSize(surf_.nOctaves - 1, 0);
CV_Assert(img_rows - min_size >= 0);
CV_Assert(img_cols - min_size >= 0);
const int layer_rows = img_rows >> (surf_.nOctaves - 1);
const int layer_cols = img_cols >> (surf_.nOctaves - 1);
const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
CV_Assert(layer_rows - 2 * min_margin > 0);
CV_Assert(layer_cols - 2 * min_margin > 0);
maxFeatures = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
CV_Assert(maxFeatures > 0);
counters.create(1, surf_.nOctaves + 1, CV_32SC1);
counters.setTo(Scalar::all(0));
loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
bindImgTex(img);
integralBuffered(img, surf_.sum, surf_.intBuffer);
sumOffset = bindSumTex(surf_.sum);
if (use_mask)
{
min(mask, 1.0, surf_.mask1);
integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
maskOffset = bindMaskSumTex(surf_.maskSum);
}
}
void detectKeypoints(GpuMat& keypoints)
{
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
ensureSizeIsEnough(SURF_GPU::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
keypoints.setTo(Scalar::all(0));
for (int octave = 0; octave < surf_.nOctaves; ++octave)
{
const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave;
loadOctaveConstants(octave, layer_rows, layer_cols);
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
unsigned int maxCounter;
cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
if (maxCounter > 0)
{
icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer.ptr<int4>(), maxCounter,
keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
keypoints.ptr<int>(SURF_GPU::LAPLACIAN_ROW), keypoints.ptr<int>(SURF_GPU::OCTAVE_ROW),
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::HESSIAN_ROW),
counters.ptr<unsigned int>());
}
}
unsigned int featureCounter;
cudaSafeCall( cudaMemcpy(&featureCounter, counters.ptr<unsigned int>(), sizeof(unsigned int), cudaMemcpyDeviceToHost) );
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
keypoints.cols = featureCounter;
if (surf_.upright)
keypoints.row(SURF_GPU::ANGLE_ROW).setTo(Scalar::all(360.0 - 90.0));
else
findOrientation(keypoints);
}
void findOrientation(GpuMat& keypoints)
{
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
}
}
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
{
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
}
}
private:
SURF_GPU& surf_;
int img_cols, img_rows;
bool use_mask;
int maxCandidates;
int maxFeatures;
size_t maskOffset;
size_t sumOffset;
GpuMat counters;
};
}
cv::gpu::SURF_GPU::SURF_GPU()
{
hessianThreshold = 100;
extended = true;
nOctaves = 4;
nOctaveLayers = 2;
keypointsRatio = 0.01f;
upright = false;
}
cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
{
hessianThreshold = _threshold;
extended = _extended;
nOctaves = _nOctaves;
nOctaveLayers = _nOctaveLayers;
keypointsRatio = _keypointsRatio;
upright = _upright;
}
int cv::gpu::SURF_GPU::descriptorSize() const
{
return extended ? 128 : 64;
}
void cv::gpu::SURF_GPU::uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU)
{
if (keypoints.empty())
keypointsGPU.release();
else
{
Mat keypointsCPU(SURF_GPU::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);
for (size_t i = 0, size = keypoints.size(); i < size; ++i)
{
const KeyPoint& kp = keypoints[i];
kp_x[i] = kp.pt.x;
kp_y[i] = kp.pt.y;
kp_octave[i] = kp.octave;
kp_size[i] = kp.size;
kp_dir[i] = kp.angle;
kp_hessian[i] = kp.response;
kp_laplacian[i] = 1;
}
keypointsGPU.upload(keypointsCPU);
}
}
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints)
{
const int nFeatures = keypointsGPU.cols;
if (nFeatures == 0)
keypoints.clear();
else
{
CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
Mat keypointsCPU(keypointsGPU);
keypoints.resize(nFeatures);
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);
for (int i = 0; i < nFeatures; ++i)
{
KeyPoint& kp = keypoints[i];
kp.pt.x = kp_x[i];
kp.pt.y = kp_y[i];
kp.class_id = kp_laplacian[i];
kp.octave = kp_octave[i];
kp.size = kp_size[i];
kp.angle = kp_dir[i];
kp.response = kp_hessian[i];
}
}
}
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors)
{
if (descriptorsGPU.empty())
descriptors.clear();
else
{
CV_Assert(descriptorsGPU.type() == CV_32F);
descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
descriptorsGPU.download(descriptorsCPU);
}
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
{
if (!img.empty())
{
SURF_GPU_Invoker surf(*this, img, mask);
surf.detectKeypoints(keypoints);
}
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
bool useProvidedKeypoints)
{
if (!img.empty())
{
SURF_GPU_Invoker surf(*this, img, mask);
if (!useProvidedKeypoints)
surf.detectKeypoints(keypoints);
else if (!upright)
{
surf.findOrientation(keypoints);
}
surf.computeDescriptors(keypoints, descriptors, descriptorSize());
}
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
GpuMat keypointsGPU;
(*this)(img, mask, keypointsGPU);
downloadKeypoints(keypointsGPU, keypoints);
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints,
GpuMat& descriptors, bool useProvidedKeypoints)
{
GpuMat keypointsGPU;
if (useProvidedKeypoints)
uploadKeypoints(keypoints, keypointsGPU);
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
downloadKeypoints(keypointsGPU, keypoints);
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints,
std::vector<float>& descriptors, bool useProvidedKeypoints)
{
GpuMat descriptorsGPU;
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
downloadDescriptors(descriptorsGPU, descriptors);
}
void cv::gpu::SURF_GPU::releaseMemory()
{
sum.release();
mask1.release();
maskSum.release();
intBuffer.release();
det.release();
trace.release();
maxPosBuffer.release();
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -49,71 +49,6 @@ using namespace cv::gpu;
using namespace cvtest;
using namespace testing;
void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
cout << "OS: Windows x64 \n" << endl;
# else
cout << "OS: Windows x32 \n" << endl;
# endif
#elif defined linux
# if defined _LP64
cout << "OS: Linux x64 \n" << endl;
# else
cout << "OS: Linux x32 \n" << endl;
# endif
#elif defined __APPLE__
# if defined _LP64
cout << "OS: Apple x64 \n" << endl;
# else
cout << "OS: Apple x32 \n" << endl;
# endif
#endif
}
void printCudaInfo()
{
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cout << "OpenCV was built without CUDA support \n" << endl;
#else
int driver;
cudaDriverGetVersion(&driver);
cout << "CUDA Driver version: " << driver << '\n';
cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';
cout << endl;
cout << "GPU module was compiled for the following GPU archs:" << endl;
cout << " BIN: " << CUDA_ARCH_BIN << '\n';
cout << " PTX: " << CUDA_ARCH_PTX << '\n';
cout << endl;
int deviceCount = getCudaEnabledDeviceCount();
cout << "CUDA device count: " << deviceCount << '\n';
cout << endl;
for (int i = 0; i < deviceCount; ++i)
{
DeviceInfo info(i);
cout << "Device [" << i << "] \n";
cout << "\t Name: " << info.name() << '\n';
cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion()<< '\n';
cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n";
cout << "\t Free memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n";
if (!info.isCompatible())
cout << "\t !!! This device is NOT compatible with current GPU module build \n";
cout << endl;
}
#endif
}
int main(int argc, char** argv)
{
try
@@ -133,7 +68,6 @@ int main(int argc, char** argv)
return 0;
}
printOsInfo();
printCudaInfo();
if (cmd.has("info"))

View File

@@ -43,9 +43,25 @@
#ifdef HAVE_CUDA
using namespace cvtest;
#if defined(HAVE_XINE) || \
defined(HAVE_GSTREAMER) || \
defined(HAVE_QUICKTIME) || \
defined(HAVE_AVFOUNDATION) || \
defined(HAVE_FFMPEG) || \
defined(WIN32) /* assume that we have ffmpeg */
# define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
#else
# define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
#endif
//////////////////////////////////////////////////////
// FGDStatModel
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
namespace cv
{
template<> void Ptr<CvBGStatModel>::delete_obj()
@@ -130,9 +146,13 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, FGDStatModel, testing::Combine(
testing::Values(std::string("768x576.avi")),
testing::Values(Channels(3), Channels(4))));
#endif
//////////////////////////////////////////////////////
// MOG
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
namespace
{
IMPLEMENT_PARAM_CLASS(UseGray, bool)
@@ -204,9 +224,13 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG, testing::Combine(
testing::Values(LearningRate(0.0), LearningRate(0.01)),
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////
// MOG2
#if BUILD_WITH_VIDEO_INPUT_SUPPORT
namespace
{
IMPLEMENT_PARAM_CLASS(DetectShadow, bool)
@@ -320,46 +344,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG2, testing::Combine(
testing::Values(DetectShadow(true), DetectShadow(false)),
WHOLE_SUBMAT));
//////////////////////////////////////////////////////
// VIBE
PARAM_TEST_CASE(VIBE, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
{
};
GPU_TEST_P(VIBE, Accuracy)
{
const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
const cv::Size size = GET_PARAM(1);
const int type = GET_PARAM(2);
const bool useRoi = GET_PARAM(3);
const cv::Mat fullfg(size, CV_8UC1, cv::Scalar::all(255));
cv::Mat frame = randomMat(size, type, 0.0, 100);
cv::gpu::GpuMat d_frame = loadMat(frame, useRoi);
cv::gpu::VIBE_GPU vibe;
cv::gpu::GpuMat d_fgmask = createMat(size, CV_8UC1, useRoi);
vibe.initialize(d_frame);
for (int i = 0; i < 20; ++i)
vibe(d_frame, d_fgmask);
frame = randomMat(size, type, 160, 255);
d_frame = loadMat(frame, useRoi);
vibe(d_frame, d_fgmask);
// now fgmask should be entirely foreground
ASSERT_MAT_NEAR(fullfg, d_fgmask, 0);
}
INSTANTIATE_TEST_CASE_P(GPU_Video, VIBE, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4)),
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////
// GMG

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
//////////////////////////////////////////////////////////////////////////
// StereoBM

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// cvtColor
@@ -2218,12 +2220,245 @@ GPU_TEST_P(CvtColor, BayerGR2BGR4)
EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
}
GPU_TEST_P(CvtColor, BayerBG2Gray)
{
if ((depth != CV_8U && depth != CV_16U) || useRoi)
return;
cv::Mat src = randomMat(size, depth);
cv::gpu::GpuMat dst;
cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerBG2GRAY);
cv::Mat dst_gold;
cv::cvtColor(src, dst_gold, cv::COLOR_BayerBG2GRAY);
EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}
GPU_TEST_P(CvtColor, BayerGB2Gray)
{
if ((depth != CV_8U && depth != CV_16U) || useRoi)
return;
cv::Mat src = randomMat(size, depth);
cv::gpu::GpuMat dst;
cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGB2GRAY);
cv::Mat dst_gold;
cv::cvtColor(src, dst_gold, cv::COLOR_BayerGB2GRAY);
EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}
GPU_TEST_P(CvtColor, BayerRG2Gray)
{
if ((depth != CV_8U && depth != CV_16U) || useRoi)
return;
cv::Mat src = randomMat(size, depth);
cv::gpu::GpuMat dst;
cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerRG2GRAY);
cv::Mat dst_gold;
cv::cvtColor(src, dst_gold, cv::COLOR_BayerRG2GRAY);
EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}
GPU_TEST_P(CvtColor, BayerGR2Gray)
{
if ((depth != CV_8U && depth != CV_16U) || useRoi)
return;
cv::Mat src = randomMat(size, depth);
cv::gpu::GpuMat dst;
cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGR2GRAY);
cv::Mat dst_gold;
cv::cvtColor(src, dst_gold, cv::COLOR_BayerGR2GRAY);
EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Demosaicing
struct Demosaicing : testing::TestWithParam<cv::gpu::DeviceInfo>
{
cv::gpu::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
}
static void mosaic(const cv::Mat_<cv::Vec3b>& src, cv::Mat_<uchar>& dst, cv::Point firstRed)
{
dst.create(src.size());
for (int y = 0; y < src.rows; ++y)
{
for (int x = 0; x < src.cols; ++x)
{
cv::Vec3b pix = src(y, x);
cv::Point alternate;
alternate.x = (x + firstRed.x) % 2;
alternate.y = (y + firstRed.y) % 2;
if (alternate.y == 0)
{
if (alternate.x == 0)
{
// RG
// GB
dst(y, x) = pix[2];
}
else
{
// GR
// BG
dst(y, x) = pix[1];
}
}
else
{
if (alternate.x == 0)
{
// GB
// RG
dst(y, x) = pix[1];
}
else
{
// BG
// GR
dst(y, x) = pix[0];
}
}
}
}
}
};
GPU_TEST_P(Demosaicing, BayerBG2BGR)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(1, 1));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerBG2BGR);
EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}
GPU_TEST_P(Demosaicing, BayerGB2BGR)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(0, 1));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGB2BGR);
EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}
GPU_TEST_P(Demosaicing, BayerRG2BGR)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(0, 0));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerRG2BGR);
EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}
GPU_TEST_P(Demosaicing, BayerGR2BGR)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(1, 0));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGR2BGR);
EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}
GPU_TEST_P(Demosaicing, BayerBG2BGR_MHT)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(1, 1));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerBG2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
GPU_TEST_P(Demosaicing, BayerGB2BGR_MHT)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(0, 1));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGB2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
GPU_TEST_P(Demosaicing, BayerRG2BGR_MHT)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(0, 0));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerRG2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
GPU_TEST_P(Demosaicing, BayerGR2BGR_MHT)
{
cv::Mat img = readImage("stereobm/aloe-L.png");
cv::Mat_<uchar> src;
mosaic(img, src, cv::Point(1, 0));
cv::gpu::GpuMat dst;
cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGR2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Demosaicing, ALL_DEVICES);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// swapChannels

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
namespace
{
IMPLEMENT_PARAM_CLASS(Border, int)

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
////////////////////////////////////////////////////////////////////////////////
// Merge

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
////////////////////////////////////////////////////////
// BilateralFilter

View File

@@ -43,306 +43,7 @@
#ifdef HAVE_CUDA
namespace
{
bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
{
const double maxPtDif = 1.0;
const double maxSizeDif = 1.0;
const double maxAngleDif = 2.0;
const double maxResponseDif = 0.1;
double dist = cv::norm(p1.pt - p2.pt);
if (dist < maxPtDif &&
fabs(p1.size - p2.size) < maxSizeDif &&
abs(p1.angle - p2.angle) < maxAngleDif &&
abs(p1.response - p2.response) < maxResponseDif &&
p1.octave == p2.octave &&
p1.class_id == p2.class_id)
{
return true;
}
return false;
}
struct KeyPointLess : std::binary_function<cv::KeyPoint, cv::KeyPoint, bool>
{
bool operator()(const cv::KeyPoint& kp1, const cv::KeyPoint& kp2) const
{
return kp1.pt.y < kp2.pt.y || (kp1.pt.y == kp2.pt.y && kp1.pt.x < kp2.pt.x);
}
};
testing::AssertionResult assertKeyPointsEquals(const char* gold_expr, const char* actual_expr, std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
{
if (gold.size() != actual.size())
{
return testing::AssertionFailure() << "KeyPoints size mistmach\n"
<< "\"" << gold_expr << "\" : " << gold.size() << "\n"
<< "\"" << actual_expr << "\" : " << actual.size();
}
std::sort(actual.begin(), actual.end(), KeyPointLess());
std::sort(gold.begin(), gold.end(), KeyPointLess());
for (size_t i = 0; i < gold.size(); ++i)
{
const cv::KeyPoint& p1 = gold[i];
const cv::KeyPoint& p2 = actual[i];
if (!keyPointsEquals(p1, p2))
{
return testing::AssertionFailure() << "KeyPoints differ at " << i << "\n"
<< "\"" << gold_expr << "\" vs \"" << actual_expr << "\" : \n"
<< "pt : " << testing::PrintToString(p1.pt) << " vs " << testing::PrintToString(p2.pt) << "\n"
<< "size : " << p1.size << " vs " << p2.size << "\n"
<< "angle : " << p1.angle << " vs " << p2.angle << "\n"
<< "response : " << p1.response << " vs " << p2.response << "\n"
<< "octave : " << p1.octave << " vs " << p2.octave << "\n"
<< "class_id : " << p1.class_id << " vs " << p2.class_id;
}
}
return ::testing::AssertionSuccess();
}
#define ASSERT_KEYPOINTS_EQ(gold, actual) EXPECT_PRED_FORMAT2(assertKeyPointsEquals, gold, actual);
int getMatchedPointsCount(std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
{
std::sort(actual.begin(), actual.end(), KeyPointLess());
std::sort(gold.begin(), gold.end(), KeyPointLess());
int validCount = 0;
for (size_t i = 0; i < gold.size(); ++i)
{
const cv::KeyPoint& p1 = gold[i];
const cv::KeyPoint& p2 = actual[i];
if (keyPointsEquals(p1, p2))
++validCount;
}
return validCount;
}
int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, const std::vector<cv::KeyPoint>& keypoints2, const std::vector<cv::DMatch>& matches)
{
int validCount = 0;
for (size_t i = 0; i < matches.size(); ++i)
{
const cv::DMatch& m = matches[i];
const cv::KeyPoint& p1 = keypoints1[m.queryIdx];
const cv::KeyPoint& p2 = keypoints2[m.trainIdx];
if (keyPointsEquals(p1, p2))
++validCount;
}
return validCount;
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// SURF
namespace
{
IMPLEMENT_PARAM_CLASS(SURF_HessianThreshold, double)
IMPLEMENT_PARAM_CLASS(SURF_Octaves, int)
IMPLEMENT_PARAM_CLASS(SURF_OctaveLayers, int)
IMPLEMENT_PARAM_CLASS(SURF_Extended, bool)
IMPLEMENT_PARAM_CLASS(SURF_Upright, bool)
}
PARAM_TEST_CASE(SURF, cv::gpu::DeviceInfo, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright)
{
cv::gpu::DeviceInfo devInfo;
double hessianThreshold;
int nOctaves;
int nOctaveLayers;
bool extended;
bool upright;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
hessianThreshold = GET_PARAM(1);
nOctaves = GET_PARAM(2);
nOctaveLayers = GET_PARAM(3);
extended = GET_PARAM(4);
upright = GET_PARAM(5);
cv::gpu::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(SURF, Detector)
{
cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
cv::gpu::SURF_GPU surf;
surf.hessianThreshold = hessianThreshold;
surf.nOctaves = nOctaves;
surf.nOctaveLayers = nOctaveLayers;
surf.extended = extended;
surf.upright = upright;
surf.keypointsRatio = 0.05f;
if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
{
try
{
std::vector<cv::KeyPoint> keypoints;
surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsNotImplemented, e.code);
}
}
else
{
std::vector<cv::KeyPoint> keypoints;
surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
cv::SURF surf_gold;
surf_gold.hessianThreshold = hessianThreshold;
surf_gold.nOctaves = nOctaves;
surf_gold.nOctaveLayers = nOctaveLayers;
surf_gold.extended = extended;
surf_gold.upright = upright;
std::vector<cv::KeyPoint> keypoints_gold;
surf_gold(image, cv::noArray(), keypoints_gold);
ASSERT_EQ(keypoints_gold.size(), keypoints.size());
int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
EXPECT_GT(matchedRatio, 0.95);
}
}
GPU_TEST_P(SURF, Detector_Masked)
{
cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
cv::Mat mask(image.size(), CV_8UC1, cv::Scalar::all(1));
mask(cv::Range(0, image.rows / 2), cv::Range(0, image.cols / 2)).setTo(cv::Scalar::all(0));
cv::gpu::SURF_GPU surf;
surf.hessianThreshold = hessianThreshold;
surf.nOctaves = nOctaves;
surf.nOctaveLayers = nOctaveLayers;
surf.extended = extended;
surf.upright = upright;
surf.keypointsRatio = 0.05f;
if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
{
try
{
std::vector<cv::KeyPoint> keypoints;
surf(loadMat(image), loadMat(mask), keypoints);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsNotImplemented, e.code);
}
}
else
{
std::vector<cv::KeyPoint> keypoints;
surf(loadMat(image), loadMat(mask), keypoints);
cv::SURF surf_gold;
surf_gold.hessianThreshold = hessianThreshold;
surf_gold.nOctaves = nOctaves;
surf_gold.nOctaveLayers = nOctaveLayers;
surf_gold.extended = extended;
surf_gold.upright = upright;
std::vector<cv::KeyPoint> keypoints_gold;
surf_gold(image, mask, keypoints_gold);
ASSERT_EQ(keypoints_gold.size(), keypoints.size());
int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
EXPECT_GT(matchedRatio, 0.95);
}
}
GPU_TEST_P(SURF, Descriptor)
{
cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
cv::gpu::SURF_GPU surf;
surf.hessianThreshold = hessianThreshold;
surf.nOctaves = nOctaves;
surf.nOctaveLayers = nOctaveLayers;
surf.extended = extended;
surf.upright = upright;
surf.keypointsRatio = 0.05f;
cv::SURF surf_gold;
surf_gold.hessianThreshold = hessianThreshold;
surf_gold.nOctaves = nOctaves;
surf_gold.nOctaveLayers = nOctaveLayers;
surf_gold.extended = extended;
surf_gold.upright = upright;
if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
{
try
{
std::vector<cv::KeyPoint> keypoints;
cv::gpu::GpuMat descriptors;
surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(CV_StsNotImplemented, e.code);
}
}
else
{
std::vector<cv::KeyPoint> keypoints;
surf_gold(image, cv::noArray(), keypoints);
cv::gpu::GpuMat descriptors;
surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);
cv::Mat descriptors_gold;
surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);
cv::BFMatcher matcher(cv::NORM_L2);
std::vector<cv::DMatch> matches;
matcher.match(descriptors_gold, cv::Mat(descriptors), matches);
int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();
EXPECT_GT(matchedRatio, 0.6);
}
}
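// testing::Combine instantiates the SURF tests on every available device and over the
// full Cartesian product of the parameter values listed below.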
INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
ALL_DEVICES,
testing::Values(SURF_HessianThreshold(100.0), SURF_HessianThreshold(500.0), SURF_HessianThreshold(1000.0)),
testing::Values(SURF_Octaves(3), SURF_Octaves(4)),
testing::Values(SURF_OctaveLayers(2), SURF_OctaveLayers(3)),
testing::Values(SURF_Extended(false), SURF_Extended(true)),
testing::Values(SURF_Upright(false), SURF_Upright(true))));
using namespace cvtest;
/////////////////////////////////////////////////////////////////////////////////////////////////
// FAST

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
namespace
{
IMPLEMENT_PARAM_CLASS(KSize, cv::Size)

View File

@@ -44,6 +44,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
////////////////////////////////////////////////////////////////////////////////
// SetTo

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HoughLines

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Integral

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
//#define DUMP
struct HOG : testing::TestWithParam<cv::gpu::DeviceInfo>, cv::gpu::HOGDescriptor

View File

@@ -43,6 +43,8 @@
#if defined(HAVE_CUDA) && defined(HAVE_OPENGL)
using namespace cvtest;
/////////////////////////////////////////////
// Buffer

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
//////////////////////////////////////////////////////
// BroxOpticalFlow

View File

@@ -76,11 +76,10 @@
#include "opencv2/imgproc.hpp"
#include "opencv2/video.hpp"
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_test.hpp"
#include "opencv2/gpu.hpp"
#include "opencv2/nonfree.hpp"
#include "opencv2/legacy.hpp"
#include "utility.hpp"
#include "interpolation.hpp"
#include "main_test_nvidia.h"
#endif

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
////////////////////////////////////////////////////////
// pyrDown

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
///////////////////////////////////////////////////////////////////
// Gold implementation

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
///////////////////////////////////////////////////////////////////
// Gold implementation

View File

@@ -44,6 +44,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
#if CUDA_VERSION >= 5000
struct Async : testing::TestWithParam<cv::gpu::DeviceInfo>

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
namespace
{
cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)

View File

@@ -43,6 +43,8 @@
#ifdef HAVE_CUDA
using namespace cvtest;
namespace
{
cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)

View File

@@ -1,407 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace cv;
using namespace cv::gpu;
using namespace cvtest;
using namespace testing;
using namespace testing::internal;
//////////////////////////////////////////////////////////////////////
// random generators
int randomInt(int minVal, int maxVal)
{
RNG& rng = TS::ptr()->get_rng();
return rng.uniform(minVal, maxVal);
}
double randomDouble(double minVal, double maxVal)
{
RNG& rng = TS::ptr()->get_rng();
return rng.uniform(minVal, maxVal);
}
Size randomSize(int minVal, int maxVal)
{
return Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
}
Scalar randomScalar(double minVal, double maxVal)
{
return Scalar(randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal));
}
Mat randomMat(Size size, int type, double minVal, double maxVal)
{
return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
}
//////////////////////////////////////////////////////////////////////
// GpuMat create
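// When useRoi is set, createMat allocates a slightly larger buffer and returns a centered
// sub-rectangle of it, so the tests also exercise non-continuous GpuMat layouts.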
GpuMat createMat(Size size, int type, bool useRoi)
{
Size size0 = size;
if (useRoi)
{
size0.width += randomInt(5, 15);
size0.height += randomInt(5, 15);
}
GpuMat d_m(size0, type);
if (size0 != size)
d_m = d_m(Rect((size0.width - size.width) / 2, (size0.height - size.height) / 2, size.width, size.height));
return d_m;
}
GpuMat loadMat(const Mat& m, bool useRoi)
{
GpuMat d_m = createMat(m.size(), m.type(), useRoi);
d_m.upload(m);
return d_m;
}
//////////////////////////////////////////////////////////////////////
// Image load
Mat readImage(const std::string& fileName, int flags)
{
return imread(TS::ptr()->get_data_path() + fileName, flags);
}
Mat readImageType(const std::string& fname, int type)
{
Mat src = readImage(fname, CV_MAT_CN(type) == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
if (CV_MAT_CN(type) == 4)
{
Mat temp;
cvtColor(src, temp, COLOR_BGR2BGRA);
swap(src, temp);
}
src.convertTo(src, CV_MAT_DEPTH(type), CV_MAT_DEPTH(type) == CV_32F ? 1.0 / 255.0 : 1.0);
return src;
}
//////////////////////////////////////////////////////////////////////
// Gpu devices
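// A feature counts as supported only when the gpu module was built for it (TargetArchs)
// and the active device reports it at run time.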
bool supportFeature(const DeviceInfo& info, FeatureSet feature)
{
return TargetArchs::builtWith(feature) && info.supports(feature);
}
DeviceManager& DeviceManager::instance()
{
static DeviceManager obj;
return obj;
}
void DeviceManager::load(int i)
{
devices_.clear();
devices_.reserve(1);
std::ostringstream msg;
if (i < 0 || i >= getCudaEnabledDeviceCount())
{
msg << "Incorrect device number - " << i;
throw runtime_error(msg.str());
}
DeviceInfo info(i);
if (!info.isCompatible())
{
msg << "Device " << i << " [" << info.name() << "] is NOT compatible with current GPU module build";
throw runtime_error(msg.str());
}
devices_.push_back(info);
}
void DeviceManager::loadAll()
{
int deviceCount = getCudaEnabledDeviceCount();
devices_.clear();
devices_.reserve(deviceCount);
for (int i = 0; i < deviceCount; ++i)
{
DeviceInfo info(i);
if (info.isCompatible())
{
devices_.push_back(info);
}
}
}
//////////////////////////////////////////////////////////////////////
// Additional assertion
namespace
{
template <typename T, typename OutT> std::string printMatValImpl(const Mat& m, Point p)
{
const int cn = m.channels();
std::ostringstream ostr;
ostr << "(";
p.x /= cn;
ostr << static_cast<OutT>(m.at<T>(p.y, p.x * cn));
for (int c = 1; c < m.channels(); ++c)
{
ostr << ", " << static_cast<OutT>(m.at<T>(p.y, p.x * cn + c));
}
ostr << ")";
return ostr.str();
}
std::string printMatVal(const Mat& m, Point p)
{
typedef std::string (*func_t)(const Mat& m, Point p);
static const func_t funcs[] =
{
printMatValImpl<uchar, int>, printMatValImpl<schar, int>, printMatValImpl<ushort, int>, printMatValImpl<short, int>,
printMatValImpl<int, int>, printMatValImpl<float, float>, printMatValImpl<double, double>
};
return funcs[m.depth()](m, p);
}
}
void minMaxLocGold(const Mat& src, double* minVal_, double* maxVal_, Point* minLoc_, Point* maxLoc_, const Mat& mask)
{
if (src.depth() != CV_8S)
{
minMaxLoc(src, minVal_, maxVal_, minLoc_, maxLoc_, mask);
return;
}
// OpenCV's minMaxLoc doesn't support CV_8S type
double minVal = numeric_limits<double>::max();
Point minLoc(-1, -1);
double maxVal = -numeric_limits<double>::max();
Point maxLoc(-1, -1);
for (int y = 0; y < src.rows; ++y)
{
const schar* src_row = src.ptr<schar>(y);
const uchar* mask_row = mask.empty() ? 0 : mask.ptr<uchar>(y);
for (int x = 0; x < src.cols; ++x)
{
if (!mask_row || mask_row[x])
{
schar val = src_row[x];
if (val < minVal)
{
minVal = val;
minLoc = cv::Point(x, y);
}
if (val > maxVal)
{
maxVal = val;
maxLoc = cv::Point(x, y);
}
}
}
}
if (minVal_) *minVal_ = minVal;
if (maxVal_) *maxVal_ = maxVal;
if (minLoc_) *minLoc_ = minLoc;
if (maxLoc_) *maxLoc_ = maxLoc;
}
Mat getMat(InputArray arr)
{
if (arr.kind() == _InputArray::GPU_MAT)
{
Mat m;
arr.getGpuMat().download(m);
return m;
}
return arr.getMat();
}
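// assertMatNear backs EXPECT_MAT_NEAR/ASSERT_MAT_NEAR: it downloads GpuMat arguments,
// verifies size and type, and reports the position and values of the largest absolute
// difference when it exceeds eps.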
AssertionResult assertMatNear(const char* expr1, const char* expr2, const char* eps_expr, InputArray m1_, InputArray m2_, double eps)
{
Mat m1 = getMat(m1_);
Mat m2 = getMat(m2_);
if (m1.size() != m2.size())
{
return AssertionFailure() << "Matrices \"" << expr1 << "\" and \"" << expr2 << "\" have different sizes : \""
<< expr1 << "\" [" << PrintToString(m1.size()) << "] vs \""
<< expr2 << "\" [" << PrintToString(m2.size()) << "]";
}
if (m1.type() != m2.type())
{
return AssertionFailure() << "Matrices \"" << expr1 << "\" and \"" << expr2 << "\" have different types : \""
<< expr1 << "\" [" << PrintToString(MatType(m1.type())) << "] vs \""
<< expr2 << "\" [" << PrintToString(MatType(m2.type())) << "]";
}
Mat diff;
absdiff(m1.reshape(1), m2.reshape(1), diff);
double maxVal = 0.0;
Point maxLoc;
minMaxLocGold(diff, 0, &maxVal, 0, &maxLoc);
if (maxVal > eps)
{
return AssertionFailure() << "The max difference between matrices \"" << expr1 << "\" and \"" << expr2
<< "\" is " << maxVal << " at (" << maxLoc.y << ", " << maxLoc.x / m1.channels() << ")"
<< ", which exceeds \"" << eps_expr << "\", where \""
<< expr1 << "\" at (" << maxLoc.y << ", " << maxLoc.x / m1.channels() << ") evaluates to " << printMatVal(m1, maxLoc) << ", \""
<< expr2 << "\" at (" << maxLoc.y << ", " << maxLoc.x / m1.channels() << ") evaluates to " << printMatVal(m2, maxLoc) << ", \""
<< eps_expr << "\" evaluates to " << eps;
}
return AssertionSuccess();
}
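// checkSimilarity returns how far the normalized cross-correlation of the two images is
// from a perfect match; 0 means the images are identical.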
double checkSimilarity(InputArray m1, InputArray m2)
{
Mat diff;
matchTemplate(getMat(m1), getMat(m2), diff, CV_TM_CCORR_NORMED);
return std::abs(diff.at<float>(0, 0) - 1.f);
}
//////////////////////////////////////////////////////////////////////
// Helper structs for value-parameterized tests
vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
{
vector<MatType> v;
v.reserve((depth_end - depth_start + 1) * (cn_end - cn_start + 1));
for (int depth = depth_start; depth <= depth_end; ++depth)
{
for (int cn = cn_start; cn <= cn_end; ++cn)
{
v.push_back(MatType(CV_MAKE_TYPE(depth, cn)));
}
}
return v;
}
const vector<MatType>& all_types()
{
static vector<MatType> v = types(CV_8U, CV_64F, 1, 4);
return v;
}
void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
{
(*os) << info.name();
}
void PrintTo(const UseRoi& useRoi, std::ostream* os)
{
if (useRoi)
(*os) << "sub matrix";
else
(*os) << "whole matrix";
}
void PrintTo(const Inverse& inverse, std::ostream* os)
{
if (inverse)
(*os) << "inverse";
else
(*os) << "direct";
}
//////////////////////////////////////////////////////////////////////
// Other
void dumpImage(const std::string& fileName, const Mat& image)
{
imwrite(TS::ptr()->get_data_path() + fileName, image);
}
void showDiff(InputArray gold_, InputArray actual_, double eps)
{
Mat gold = getMat(gold_);
Mat actual = getMat(actual_);
Mat diff;
absdiff(gold, actual, diff);
threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
namedWindow("gold", WINDOW_NORMAL);
namedWindow("actual", WINDOW_NORMAL);
namedWindow("diff", WINDOW_NORMAL);
imshow("gold", gold);
imshow("actual", actual);
imshow("diff", diff);
waitKey();
}
#endif // HAVE_CUDA

View File

@@ -1,331 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_TEST_UTILITY_HPP__
#define __OPENCV_GPU_TEST_UTILITY_HPP__
#include "opencv2/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/ts.hpp"
//////////////////////////////////////////////////////////////////////
// random generators
int randomInt(int minVal, int maxVal);
double randomDouble(double minVal, double maxVal);
cv::Size randomSize(int minVal, int maxVal);
cv::Scalar randomScalar(double minVal, double maxVal);
cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal = 255.0);
//////////////////////////////////////////////////////////////////////
// GpuMat create
cv::gpu::GpuMat createMat(cv::Size size, int type, bool useRoi = false);
cv::gpu::GpuMat loadMat(const cv::Mat& m, bool useRoi = false);
//////////////////////////////////////////////////////////////////////
// Image load
//! read image from testdata folder
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
//! read image from testdata folder and convert it to specified type
cv::Mat readImageType(const std::string& fname, int type);
//////////////////////////////////////////////////////////////////////
// Gpu devices
//! return true if the device supports the specified feature and the gpu module was built with support for it.
bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);
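// DeviceManager collects the CUDA devices the test suite runs on: either a single
// user-selected device (load) or every compatible device (loadAll).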
class DeviceManager
{
public:
static DeviceManager& instance();
void load(int i);
void loadAll();
const std::vector<cv::gpu::DeviceInfo>& values() const { return devices_; }
private:
std::vector<cv::gpu::DeviceInfo> devices_;
};
#define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values())
//////////////////////////////////////////////////////////////////////
// Additional assertion
void minMaxLocGold(const cv::Mat& src, double* minVal_, double* maxVal_ = 0, cv::Point* minLoc_ = 0, cv::Point* maxLoc_ = 0, const cv::Mat& mask = cv::Mat());
cv::Mat getMat(cv::InputArray arr);
testing::AssertionResult assertMatNear(const char* expr1, const char* expr2, const char* eps_expr, cv::InputArray m1, cv::InputArray m2, double eps);
#define EXPECT_MAT_NEAR(m1, m2, eps) EXPECT_PRED_FORMAT3(assertMatNear, m1, m2, eps)
#define ASSERT_MAT_NEAR(m1, m2, eps) ASSERT_PRED_FORMAT3(assertMatNear, m1, m2, eps)
#define EXPECT_SCALAR_NEAR(s1, s2, eps) \
{ \
EXPECT_NEAR(s1[0], s2[0], eps); \
EXPECT_NEAR(s1[1], s2[1], eps); \
EXPECT_NEAR(s1[2], s2[2], eps); \
EXPECT_NEAR(s1[3], s2[3], eps); \
}
#define ASSERT_SCALAR_NEAR(s1, s2, eps) \
{ \
ASSERT_NEAR(s1[0], s2[0], eps); \
ASSERT_NEAR(s1[1], s2[1], eps); \
ASSERT_NEAR(s1[2], s2[2], eps); \
ASSERT_NEAR(s1[3], s2[3], eps); \
}
#define EXPECT_POINT2_NEAR(p1, p2, eps) \
{ \
EXPECT_NEAR(p1.x, p2.x, eps); \
EXPECT_NEAR(p1.y, p2.y, eps); \
}
#define ASSERT_POINT2_NEAR(p1, p2, eps) \
{ \
ASSERT_NEAR(p1.x, p2.x, eps); \
ASSERT_NEAR(p1.y, p2.y, eps); \
}
#define EXPECT_POINT3_NEAR(p1, p2, eps) \
{ \
EXPECT_NEAR(p1.x, p2.x, eps); \
EXPECT_NEAR(p1.y, p2.y, eps); \
EXPECT_NEAR(p1.z, p2.z, eps); \
}
#define ASSERT_POINT3_NEAR(p1, p2, eps) \
{ \
ASSERT_NEAR(p1.x, p2.x, eps); \
ASSERT_NEAR(p1.y, p2.y, eps); \
ASSERT_NEAR(p1.z, p2.z, eps); \
}
double checkSimilarity(cv::InputArray m1, cv::InputArray m2);
#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
{ \
ASSERT_EQ(mat1.type(), mat2.type()); \
ASSERT_EQ(mat1.size(), mat2.size()); \
EXPECT_LE(checkSimilarity(mat1, mat2), eps); \
}
#define ASSERT_MAT_SIMILAR(mat1, mat2, eps) \
{ \
ASSERT_EQ(mat1.type(), mat2.type()); \
ASSERT_EQ(mat1.size(), mat2.size()); \
ASSERT_LE(checkSimilarity(mat1, mat2), eps); \
}
//////////////////////////////////////////////////////////////////////
// Helper structs for value-parameterized tests
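// GPU_TEST_P mirrors gtest's TEST_P, but wraps the real test body so that any exception
// escaping a test resets the CUDA device before being rethrown, leaving a clean device
// for the remaining tests.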
#define GPU_TEST_P(test_case_name, test_name) \
class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
: public test_case_name { \
public: \
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
virtual void TestBody(); \
private: \
void UnsafeTestBody(); \
static int AddToRegistry() { \
::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
GetTestCasePatternHolder<test_case_name>(\
#test_case_name, __FILE__, __LINE__)->AddTestPattern(\
#test_case_name, \
#test_name, \
new ::testing::internal::TestMetaFactory< \
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
return 0; \
} \
static int gtest_registering_dummy_; \
GTEST_DISALLOW_COPY_AND_ASSIGN_(\
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
}; \
int GTEST_TEST_CLASS_NAME_(test_case_name, \
test_name)::gtest_registering_dummy_ = \
GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() \
{ \
try \
{ \
UnsafeTestBody(); \
} \
catch (...) \
{ \
cv::gpu::resetDevice(); \
throw; \
} \
} \
void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::UnsafeTestBody()
#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
namespace cv { namespace gpu
{
void PrintTo(const DeviceInfo& info, std::ostream* os);
}}
#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))
// Depth
using perf::MatDepth;
#define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_8S), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32S), MatDepth(CV_32F), MatDepth(CV_64F))
#define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)), \
std::make_pair(MatDepth(CV_8U), MatDepth(CV_16U)), \
std::make_pair(MatDepth(CV_8U), MatDepth(CV_16S)), \
std::make_pair(MatDepth(CV_8U), MatDepth(CV_32S)), \
std::make_pair(MatDepth(CV_8U), MatDepth(CV_32F)), \
std::make_pair(MatDepth(CV_8U), MatDepth(CV_64F)), \
\
std::make_pair(MatDepth(CV_16U), MatDepth(CV_16U)), \
std::make_pair(MatDepth(CV_16U), MatDepth(CV_32S)), \
std::make_pair(MatDepth(CV_16U), MatDepth(CV_32F)), \
std::make_pair(MatDepth(CV_16U), MatDepth(CV_64F)), \
\
std::make_pair(MatDepth(CV_16S), MatDepth(CV_16S)), \
std::make_pair(MatDepth(CV_16S), MatDepth(CV_32S)), \
std::make_pair(MatDepth(CV_16S), MatDepth(CV_32F)), \
std::make_pair(MatDepth(CV_16S), MatDepth(CV_64F)), \
\
std::make_pair(MatDepth(CV_32S), MatDepth(CV_32S)), \
std::make_pair(MatDepth(CV_32S), MatDepth(CV_32F)), \
std::make_pair(MatDepth(CV_32S), MatDepth(CV_64F)), \
\
std::make_pair(MatDepth(CV_32F), MatDepth(CV_32F)), \
std::make_pair(MatDepth(CV_32F), MatDepth(CV_64F)), \
\
std::make_pair(MatDepth(CV_64F), MatDepth(CV_64F)))
// Type
using perf::MatType;
//! return a vector with the types in the specified depth and channel ranges.
std::vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end);
//! return a vector with all types (depth: CV_8U-CV_64F, channels: 1-4).
const std::vector<MatType>& all_types();
#define ALL_TYPES testing::ValuesIn(all_types())
#define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))
// ROI
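// UseRoi selects whether a test runs on a whole matrix or on a sub-matrix view;
// WHOLE_SUBMAT below runs every test both ways.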
class UseRoi
{
public:
inline UseRoi(bool val = false) : val_(val) {}
inline operator bool() const { return val_; }
private:
bool val_;
};
void PrintTo(const UseRoi& useRoi, std::ostream* os);
#define WHOLE_SUBMAT testing::Values(UseRoi(false), UseRoi(true))
// Direct/Inverse
class Inverse
{
public:
inline Inverse(bool val = false) : val_(val) {}
inline operator bool() const { return val_; }
private:
bool val_;
};
void PrintTo(const Inverse& inverse, std::ostream* os);
#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))
// Param class
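// IMPLEMENT_PARAM_CLASS wraps a plain value type in a named class with a PrintTo overload,
// so parameter values appear as Name(value) in generated test names.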
#define IMPLEMENT_PARAM_CLASS(name, type) \
class name \
{ \
public: \
name ( type arg = type ()) : val_(arg) {} \
operator type () const {return val_;} \
private: \
type val_; \
}; \
inline void PrintTo( name param, std::ostream* os) \
{ \
*os << #name << "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
}
IMPLEMENT_PARAM_CLASS(Channels, int)
#define ALL_CHANNELS testing::Values(Channels(1), Channels(2), Channels(3), Channels(4))
#define IMAGE_CHANNELS testing::Values(Channels(1), Channels(3), Channels(4))
// Flags and enums
CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
CV_ENUM(BorderType, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
#define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP))
CV_FLAGS(WarpFlags, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::WARP_INVERSE_MAP)
//////////////////////////////////////////////////////////////////////
// Other
void dumpImage(const std::string& fileName, const cv::Mat& image);
void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);
#endif // __OPENCV_GPU_TEST_UTILITY_HPP__