Merge branch '2.4'
@@ -3,7 +3,7 @@ if(ANDROID OR IOS)
endif()

set(the_description "GPU-accelerated Computer Vision")
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy)
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)

ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
@@ -5,109 +5,6 @@ Feature Detection and Description


gpu::SURF_GPU
-------------
.. ocv:class:: gpu::SURF_GPU

Class used for extracting Speeded Up Robust Features (SURF) from an image. ::

    class SURF_GPU
    {
    public:
        enum KeypointLayout
        {
            X_ROW = 0,
            Y_ROW,
            LAPLACIAN_ROW,
            OCTAVE_ROW,
            SIZE_ROW,
            ANGLE_ROW,
            HESSIAN_ROW,
            ROWS_COUNT
        };

        //! the default constructor
        SURF_GPU();
        //! the full constructor taking all the necessary parameters
        explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
             int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f);

        //! returns the descriptor size in float's (64 or 128)
        int descriptorSize() const;

        //! upload host keypoints to device memory
        void uploadKeypoints(const vector<KeyPoint>& keypoints,
            GpuMat& keypointsGPU);
        //! download keypoints from device to host memory
        void downloadKeypoints(const GpuMat& keypointsGPU,
            vector<KeyPoint>& keypoints);

        //! download descriptors from device to host memory
        void downloadDescriptors(const GpuMat& descriptorsGPU,
            vector<float>& descriptors);

        void operator()(const GpuMat& img, const GpuMat& mask,
            GpuMat& keypoints);

        void operator()(const GpuMat& img, const GpuMat& mask,
            GpuMat& keypoints, GpuMat& descriptors,
            bool useProvidedKeypoints = false,
            bool calcOrientation = true);

        void operator()(const GpuMat& img, const GpuMat& mask,
            std::vector<KeyPoint>& keypoints);

        void operator()(const GpuMat& img, const GpuMat& mask,
            std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
            bool useProvidedKeypoints = false,
            bool calcOrientation = true);

        void operator()(const GpuMat& img, const GpuMat& mask,
            std::vector<KeyPoint>& keypoints,
            std::vector<float>& descriptors,
            bool useProvidedKeypoints = false,
            bool calcOrientation = true);

        void releaseMemory();

        // SURF parameters
        double hessianThreshold;
        int nOctaves;
        int nOctaveLayers;
        bool extended;
        bool upright;

        //! max keypoints = keypointsRatio * img.size().area()
        float keypointsRatio;

        GpuMat sum, mask1, maskSum, intBuffer;

        GpuMat det, trace;

        GpuMat maxPosBuffer;
    };


The class ``SURF_GPU`` implements the Speeded Up Robust Features (SURF) descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for user-specified keypoints. Only 8-bit grayscale images are supported.

The class ``SURF_GPU`` can store results in GPU and CPU memory. It provides functions to convert results between the CPU and GPU representations ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``GpuMat``. The ``keypoints`` matrix is a :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.

* ``keypoints.ptr<float>(X_ROW)[i]`` contains the x coordinate of the i-th feature.
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains the y coordinate of the i-th feature.
* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the Laplacian sign of the i-th feature.
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contains the orientation of the i-th feature.
* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.

The ``descriptors`` matrix is a :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.

The class ``SURF_GPU`` uses several buffers and provides access to them. All buffers can be safely released between function calls.
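
A minimal usage sketch (illustrative only; assumes ``img`` is an 8-bit grayscale ``cv::Mat`` already loaded): ::

    cv::gpu::SURF_GPU surf(400.0);                  // hessianThreshold = 400
    cv::gpu::GpuMat d_img(img), d_keypoints, d_descriptors;
    surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);

    std::vector<cv::KeyPoint> keypoints;            // convert to CPU format
    surf.downloadKeypoints(d_keypoints, keypoints);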

.. seealso:: :ocv:class:`SURF`



gpu::FAST_GPU
-------------
.. ocv:class:: gpu::FAST_GPU

@@ -579,76 +579,6 @@ Releases all inner buffer's memory.

gpu::VIBE_GPU
-------------
.. ocv:class:: gpu::VIBE_GPU

Class used for background/foreground segmentation. ::

    class VIBE_GPU
    {
    public:
        explicit VIBE_GPU(unsigned long rngSeed = 1234567);

        void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());

        void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());

        void release();

        ...
    };

The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [VIBE2011]_.
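
A minimal usage sketch (illustrative only; ``firstFrame`` and ``frame`` stand for consecutive ``GpuMat`` video frames): ::

    cv::gpu::VIBE_GPU vibe;
    vibe.initialize(firstFrame);   // build the initial background model

    cv::gpu::GpuMat fgmask;
    vibe(frame, fgmask);           // update the model, get the foreground mask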


gpu::VIBE_GPU::VIBE_GPU
-----------------------
The constructor.

.. ocv:function:: gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed = 1234567)

    :param rngSeed: Value used to initiate a random sequence.

The default constructor sets all parameters to default values.



gpu::VIBE_GPU::initialize
-------------------------
Initializes the background model and allocates all inner buffers.

.. ocv:function:: void gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null())

    :param firstFrame: First frame from the video sequence.

    :param stream: Stream for the asynchronous version.



gpu::VIBE_GPU::operator()
-------------------------
Updates the background model and returns the foreground mask.

.. ocv:function:: void gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null())

    :param frame: Next video frame.

    :param fgmask: The output foreground mask as an 8-bit binary image.

    :param stream: Stream for the asynchronous version.



gpu::VIBE_GPU::release
----------------------
Releases all inner buffers' memory.

.. ocv:function:: void gpu::VIBE_GPU::release()



gpu::GMG_GPU
------------
.. ocv:class:: gpu::GMG_GPU

@@ -1209,5 +1139,4 @@ Parse next video frame. Implementation must call this method after new frame was

.. [MOG2001] P. KadewTraKuPong and R. Bowden. *An improved adaptive background mixture model for real-time tracking with shadow detection*. Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001
.. [MOG2004] Z. Zivkovic. *Improved adaptive Gaussian mixture model for background subtraction*. International Conference on Pattern Recognition, UK, August, 2004
.. [ShadowDetect2003] Prati, Mikic, Trivedi and Cucchiara. *Detecting Moving Shadows...*. IEEE PAMI, 2003
.. [VIBE2011] O. Barnich and M. Van Droogenbroeck. *ViBe: A universal background subtraction algorithm for video sequences*. IEEE Transactions on Image Processing, 20(6):1709-1724, June 2011
.. [GMG2012] A. Godbehere, A. Matsukawa and K. Goldberg. *Visual Tracking of Human Visitors under Variable-Lighting Conditions for a Responsive Audio Art Installation*. American Control Conference, Montreal, June 2012

@@ -491,6 +491,26 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat&
//! converts image from one color space to another
CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());

enum
{
    // Bayer Demosaicing (Malvar, He, and Cutler)
    COLOR_BayerBG2BGR_MHT = 256,
    COLOR_BayerGB2BGR_MHT = 257,
    COLOR_BayerRG2BGR_MHT = 258,
    COLOR_BayerGR2BGR_MHT = 259,

    COLOR_BayerBG2RGB_MHT = COLOR_BayerRG2BGR_MHT,
    COLOR_BayerGB2RGB_MHT = COLOR_BayerGR2BGR_MHT,
    COLOR_BayerRG2RGB_MHT = COLOR_BayerBG2BGR_MHT,
    COLOR_BayerGR2RGB_MHT = COLOR_BayerGB2BGR_MHT,

    COLOR_BayerBG2GRAY_MHT = 260,
    COLOR_BayerGB2GRAY_MHT = 261,
    COLOR_BayerRG2GRAY_MHT = 262,
    COLOR_BayerGR2GRAY_MHT = 263
};
CV_EXPORTS void demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn = -1, Stream& stream = Stream::Null());
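//! usage sketch (illustrative comment, not part of the original header):
//!     cv::gpu::GpuMat d_bayer(bayer8u), d_bgr;   // bayer8u: a CV_8UC1 Bayer mosaic
//!     cv::gpu::demosaicing(d_bayer, d_bgr, cv::gpu::COLOR_BayerBG2BGR_MHT);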

//! swap channels
//! dstOrder - Integer array describing how channel values are permuted. The n-th entry
//! of the array contains the number of the channel that is stored in the n-th channel of
@@ -894,9 +914,11 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels
//! Calculates histogram for 8u one channel image
//! Output hist will have one row, 256 cols and CV_32SC1 type.
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());

//! normalizes the grayscale image brightness and contrast by normalizing its histogram
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
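//! usage sketch (illustrative comment, not part of the original header): the
//! optional hist/buf arguments let callers reuse temporaries across frames
//! instead of reallocating them on every call:
//!     cv::gpu::GpuMat d_hist, d_buf;
//!     cv::gpu::equalizeHist(d_src, d_dst, d_hist, d_buf);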

//////////////////////////////// StereoBM_GPU ////////////////////////////////

@@ -1386,82 +1408,6 @@ private:
    friend class CascadeClassifier_GPU_LBP;
};

////////////////////////////////// SURF //////////////////////////////////////////

class CV_EXPORTS SURF_GPU
{
public:
    enum KeypointLayout
    {
        X_ROW = 0,
        Y_ROW,
        LAPLACIAN_ROW,
        OCTAVE_ROW,
        SIZE_ROW,
        ANGLE_ROW,
        HESSIAN_ROW,
        ROWS_COUNT
    };

    //! the default constructor
    SURF_GPU();
    //! the full constructor taking all the necessary parameters
    explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
         int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);

    //! returns the descriptor size in float's (64 or 128)
    int descriptorSize() const;

    //! upload host keypoints to device memory
    static void uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
    //! download keypoints from device to host memory
    static void downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints);

    //! download descriptors from device to host memory
    static void downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors);

    //! finds the keypoints using fast hessian detector used in SURF
    //! supports CV_8UC1 images
    //! keypoints will have nFeature cols and 6 rows
    //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
    //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
    //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
    //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
    //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
    //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
    //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
    //! finds the keypoints and computes their descriptors.
    //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
        bool useProvidedKeypoints = false);

    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
        bool useProvidedKeypoints = false);

    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
        bool useProvidedKeypoints = false);

    void releaseMemory();

    // SURF parameters
    double hessianThreshold;
    int nOctaves;
    int nOctaveLayers;
    bool extended;
    bool upright;

    //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
    float keypointsRatio;

    GpuMat sum, mask1, maskSum, intBuffer;

    GpuMat det, trace;

    GpuMat maxPosBuffer;
};

////////////////////////////////// FAST //////////////////////////////////////////

class CV_EXPORTS FAST_GPU
@@ -2129,41 +2075,6 @@ private:
    GpuMat bgmodelUsedModes_; //keep track of number of modes per pixel
};

/*!
 * The class implements the following algorithm:
 * "ViBe: A universal background subtraction algorithm for video sequences"
 * O. Barnich and M. Van Droogenbroeck
 * IEEE Transactions on Image Processing, 20(6):1709-1724, June 2011
 */
class CV_EXPORTS VIBE_GPU
{
public:
    //! the default constructor
    explicit VIBE_GPU(unsigned long rngSeed = 1234567);

    //! re-initialization method
    void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());

    //! the update operator
    void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());

    //! releases all inner buffers
    void release();

    int nbSamples;         // number of samples per pixel
    int reqMatches;        // #_min
    int radius;            // R
    int subsamplingFactor; // amount of random subsampling

private:
    Size frameSize_;

    unsigned long rngSeed_;
    GpuMat randStates_;

    GpuMat samples_;
};

/**
 * Background Subtractor module. Takes a series of images and returns a sequence of mask (8UC1)
 * images of the same size, where 255 indicates Foreground and 0 represents Background.

modules/gpu/include/opencv2/gpu/device/simd_functions.hpp (new file, 910 lines)
@@ -0,0 +1,910 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2010-2013, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__

#include "common.hpp"

/*
    This header file contains inline functions that implement intra-word SIMD
    operations that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
    emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
    to make the code portable across all GPUs supported by CUDA. The following
    functions are currently implemented:

    vadd2(a,b)      per-halfword unsigned addition, with wrap-around: a + b
    vsub2(a,b)      per-halfword unsigned subtraction, with wrap-around: a - b
    vabsdiff2(a,b)  per-halfword unsigned absolute difference: |a - b|
    vavg2(a,b)      per-halfword unsigned average: (a + b) / 2
    vavrg2(a,b)     per-halfword unsigned rounded average: (a + b + 1) / 2
    vseteq2(a,b)    per-halfword unsigned comparison: a == b ? 1 : 0
    vcmpeq2(a,b)    per-halfword unsigned comparison: a == b ? 0xffff : 0
    vsetge2(a,b)    per-halfword unsigned comparison: a >= b ? 1 : 0
    vcmpge2(a,b)    per-halfword unsigned comparison: a >= b ? 0xffff : 0
    vsetgt2(a,b)    per-halfword unsigned comparison: a > b ? 1 : 0
    vcmpgt2(a,b)    per-halfword unsigned comparison: a > b ? 0xffff : 0
    vsetle2(a,b)    per-halfword unsigned comparison: a <= b ? 1 : 0
    vcmple2(a,b)    per-halfword unsigned comparison: a <= b ? 0xffff : 0
    vsetlt2(a,b)    per-halfword unsigned comparison: a < b ? 1 : 0
    vcmplt2(a,b)    per-halfword unsigned comparison: a < b ? 0xffff : 0
    vsetne2(a,b)    per-halfword unsigned comparison: a != b ? 1 : 0
    vcmpne2(a,b)    per-halfword unsigned comparison: a != b ? 0xffff : 0
    vmax2(a,b)      per-halfword unsigned maximum: max(a, b)
    vmin2(a,b)      per-halfword unsigned minimum: min(a, b)

    vadd4(a,b)      per-byte unsigned addition, with wrap-around: a + b
    vsub4(a,b)      per-byte unsigned subtraction, with wrap-around: a - b
    vabsdiff4(a,b)  per-byte unsigned absolute difference: |a - b|
    vavg4(a,b)      per-byte unsigned average: (a + b) / 2
    vavrg4(a,b)     per-byte unsigned rounded average: (a + b + 1) / 2
    vseteq4(a,b)    per-byte unsigned comparison: a == b ? 1 : 0
    vcmpeq4(a,b)    per-byte unsigned comparison: a == b ? 0xff : 0
    vsetge4(a,b)    per-byte unsigned comparison: a >= b ? 1 : 0
    vcmpge4(a,b)    per-byte unsigned comparison: a >= b ? 0xff : 0
    vsetgt4(a,b)    per-byte unsigned comparison: a > b ? 1 : 0
    vcmpgt4(a,b)    per-byte unsigned comparison: a > b ? 0xff : 0
    vsetle4(a,b)    per-byte unsigned comparison: a <= b ? 1 : 0
    vcmple4(a,b)    per-byte unsigned comparison: a <= b ? 0xff : 0
    vsetlt4(a,b)    per-byte unsigned comparison: a < b ? 1 : 0
    vcmplt4(a,b)    per-byte unsigned comparison: a < b ? 0xff : 0
    vsetne4(a,b)    per-byte unsigned comparison: a != b ? 1 : 0
    vcmpne4(a,b)    per-byte unsigned comparison: a != b ? 0xff : 0
    vmax4(a,b)      per-byte unsigned maximum: max(a, b)
    vmin4(a,b)      per-byte unsigned minimum: min(a, b)
*/
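
// Illustrative example (added for clarity, not part of the original header):
// a 32-bit word is treated as four packed uint8 lanes (vXxx4) or two packed
// uint16 lanes (vXxx2), each lane computed independently:
//
//   vadd4(0x01020304, 0x05060708) == 0x06080a0c   // four per-byte sums
//   vmax2(0x00050002, 0x00030004) == 0x00050004   // two per-halfword maxima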

namespace cv { namespace gpu { namespace device
{
    // 2

    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
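        // worked example (added comment): a = 0x0000ffff, b = 0x00000001:
        // the raw sum r = 0x00010000 has wrongly carried into the high
        // halfword; s isolates that carry (0x00010000), and r - s restores
        // the wrapped-around low-halfword result 0x00000000.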
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a - b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // borrow to high word
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
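        // numeric check of the identity (added comment): per halfword a = 6,
        // b = 3: (a & b) = 2 and (a ^ b) >> 1 = 2, giving 4 = floor(9 / 2).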
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }

    // 4

    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }

    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
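        // worked example (added comment): per byte, a == b gives r = 0x00, so
        // c = 0x80, r^c = 0x80, c - 0x01 = 0x7f, and 0x80 & ~0x7f = 0x80,
        // which >> 7 yields 1; any nonzero r clears that msb and yields 0.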
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, t;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        t = r << 8;         // convert bool
        r = t - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        t = a ^ b;          // 0x00 if a == b
        r = t | 0x80808080; // set msbs, to catch carry out
        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
        r = t & ~r;         // msb = 1, if t was 0x00
        t = r >> 7;         // build mask
        t = r - t;          // from
        r = t | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        s = r << 8;         // convert bool
        r = s - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        s = r >> 7;         // build mask
        s = r - s;          // from
        r = s | r;          // msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;          //
        s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
        r = s - r;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a & s;          // select a when a >= b
        s = b & ~s;         // select b when a < b
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned maximum
    }

    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(b, a);  // mask = 0xff if b >= a
        r = a & s;          // select a when b >= a
        s = b & ~s;         // select b when b < a
        r = r | s;          // combine byte selections
    #endif

        return r;
    }
}}}

#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__

@@ -7,7 +7,7 @@
// copy or use the software.
//
//
//  License Agreement
//  License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
@@ -2,6 +2,7 @@

using namespace std;
using namespace testing;
using namespace perf;

//////////////////////////////////////////////////////////////////////
// StereoBM
@@ -12,7 +13,7 @@ DEF_PARAM_TEST_1(ImagePair, pair_string);
PERF_TEST_P(ImagePair, Calib3D_StereoBM,
            Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
    declare.time(5.0);
    declare.time(300.0);

    const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(imgLeft.empty());

@@ -53,7 +54,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM,
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
            Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
{
    declare.time(10.0);
    declare.time(300.0);

    const cv::Mat imgLeft = readImage(GET_PARAM(0));
    ASSERT_FALSE(imgLeft.empty());

@@ -87,7 +88,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP,
            Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
{
    declare.time(10.0);
    declare.time(300.0);

    const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(imgLeft.empty());
@@ -1748,7 +1748,10 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm,
    const int normType = GET_PARAM(2);

    cv::Mat src(size, depth);
    declare.in(src, WARMUP_RNG);
    if (depth == CV_8U)
        cv::randu(src, 0, 254);
    else
        declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {

@@ -1923,7 +1926,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMax,
    const int depth = GET_PARAM(1);

    cv::Mat src(size, depth);
    declare.in(src, WARMUP_RNG);
    if (depth == CV_8U)
        cv::randu(src, 0, 254);
    else
        declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {

@@ -1958,7 +1964,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
    const int depth = GET_PARAM(1);

    cv::Mat src(size, depth);
    declare.in(src, WARMUP_RNG);
    if (depth == CV_8U)
        cv::randu(src, 0, 254);
    else
        declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
@@ -2,6 +2,7 @@

using namespace std;
using namespace testing;
using namespace perf;

#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p)

@@ -63,7 +64,7 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
            Values(21),
            Values(5)))
{
    declare.time(60.0);
    declare.time(600.0);

    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);
@@ -2,105 +2,7 @@

using namespace std;
using namespace testing;

struct KeypointIdxCompare
{
    std::vector<cv::KeyPoint>* keypoints;

    explicit KeypointIdxCompare(std::vector<cv::KeyPoint>* _keypoints) : keypoints(_keypoints) {}

    bool operator ()(size_t i1, size_t i2) const
    {
        cv::KeyPoint kp1 = (*keypoints)[i1];
        cv::KeyPoint kp2 = (*keypoints)[i2];
        if (kp1.pt.x != kp2.pt.x)
            return kp1.pt.x < kp2.pt.x;
        if (kp1.pt.y != kp2.pt.y)
            return kp1.pt.y < kp2.pt.y;
        if (kp1.response != kp2.response)
            return kp1.response < kp2.response;
        return kp1.octave < kp2.octave;
    }
};

static void sortKeyPoints(std::vector<cv::KeyPoint>& keypoints, cv::InputOutputArray _descriptors = cv::noArray())
{
    std::vector<size_t> indexies(keypoints.size());
    for (size_t i = 0; i < indexies.size(); ++i)
        indexies[i] = i;

    std::sort(indexies.begin(), indexies.end(), KeypointIdxCompare(&keypoints));

    std::vector<cv::KeyPoint> new_keypoints;
    cv::Mat new_descriptors;

    new_keypoints.resize(keypoints.size());

    cv::Mat descriptors;
    if (_descriptors.needed())
    {
        descriptors = _descriptors.getMat();
        new_descriptors.create(descriptors.size(), descriptors.type());
    }

    for (size_t i = 0; i < indexies.size(); ++i)
    {
        size_t new_idx = indexies[i];
        new_keypoints[i] = keypoints[new_idx];
        if (!new_descriptors.empty())
            descriptors.row((int) new_idx).copyTo(new_descriptors.row((int) i));
    }

    keypoints.swap(new_keypoints);
    if (_descriptors.needed())
        new_descriptors.copyTo(_descriptors);
}

//////////////////////////////////////////////////////////////////////
// SURF

DEF_PARAM_TEST_1(Image, string);

PERF_TEST_P(Image, Features2D_SURF,
            Values<string>("gpu/perf/aloe.png"))
{
    declare.time(50.0);

    const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

    if (PERF_RUN_GPU())
    {
        cv::gpu::SURF_GPU d_surf;

        const cv::gpu::GpuMat d_img(img);
        cv::gpu::GpuMat d_keypoints, d_descriptors;

        TEST_CYCLE() d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);

        std::vector<cv::KeyPoint> gpu_keypoints;
        d_surf.downloadKeypoints(d_keypoints, gpu_keypoints);

        cv::Mat gpu_descriptors(d_descriptors);

        sortKeyPoints(gpu_keypoints, gpu_descriptors);

        SANITY_CHECK_KEYPOINTS(gpu_keypoints);
        SANITY_CHECK(gpu_descriptors, 1e-3);
    }
    else
    {
        cv::SURF surf;

        std::vector<cv::KeyPoint> cpu_keypoints;
        cv::Mat cpu_descriptors;

        TEST_CYCLE() surf(img, cv::noArray(), cpu_keypoints, cpu_descriptors);

        SANITY_CHECK_KEYPOINTS(cpu_keypoints);
        SANITY_CHECK(cpu_descriptors);
    }
}
using namespace perf;

//////////////////////////////////////////////////////////////////////
// FAST

@@ -153,6 +55,8 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB,
            Combine(Values<string>("gpu/perf/aloe.png"),
                    Values(4000)))
{
    declare.time(300.0);

    const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());
@@ -2,6 +2,7 @@

using namespace std;
using namespace testing;
using namespace perf;

//////////////////////////////////////////////////////////////////////
// Blur
@@ -632,7 +632,7 @@ DEF_PARAM_TEST_1(Image, string);

PERF_TEST_P(Image, ImgProc_MeanShiftFiltering,
            Values<string>("gpu/meanshift/cones.png"))
{
    declare.time(15.0);
    declare.time(300.0);

    const cv::Mat img = readImage(GetParam());
    ASSERT_FALSE(img.empty());

@@ -668,7 +668,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftFiltering,

PERF_TEST_P(Image, ImgProc_MeanShiftProc,
            Values<string>("gpu/meanshift/cones.png"))
{
    declare.time(5.0);
    declare.time(300.0);

    const cv::Mat img = readImage(GetParam());
    ASSERT_FALSE(img.empty());

@@ -702,7 +702,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftProc,

PERF_TEST_P(Image, ImgProc_MeanShiftSegmentation,
            Values<string>("gpu/meanshift/cones.png"))
{
    declare.time(5.0);
    declare.time(300.0);

    const cv::Mat img = readImage(GetParam());
    ASSERT_FALSE(img.empty());

@@ -830,6 +830,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate8U,
                    GPU_CHANNELS_1_3_4,
                    ALL_TEMPLATE_METHODS))
{
    declare.time(300.0);

    const cv::Size size = GET_PARAM(0);
    const cv::Size templ_size = GET_PARAM(1);
    const int cn = GET_PARAM(2);

@@ -868,6 +870,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate32F,
                    GPU_CHANNELS_1_3_4,
                    Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))))
{
    declare.time(300.0);

    const cv::Size size = GET_PARAM(0);
    const cv::Size templ_size = GET_PARAM(1);
    const int cn = GET_PARAM(2);

@@ -1034,7 +1038,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerHarris,

        TEST_CYCLE() cv::gpu::cornerHarris(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, k, borderMode);

        GPU_SANITY_CHECK(dst);
        GPU_SANITY_CHECK(dst, 1e-4);
    }
    else
    {

@@ -1077,7 +1081,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerMinEigenVal,

        TEST_CYCLE() cv::gpu::cornerMinEigenVal(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, borderMode);

        GPU_SANITY_CHECK(dst);
        GPU_SANITY_CHECK(dst, 1e-4);
    }
    else
    {

@@ -1341,7 +1345,12 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer,
                    Values(CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
                           CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
                           CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
                           CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR))))
                           CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),

                           CvtColorInfo(1, 1, cv::COLOR_BayerBG2GRAY),
                           CvtColorInfo(1, 1, cv::COLOR_BayerGB2GRAY),
                           CvtColorInfo(1, 1, cv::COLOR_BayerRG2GRAY),
                           CvtColorInfo(1, 1, cv::COLOR_BayerGR2GRAY))))
{
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);

@@ -1369,6 +1378,50 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer,
    }
}

CV_ENUM(DemosaicingCode,
        cv::COLOR_BayerBG2BGR, cv::COLOR_BayerGB2BGR, cv::COLOR_BayerRG2BGR, cv::COLOR_BayerGR2BGR,
        cv::COLOR_BayerBG2GRAY, cv::COLOR_BayerGB2GRAY, cv::COLOR_BayerRG2GRAY, cv::COLOR_BayerGR2GRAY,
        cv::gpu::COLOR_BayerBG2BGR_MHT, cv::gpu::COLOR_BayerGB2BGR_MHT, cv::gpu::COLOR_BayerRG2BGR_MHT, cv::gpu::COLOR_BayerGR2BGR_MHT,
        cv::gpu::COLOR_BayerBG2GRAY_MHT, cv::gpu::COLOR_BayerGB2GRAY_MHT, cv::gpu::COLOR_BayerRG2GRAY_MHT, cv::gpu::COLOR_BayerGR2GRAY_MHT)

DEF_PARAM_TEST(Sz_Code, cv::Size, DemosaicingCode);

PERF_TEST_P(Sz_Code, ImgProc_Demosaicing,
            Combine(GPU_TYPICAL_MAT_SIZES,
                    ValuesIn(DemosaicingCode::all())))
{
    const cv::Size size = GET_PARAM(0);
    const int code = GET_PARAM(1);

    cv::Mat src(size, CV_8UC1);
    declare.in(src, WARMUP_RNG);

    if (PERF_RUN_GPU())
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;

        TEST_CYCLE() cv::gpu::demosaicing(d_src, dst, code);

        GPU_SANITY_CHECK(dst);
    }
    else
    {
        if (code >= cv::COLOR_COLORCVT_MAX)
        {
            FAIL_NO_CPU();
        }
        else
        {
            cv::Mat dst;

            TEST_CYCLE() cv::cvtColor(src, dst, code);

            CPU_SANITY_CHECK(dst);
        }
    }
}

//////////////////////////////////////////////////////////////////////
// SwapChannels

@@ -2,6 +2,7 @@

using namespace std;
using namespace testing;
using namespace perf;

DEF_PARAM_TEST_1(Image, string);

@@ -1,70 +1,5 @@
#include "perf_precomp.hpp"

static void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"), fflush(stdout);
# else
    printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"), fflush(stdout);
# endif
#elif defined linux
# if defined _LP64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"), fflush(stdout);
# else
    printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"), fflush(stdout);
# endif
#elif defined __APPLE__
# if defined _LP64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"), fflush(stdout);
# else
    printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"), fflush(stdout);
# endif
#endif

}

static void printCudaInfo()
{
    printOsInfo();
#ifndef HAVE_CUDA
    printf("[----------]\n[ GPU INFO ] \tOpenCV was built without CUDA support.\n[----------]\n"), fflush(stdout);
#else
    int driver;
    cudaDriverGetVersion(&driver);

    printf("[----------]\n"), fflush(stdout);
    printf("[ GPU INFO ] \tCUDA Driver version: %d.\n", driver), fflush(stdout);
    printf("[ GPU INFO ] \tCUDA Runtime version: %d.\n", CUDART_VERSION), fflush(stdout);
    printf("[----------]\n"), fflush(stdout);

    printf("[----------]\n"), fflush(stdout);
    printf("[ GPU INFO ] \tGPU module was compiled for the following GPU archs.\n"), fflush(stdout);
    printf("[ BIN ] \t%s.\n", CUDA_ARCH_BIN), fflush(stdout);
    printf("[ PTX ] \t%s.\n", CUDA_ARCH_PTX), fflush(stdout);
    printf("[----------]\n"), fflush(stdout);

    printf("[----------]\n"), fflush(stdout);
    int deviceCount = cv::gpu::getCudaEnabledDeviceCount();
    printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount), fflush(stdout);
    printf("[----------]\n"), fflush(stdout);

    for (int i = 0; i < deviceCount; ++i)
    {
        cv::gpu::DeviceInfo info(i);

        printf("[----------]\n"), fflush(stdout);
        printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()), fflush(stdout);
        printf("[ ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout);
        printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()), fflush(stdout);
        printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout);
        printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)), fflush(stdout);
        if (!info.isCompatible())
            printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
        printf("[----------]\n"), fflush(stdout);
    }

#endif
}
using namespace perf;

CV_PERF_TEST_MAIN(gpu, printCudaInfo())

@@ -2,6 +2,7 @@

using namespace std;
using namespace testing;
using namespace perf;

//////////////////////////////////////////////////////////////////////
// SetTo

@@ -2,6 +2,7 @@

using namespace std;
using namespace testing;
using namespace perf;

///////////////////////////////////////////////////////////////
// HOG

@@ -18,6 +19,8 @@ PERF_TEST_P(Image, ObjDetect_HOG,
                    "gpu/caltech/image_00000527_0.png",
                    "gpu/caltech/image_00000574_0.png"))
{
    declare.time(300.0);

    const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(img.empty());

@@ -19,6 +19,7 @@
#endif

#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"

#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"

@@ -26,12 +27,9 @@
#include "opencv2/calib3d.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/video.hpp"
#include "opencv2/nonfree.hpp"
#include "opencv2/legacy.hpp"
#include "opencv2/photo.hpp"

#include "utility.hpp"

#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif

@@ -4,6 +4,18 @@ using namespace std;
using namespace testing;
using namespace perf;

#if defined(HAVE_XINE) || \
    defined(HAVE_GSTREAMER) || \
    defined(HAVE_QUICKTIME) || \
    defined(HAVE_AVFOUNDATION) || \
    defined(HAVE_FFMPEG) || \
    defined(WIN32) /* assume that we have ffmpeg */

#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
#else
#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
#endif

namespace cv
{
    template<> void Ptr<CvBGStatModel>::delete_obj()

@@ -142,7 +154,7 @@ PERF_TEST_P(Image_MinDistance, Video_GoodFeaturesToTrack,

PERF_TEST_P(ImagePair, Video_BroxOpticalFlow,
            Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
{
    declare.time(10);
    declare.time(300);

    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(frame0.empty());

@@ -372,8 +384,8 @@ PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1,

        TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v);

        GPU_SANITY_CHECK(u, 1e-4);
        GPU_SANITY_CHECK(v, 1e-4);
        GPU_SANITY_CHECK(u, 1e-2);
        GPU_SANITY_CHECK(v, 1e-2);
    }
    else
    {

@@ -482,6 +494,8 @@ PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM,
//////////////////////////////////////////////////////
// FGDStatModel

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

DEF_PARAM_TEST_1(Video, string);

PERF_TEST_P(Video, Video_FGDStatModel,

@@ -548,9 +562,13 @@ PERF_TEST_P(Video, Video_FGDStatModel,
    }
}

#endif

//////////////////////////////////////////////////////
// MOG

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

DEF_PARAM_TEST(Video_Cn_LearningRate, string, MatCn, double);

PERF_TEST_P(Video_Cn_LearningRate, Video_MOG,

@@ -643,9 +661,13 @@ PERF_TEST_P(Video_Cn_LearningRate, Video_MOG,
    }
}

#endif

//////////////////////////////////////////////////////
// MOG2

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

DEF_PARAM_TEST(Video_Cn, string, int);

PERF_TEST_P(Video_Cn, Video_MOG2,

@@ -740,9 +762,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2,
    }
}

#endif

//////////////////////////////////////////////////////
// MOG2GetBackgroundImage

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage,
            Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
                    GPU_CHANNELS_1_3_4))

@@ -818,74 +844,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage,
    }
}

//////////////////////////////////////////////////////
// VIBE

PERF_TEST_P(Video_Cn, Video_VIBE,
            Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
                    GPU_CHANNELS_1_3_4))
{
    const string inputFile = perf::TestBase::getDataPath(GET_PARAM(0));
    const int cn = GET_PARAM(1);

    cv::VideoCapture cap(inputFile);
    ASSERT_TRUE(cap.isOpened());

    cv::Mat frame;
    cap >> frame;
    ASSERT_FALSE(frame.empty());

    if (cn != 3)
    {
        cv::Mat temp;
        if (cn == 1)
            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
        else
            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
        cv::swap(temp, frame);
    }

    if (PERF_RUN_GPU())
    {
        cv::gpu::GpuMat d_frame(frame);
        cv::gpu::VIBE_GPU vibe;
        cv::gpu::GpuMat foreground;

        vibe(d_frame, foreground);

        for (int i = 0; i < 10; ++i)
        {
            cap >> frame;
            ASSERT_FALSE(frame.empty());

            if (cn != 3)
            {
                cv::Mat temp;
                if (cn == 1)
                    cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
                else
                    cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
                cv::swap(temp, frame);
            }

            d_frame.upload(frame);

            startTimer(); next();
            vibe(d_frame, foreground);
            stopTimer();
        }

        GPU_SANITY_CHECK(foreground);
    }
    else
    {
        FAIL_NO_CPU();
    }
}
#endif

//////////////////////////////////////////////////////
// GMG

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

DEF_PARAM_TEST(Video_Cn_MaxFeatures, string, MatCn, int);

PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG,

@@ -993,11 +958,13 @@ PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG,
    }
}

#ifdef HAVE_NVCUVID
#endif

//////////////////////////////////////////////////////
// VideoReader

#if defined(HAVE_NVCUVID) && BUILD_WITH_VIDEO_INPUT_SUPPORT

PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
{
    declare.time(20);

@@ -1028,10 +995,12 @@ PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video
    }
}

#endif

//////////////////////////////////////////////////////
// VideoWriter

#ifdef WIN32
#if defined(HAVE_NVCUVID) && defined(WIN32)

PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"))
{

@@ -1089,6 +1058,4 @@ PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video
    SANITY_CHECK(frame);
}

#endif // WIN32

#endif // HAVE_NVCUVID
#endif

@@ -1,184 +0,0 @@
#include "perf_precomp.hpp"

using namespace std;
using namespace cv;

Mat readImage(const string& fileName, int flags)
{
    return imread(perf::TestBase::getDataPath(fileName), flags);
}

void PrintTo(const CvtColorInfo& info, ostream* os)
{
    static const char* str[] =
    {
        "BGR2BGRA",
        "BGRA2BGR",
        "BGR2RGBA",
        "RGBA2BGR",
        "BGR2RGB",
        "BGRA2RGBA",

        "BGR2GRAY",
        "RGB2GRAY",
        "GRAY2BGR",
        "GRAY2BGRA",
        "BGRA2GRAY",
        "RGBA2GRAY",

        "BGR2BGR565",
        "RGB2BGR565",
        "BGR5652BGR",
        "BGR5652RGB",
        "BGRA2BGR565",
        "RGBA2BGR565",
        "BGR5652BGRA",
        "BGR5652RGBA",

        "GRAY2BGR565",
        "BGR5652GRAY",

        "BGR2BGR555",
        "RGB2BGR555",
        "BGR5552BGR",
        "BGR5552RGB",
        "BGRA2BGR555",
        "RGBA2BGR555",
        "BGR5552BGRA",
        "BGR5552RGBA",

        "GRAY2BGR555",
        "BGR5552GRAY",

        "BGR2XYZ",
        "RGB2XYZ",
        "XYZ2BGR",
        "XYZ2RGB",

        "BGR2YCrCb",
        "RGB2YCrCb",
        "YCrCb2BGR",
        "YCrCb2RGB",

        "BGR2HSV",
        "RGB2HSV",

        "",
        "",

        "BGR2Lab",
        "RGB2Lab",

        "BayerBG2BGR",
        "BayerGB2BGR",
        "BayerRG2BGR",
        "BayerGR2BGR",

        "BGR2Luv",
        "RGB2Luv",

        "BGR2HLS",
        "RGB2HLS",

        "HSV2BGR",
        "HSV2RGB",

        "Lab2BGR",
        "Lab2RGB",
        "Luv2BGR",
        "Luv2RGB",

        "HLS2BGR",
        "HLS2RGB",

        "BayerBG2BGR_VNG",
        "BayerGB2BGR_VNG",
        "BayerRG2BGR_VNG",
        "BayerGR2BGR_VNG",

        "BGR2HSV_FULL",
        "RGB2HSV_FULL",
        "BGR2HLS_FULL",
        "RGB2HLS_FULL",

        "HSV2BGR_FULL",
        "HSV2RGB_FULL",
        "HLS2BGR_FULL",
        "HLS2RGB_FULL",

        "LBGR2Lab",
        "LRGB2Lab",
        "LBGR2Luv",
        "LRGB2Luv",

        "Lab2LBGR",
        "Lab2LRGB",
        "Luv2LBGR",
        "Luv2LRGB",

        "BGR2YUV",
        "RGB2YUV",
        "YUV2BGR",
        "YUV2RGB",

        "BayerBG2GRAY",
        "BayerGB2GRAY",
        "BayerRG2GRAY",
        "BayerGR2GRAY",

        //YUV 4:2:0 formats family
        "YUV2RGB_NV12",
        "YUV2BGR_NV12",
        "YUV2RGB_NV21",
        "YUV2BGR_NV21",

        "YUV2RGBA_NV12",
        "YUV2BGRA_NV12",
        "YUV2RGBA_NV21",
        "YUV2BGRA_NV21",

        "YUV2RGB_YV12",
        "YUV2BGR_YV12",
        "YUV2RGB_IYUV",
        "YUV2BGR_IYUV",

        "YUV2RGBA_YV12",
        "YUV2BGRA_YV12",
        "YUV2RGBA_IYUV",
        "YUV2BGRA_IYUV",

        "YUV2GRAY_420",

        //YUV 4:2:2 formats family
        "YUV2RGB_UYVY",
        "YUV2BGR_UYVY",
        "YUV2RGB_VYUY",
        "YUV2BGR_VYUY",

        "YUV2RGBA_UYVY",
        "YUV2BGRA_UYVY",
        "YUV2RGBA_VYUY",
        "YUV2BGRA_VYUY",

        "YUV2RGB_YUY2",
        "YUV2BGR_YUY2",
        "YUV2RGB_YVYU",
        "YUV2BGR_YVYU",

        "YUV2RGBA_YUY2",
        "YUV2BGRA_YUY2",
        "YUV2RGBA_YVYU",
        "YUV2BGRA_YVYU",

        "YUV2GRAY_UYVY",
        "YUV2GRAY_YUY2",

        // alpha premultiplication
        "RGBA2mRGBA",
        "mRGBA2RGBA",

        "COLORCVT_MAX"
    };

    *os << str[info.code];
}
@@ -1,63 +0,0 @@
#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
#define __OPENCV_PERF_GPU_UTILITY_HPP__

#include "opencv2/core.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/ts/ts_perf.hpp"

cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);

using perf::MatType;
using perf::MatDepth;

CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all())

CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())

CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX)

enum { Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4 };
CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA)
#define GPU_CHANNELS_1_3_4 testing::Values(MatCn(Gray), MatCn(BGR), MatCn(BGRA))
#define GPU_CHANNELS_1_3 testing::Values(MatCn(Gray), MatCn(BGR))

struct CvtColorInfo
{
    int scn;
    int dcn;
    int code;

    CvtColorInfo() {}
    explicit CvtColorInfo(int scn_, int dcn_, int code_) : scn(scn_), dcn(dcn_), code(code_) {}
};
void PrintTo(const CvtColorInfo& info, std::ostream* os);

#define GET_PARAM(k) std::tr1::get< k >(GetParam())

#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name

DEF_PARAM_TEST_1(Sz, cv::Size);
typedef perf::Size_MatType Sz_Type;
DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth);
DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn);

#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p)

#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"

#define GPU_SANITY_CHECK(mat, ...) \
    do{ \
        cv::Mat gpu_##mat(mat); \
        SANITY_CHECK(gpu_##mat, ## __VA_ARGS__); \
    } while(0)

#define CPU_SANITY_CHECK(mat, ...) \
    do{ \
        cv::Mat cpu_##mat(mat); \
        SANITY_CHECK(cpu_##mat, ## __VA_ARGS__); \
    } while(0)

#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
@@ -8,69 +8,19 @@
#include "opencv2/video.hpp"
#include "opencv2/legacy.hpp"
#include "opencv2/ts.hpp"

static void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout);
# else
    printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout);
# endif
#elif defined linux
# if defined _LP64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout);
# else
    printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout);
# endif
#elif defined __APPLE__
# if defined _LP64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout);
# else
    printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout);
# endif
#endif
}

static void printCudaInfo()
{
    const int deviceCount = cv::gpu::getCudaEnabledDeviceCount();

    printf("[----------]\n"); fflush(stdout);
    printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout);
    printf("[----------]\n"); fflush(stdout);

    for (int i = 0; i < deviceCount; ++i)
    {
        cv::gpu::DeviceInfo info(i);

        printf("[----------]\n"); fflush(stdout);
        printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout);
        printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout);
        printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout);
        printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout);
        printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout);
        if (!info.isCompatible())
            printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
        printf("[----------]\n"); fflush(stdout);
    }
}
#include "opencv2/ts/gpu_perf.hpp"

int main(int argc, char* argv[])
{
    printOsInfo();
    printCudaInfo();
    perf::printCudaInfo();

    perf::Regression::Init("nv_perf_test");
    perf::Regression::Init("gpu_perf4au");
    perf::TestBase::Init(argc, argv);
    testing::InitGoogleTest(&argc, argv);

    return RUN_ALL_TESTS();
}

#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name

//////////////////////////////////////////////////////////
// HoughLinesP

@@ -318,40 +318,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)

void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
    class LevelsInit
    {
    public:
        Npp32s pLevels[256];
        const Npp32s* pLevels3[3];
        int nValues3[3];
    const int cn = src.channels();

#if (CUDA_VERSION > 4020)
        GpuMat d_pLevels;
#endif
    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
    CV_Assert( lut.depth() == CV_8U );
    CV_Assert( lut.channels() == 1 || lut.channels() == cn );
    CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );

        LevelsInit()
        {
            nValues3[0] = nValues3[1] = nValues3[2] = 256;
            for (int i = 0; i < 256; ++i)
                pLevels[i] = i;


#if (CUDA_VERSION <= 4020)
            pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
            d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
            pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
        }
    };
    static LevelsInit lvls;

    int cn = src.channels();

    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);
    CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous());

    dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));
    dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));

    NppiSize sz;
    sz.height = src.rows;

@@ -360,19 +334,34 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
    Mat nppLut;
    lut.convertTo(nppLut, CV_32S);

    cudaStream_t stream = StreamAccessor::getStream(s);
    int nValues3[] = {256, 256, 256};

    Npp32s pLevels[256];
    for (int i = 0; i < 256; ++i)
        pLevels[i] = i;

    const Npp32s* pLevels3[3];

#if (CUDA_VERSION <= 4020)
    pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
    GpuMat d_pLevels;
    d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
    pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif

    cudaStream_t stream = StreamAccessor::getStream(s);
    NppStreamHandler h(stream);

    if (src.type() == CV_8UC1)
    {
#if (CUDA_VERSION <= 4020)
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
#else
        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) );
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif
    }
    else

@@ -409,7 +398,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
        }

        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
    }

    if (stream == 0)

@@ -1,137 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

#if !defined HAVE_CUDA || defined(CUDA_DISABLER)

cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::initialize(const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::release() {}

#else

namespace cv { namespace gpu { namespace device
{
    namespace vibe
    {
        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);

        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);

        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
    }
}}}

namespace
{
    const int defaultNbSamples = 20;
    const int defaultReqMatches = 2;
    const int defaultRadius = 20;
    const int defaultSubsamplingFactor = 16;
}

cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed) :
    frameSize_(0, 0), rngSeed_(rngSeed)
{
    nbSamples = defaultNbSamples;
    reqMatches = defaultReqMatches;
    radius = defaultRadius;
    subsamplingFactor = defaultSubsamplingFactor;
}

void cv::gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& s)
{
    using namespace cv::gpu::device::vibe;

    CV_Assert(firstFrame.type() == CV_8UC1 || firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);

    cudaStream_t stream = StreamAccessor::getStream(s);

    loadConstants(nbSamples, reqMatches, radius, subsamplingFactor);

    frameSize_ = firstFrame.size();

    if (randStates_.size() != frameSize_)
    {
        cv::RNG rng(rngSeed_);
        cv::Mat h_randStates(frameSize_, CV_8UC4);
        rng.fill(h_randStates, cv::RNG::UNIFORM, 0, 255);
        randStates_.upload(h_randStates);
    }

    int ch = firstFrame.channels();
    int sample_ch = ch == 1 ? 1 : 4;

    samples_.create(nbSamples * frameSize_.height, frameSize_.width, CV_8UC(sample_ch));

    init_gpu(firstFrame, ch, samples_, randStates_, stream);
}

void cv::gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& s)
{
    using namespace cv::gpu::device::vibe;

    CV_Assert(frame.depth() == CV_8U);

    int ch = frame.channels();
    int sample_ch = ch == 1 ? 1 : 4;

    if (frame.size() != frameSize_ || sample_ch != samples_.channels())
        initialize(frame);

    fgmask.create(frameSize_, CV_8UC1);

    update_gpu(frame, ch, fgmask, samples_, randStates_, StreamAccessor::getStream(s));
}

void cv::gpu::VIBE_GPU::release()
{
    frameSize_ = Size(0, 0);

    randStates_.release();

    samples_.release();
}

#endif
@@ -48,6 +48,7 @@ using namespace cv::gpu;

#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); }
void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }

@@ -62,6 +63,9 @@ namespace cv { namespace gpu {
        void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
        template <int cn>
        void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);

        template <int cn>
        void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
    }
}}

@@ -1620,26 +1624,56 @@ namespace

        funcs[src.depth()][dcn - 1](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
    }

    void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        bayer_to_bgr(src, dst, dcn, false, false, stream);
    }

    void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        bayer_to_bgr(src, dst, dcn, false, true, stream);
    }

    void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        bayer_to_bgr(src, dst, dcn, true, false, stream);
    }

    void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
    {
        bayer_to_bgr(src, dst, dcn, true, true, stream);
    }

    void bayer_to_gray(const GpuMat& src, GpuMat& dst, bool blue_last, bool start_with_green, Stream& stream)
    {
        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
        static const func_t funcs[3] =
        {
            Bayer2BGR_8u_gpu<1>,
            0,
            Bayer2BGR_16u_gpu<1>,
        };

        CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1);
        CV_Assert(src.rows > 2 && src.cols > 2);

        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));

        funcs[src.depth()](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream));
    }
    void bayerBG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
    {
        bayer_to_gray(src, dst, false, false, stream);
    }
    void bayerGB_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
    {
        bayer_to_gray(src, dst, false, true, stream);
    }
    void bayerRG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
    {
        bayer_to_gray(src, dst, true, false, stream);
    }
    void bayerGR_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream)
    {
        bayer_to_gray(src, dst, true, true, stream);
    }
}

void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)

@@ -1756,10 +1790,10 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
        yuv_to_bgr,          // CV_YUV2BGR = 84
        yuv_to_rgb,          // CV_YUV2RGB = 85

        0,                   // CV_BayerBG2GRAY = 86
        0,                   // CV_BayerGB2GRAY = 87
        0,                   // CV_BayerRG2GRAY = 88
        0,                   // CV_BayerGR2GRAY = 89
        bayerBG_to_gray,     // CV_BayerBG2GRAY = 86
        bayerGB_to_gray,     // CV_BayerGB2GRAY = 87
        bayerRG_to_gray,     // CV_BayerRG2GRAY = 88
        bayerGR_to_gray,     // CV_BayerGR2GRAY = 89

        //YUV 4:2:0 formats family
        0,                   // CV_YUV2RGB_NV12 = 90,

@@ -1825,6 +1859,74 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
    func(src, dst, dcn, stream);
}

void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
{
    const int depth = src.depth();

    CV_Assert( src.channels() == 1 );

    switch (code)
    {
    case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
        bayer_to_gray(src, dst, code == CV_BayerBG2GRAY || code == CV_BayerGB2GRAY, code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY, stream);
        break;

    case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
        bayer_to_bgr(src, dst, dcn, code == CV_BayerBG2BGR || code == CV_BayerGB2BGR, code == CV_BayerGB2BGR || code == CV_BayerGR2BGR, stream);
        break;

    case COLOR_BayerBG2BGR_MHT: case COLOR_BayerGB2BGR_MHT: case COLOR_BayerRG2BGR_MHT: case COLOR_BayerGR2BGR_MHT:
    {
        if (dcn <= 0)
            dcn = 3;

        CV_Assert( depth == CV_8U );
        CV_Assert( dcn == 3 || dcn == 4 );

        dst.create(src.size(), CV_MAKETYPE(depth, dcn));
        dst.setTo(Scalar::all(0));

        Size wholeSize;
        Point ofs;
        src.locateROI(wholeSize, ofs);
        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);

        const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
                                        code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);

        if (dcn == 3)
            device::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
        else
            device::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));

        break;
    }

    case COLOR_BayerBG2GRAY_MHT: case COLOR_BayerGB2GRAY_MHT: case COLOR_BayerRG2GRAY_MHT: case COLOR_BayerGR2GRAY_MHT:
    {
        CV_Assert( depth == CV_8U );

        dst.create(src.size(), CV_MAKETYPE(depth, 1));
        dst.setTo(Scalar::all(0));

        Size wholeSize;
        Point ofs;
        src.locateROI(wholeSize, ofs);
        PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step);

        const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
                                        code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);

        device::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));

        break;
    }

    default:
        CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" );
    }
}

void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
{
    CV_Assert(image.type() == CV_8UC4);

@@ -1,258 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace vibe
    {
        __constant__ int c_nbSamples;
        __constant__ int c_reqMatches;
        __constant__ int c_radius;
        __constant__ int c_subsamplingFactor;

        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_nbSamples, &nbSamples, sizeof(int)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_reqMatches, &reqMatches, sizeof(int)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_radius, &radius, sizeof(int)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_subsamplingFactor, &subsamplingFactor, sizeof(int)) );
        }

        __device__ __forceinline__ uint nextRand(uint& state)
        {
            const unsigned int CV_RNG_COEFF = 4164903690U;
            state = state * CV_RNG_COEFF + (state >> 16);
            return state;
        }

        __constant__ int c_xoff[9] = {-1, 0, 1, -1, 1, -1, 0, 1, 0};
        __constant__ int c_yoff[9] = {-1, -1, -1, 0, 0, 1, 1, 1, 0};

        __device__ __forceinline__ int2 chooseRandomNeighbor(int x, int y, uint& randState, int count = 8)
        {
            int idx = nextRand(randState) % count;

            return make_int2(x + c_xoff[idx], y + c_yoff[idx]);
        }

        __device__ __forceinline__ uchar cvt(uchar val)
        {
            return val;
        }
        __device__ __forceinline__ uchar4 cvt(const uchar3& val)
        {
            return make_uchar4(val.x, val.y, val.z, 0);
        }
        __device__ __forceinline__ uchar4 cvt(const uchar4& val)
        {
            return val;
        }

        template <typename SrcT, typename SampleT>
        __global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= frame.cols || y >= frame.rows)
                return;

            uint localState = randStates(y, x);

            for (int k = 0; k < c_nbSamples; ++k)
            {
                int2 np = chooseRandomNeighbor(x, y, localState, 9);

                np.x = ::max(0, ::min(np.x, frame.cols - 1));
                np.y = ::max(0, ::min(np.y, frame.rows - 1));

                SrcT pix = frame(np.y, np.x);

                samples(k * frame.rows + y, x) = cvt(pix);
            }

            randStates(y, x) = localState;
        }

        template <typename SrcT, typename SampleT>
        void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );

            init<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
            static const func_t funcs[] =
            {
                0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
            };

            funcs[cn](frame, samples, randStates, stream);
        }

        __device__ __forceinline__ int calcDist(uchar a, uchar b)
        {
            return ::abs(a - b);
        }
        __device__ __forceinline__ int calcDist(const uchar3& a, const uchar4& b)
        {
            return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
        }
        __device__ __forceinline__ int calcDist(const uchar4& a, const uchar4& b)
        {
            return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
        }

        template <typename SrcT, typename SampleT>
        __global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= frame.cols || y >= frame.rows)
                return;

            uint localState = randStates(y, x);

            SrcT imgPix = frame(y, x);

            // comparison with the model

            int count = 0;
            for (int k = 0; (count < c_reqMatches) && (k < c_nbSamples); ++k)
            {
                SampleT samplePix = samples(k * frame.rows + y, x);

                int distance = calcDist(imgPix, samplePix);

                if (distance < c_radius)
                    ++count;
            }

            // pixel classification according to reqMatches

            fgmask(y, x) = (uchar) (-(count < c_reqMatches));

            if (count >= c_reqMatches)
            {
                // the pixel belongs to the background

                // gets a random number between 0 and subsamplingFactor-1
                int randomNumber = nextRand(localState) % c_subsamplingFactor;

                // update of the current pixel model
                if (randomNumber == 0)
                {
                    // random subsampling

                    int k = nextRand(localState) % c_nbSamples;

                    samples(k * frame.rows + y, x) = cvt(imgPix);
                }

                // update of a neighboring pixel model
                randomNumber = nextRand(localState) % c_subsamplingFactor;

                if (randomNumber == 0)
                {
                    // random subsampling

                    // chooses a neighboring pixel randomly
                    int2 np = chooseRandomNeighbor(x, y, localState);

                    np.x = ::max(0, ::min(np.x, frame.cols - 1));
                    np.y = ::max(0, ::min(np.y, frame.rows - 1));

                    // chooses the value to be replaced randomly
                    int k = nextRand(localState) % c_nbSamples;

                    samples(k * frame.rows + np.y, np.x) = cvt(imgPix);
                }
            }

            randStates(y, x) = localState;
        }

        template <typename SrcT, typename SampleT>
        void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );

            update<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
            static const func_t funcs[] =
            {
                0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
            };

            funcs[cn](frame, fgmask, samples, randStates, stream);
        }
    }
}}}


#endif /* CUDA_DISABLER */
@@ -42,42 +42,38 @@
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include <opencv2/gpu/device/common.hpp>
|
||||
#include <opencv2/gpu/device/vec_traits.hpp>
|
||||
#include <opencv2/gpu/device/vec_math.hpp>
|
||||
#include <opencv2/gpu/device/limits.hpp>
|
||||
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/vec_traits.hpp"
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/color.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
|
||||
namespace cv { namespace gpu {
|
||||
namespace device
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> struct Bayer2BGR;
|
||||
|
||||
template <> struct Bayer2BGR<uchar>
|
||||
{
|
||||
template <typename D>
|
||||
__global__ void Bayer2BGR_8u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
|
||||
uchar3 res0;
|
||||
uchar3 res1;
|
||||
uchar3 res2;
|
||||
uchar3 res3;
|
||||
|
||||
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
|
||||
{
|
||||
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (s_y >= dst.rows || (s_x << 2) >= dst.cols)
|
||||
return;
|
||||
|
||||
s_y = ::min(::max(s_y, 1), dst.rows - 2);
|
||||
|
||||
uchar4 patch[3][3];
|
||||
patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
|
||||
patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
|
||||
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
|
||||
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
|
||||
|
||||
patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
|
||||
patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
|
||||
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
|
||||
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
|
||||
|
||||
patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
|
||||
patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
|
||||
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)];
|
||||
|
||||
D res0 = VecTraits<D>::all(numeric_limits<uchar>::max());
|
||||
D res1 = VecTraits<D>::all(numeric_limits<uchar>::max());
|
||||
D res2 = VecTraits<D>::all(numeric_limits<uchar>::max());
|
||||
D res3 = VecTraits<D>::all(numeric_limits<uchar>::max());
|
||||
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
|
||||
|
||||
if ((s_y & 1) ^ start_with_green)
|
||||
{
|
||||
@@ -181,45 +177,69 @@ namespace cv { namespace gpu {
|
||||
res3.z = t7;
|
||||
}
|
||||
}
|
||||
|
||||
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
|
||||
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
dst(d_y, d_x) = res0;
|
||||
if (d_x + 1 < dst.cols)
|
||||
dst(d_y, d_x + 1) = res1;
|
||||
if (d_x + 2 < dst.cols)
|
||||
dst(d_y, d_x + 2) = res2;
|
||||
if (d_x + 3 < dst.cols)
|
||||
dst(d_y, d_x + 3) = res3;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename D>
|
||||
__global__ void Bayer2BGR_16u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
|
||||
template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
|
||||
template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
|
||||
{
|
||||
typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
|
||||
return f(pix);
|
||||
}
|
||||
template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
|
||||
{
|
||||
return pix;
|
||||
}
|
||||
template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
|
||||
{
|
||||
return make_uchar4(pix.x, pix.y, pix.z, 255);
|
||||
}
|
||||
|
||||
template <typename D>
|
||||
__global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
|
||||
{
|
||||
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (s_y >= src.rows || (s_x << 2) >= src.cols)
|
||||
return;
|
||||
|
||||
s_y = ::min(::max(s_y, 1), src.rows - 2);
|
||||
|
||||
Bayer2BGR<uchar> bayer;
|
||||
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
|
||||
|
||||
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
|
||||
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
dst(d_y, d_x) = toDst<D>(bayer.res0);
|
||||
if (d_x + 1 < src.cols)
|
||||
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
|
||||
if (d_x + 2 < src.cols)
|
||||
dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
|
||||
if (d_x + 3 < src.cols)
|
||||
dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
|
||||
}
|
||||
|
||||
template <> struct Bayer2BGR<ushort>
|
||||
{
|
||||
ushort3 res0;
|
||||
ushort3 res1;
|
||||
|
||||
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
|
||||
{
|
||||
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (s_y >= dst.rows || (s_x << 1) >= dst.cols)
|
||||
return;
|
||||
|
||||
s_y = ::min(::max(s_y, 1), dst.rows - 2);
|
||||
|
||||
ushort2 patch[3][3];
|
||||
patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
|
||||
patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
|
||||
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
|
||||
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
|
||||
|
||||
patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
|
||||
patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
|
||||
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
|
||||
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
|
||||
|
||||
patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
|
||||
patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
|
||||
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)];
|
||||
|
||||
D res0 = VecTraits<D>::all(numeric_limits<ushort>::max());
|
||||
D res1 = VecTraits<D>::all(numeric_limits<ushort>::max());
|
||||
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
|
||||
|
||||
if ((s_y & 1) ^ start_with_green)
|
||||
{
|
||||
@@ -279,53 +299,246 @@ namespace cv { namespace gpu {
            res1.z = t3;
        }
    }
};

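// [Editor's note, not part of the original commit] The 16-bit path reads
// ushort2 pairs, so each thread demosaics two horizontally adjacent pixels.
// The ::max/::min clamps replicate the border columns instead of reading out
// of bounds; for the leftmost word, for instance:
//
//     // s_x == 0: the "left" neighbour degenerates to the word itself
//     patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(0 - 1, 0)];
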
template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
{
    typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
    return f(pix);
}
template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
{
    return pix;
}
template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
{
    return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
}

template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
    const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
    int s_y = blockIdx.y * blockDim.y + threadIdx.y;

    if (s_y >= src.rows || (s_x << 1) >= src.cols)
        return;

    s_y = ::min(::max(s_y, 1), src.rows - 2);

    Bayer2BGR<ushort> bayer;
    bayer.apply(src, s_x, s_y, blue_last, start_with_green);

    const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
    const int d_y = blockIdx.y * blockDim.y + threadIdx.y;

    dst(d_y, d_x) = toDst<D>(bayer.res0);
    if (d_x + 1 < src.cols)
        dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
}

template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
    typedef typename TypeVec<uchar, cn>::vec_type dst_t;

    const dim3 block(32, 8);
    const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));

    cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );

    Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
    typedef typename TypeVec<ushort, cn>::vec_type dst_t;

    const dim3 block(32, 8);
    const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));

    cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );

    Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);

template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
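
// [Editor's sketch, not part of the original commit] Minimal host-side use
// of these wrappers, assuming d_src holds a CV_8UC1 Bayer mosaic and d_dst a
// CV_8UC3 image of the same size (both device buffers are hypothetical):
//
//     PtrStepSzb src = d_src;
//     PtrStepSzb dst = d_dst;
//     // the two flags select which of the four Bayer patterns the source uses
//     Bayer2BGR_8u_gpu<3>(src, dst, true, false, 0);
//
// Passing stream 0 makes the wrapper synchronize before returning.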

//////////////////////////////////////////////////////////////
// Bayer Demosaicing (Malvar, He, and Cutler)
//
// by Morgan McGuire, Williams College
// http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
//
// ported to CUDA
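//
// [Editor's note, not part of the original commit] The constants below are
// 5x5 linear filters with eighth-integer weights. As a worked example, the
// green estimate at a red or blue center pixel C is the "cross" pattern
// accumulated into PATTERN.x:
//
//     G(C) = ( 4*C
//              + 2*(B0 + B1 + F0 + F1)    // axial neighbours at distance 1
//              - 1*(A0 + A1 + E0 + E1) )  // axial neighbours at distance 2
//            / 8
//
// which matches kCx = 4/8, kBx = kFx = 2/8 and kAx = kEx = -1/8 below.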

texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);

template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
{
    const float   kAx = -1.0f / 8.0f,     kAy = -1.5f / 8.0f,     kAz =  0.5f / 8.0f    /*kAw = -1.0f / 8.0f*/;
    const float   kBx =  2.0f / 8.0f,   /*kBy =  0.0f / 8.0f,*/ /*kBz =  0.0f / 8.0f,*/  kBw =  4.0f / 8.0f ;
    const float   kCx =  4.0f / 8.0f,     kCy =  6.0f / 8.0f,     kCz =  5.0f / 8.0f    /*kCw =  5.0f / 8.0f*/;
    const float /*kDx =  0.0f / 8.0f,*/   kDy =  2.0f / 8.0f,     kDz = -1.0f / 8.0f    /*kDw = -1.0f / 8.0f*/;
    const float   kEx = -1.0f / 8.0f,     kEy = -1.5f / 8.0f,   /*kEz = -1.0f / 8.0f,*/  kEw =  0.5f / 8.0f ;
    const float   kFx =  2.0f / 8.0f,   /*kFy =  0.0f / 8.0f,*/   kFz =  4.0f / 8.0f    /*kFw =  0.0f / 8.0f*/;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
        return;

    int2 center;
    center.x = x + sourceOffset.x;
    center.y = y + sourceOffset.y;

    int4 xCoord;
    xCoord.x = center.x - 2;
    xCoord.y = center.x - 1;
    xCoord.z = center.x + 1;
    xCoord.w = center.x + 2;

    int4 yCoord;
    yCoord.x = center.y - 2;
    yCoord.y = center.y - 1;
    yCoord.z = center.y + 1;
    yCoord.w = center.y + 2;

    float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)

    float4 Dvec;
    Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
    Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
    Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
    Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)

    float4 value;
    value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
    value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
    value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
    value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0

    // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
    value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
    value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
    value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
    value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1

    float4 PATTERN;
    PATTERN.x = kCx * C;
    PATTERN.y = kCy * C;
    PATTERN.z = kCz * C;
    PATTERN.w = PATTERN.z;

    float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;

    // There are five filter patterns (identity, cross, checker,
    // theta, phi). Precompute the terms from all of them and then
    // use swizzles to assign to color channels.
    //
    // Channel   Matches
    //   x       cross   (e.g., EE G)
    //   y       checker (e.g., EE B)
    //   z       theta   (e.g., EO R)
    //   w       phi     (e.g., EO B)

    #define A value.x  // A0 + A1
    #define B value.y  // B0 + B1
    #define E value.z  // E0 + E1
    #define F value.w  // F0 + F1

    float3 temp;

    // PATTERN.yzw += (kD.yz * D).xyy;
    temp.x = kDy * D;
    temp.y = kDz * D;
    PATTERN.y += temp.x;
    PATTERN.z += temp.y;
    PATTERN.w += temp.y;

    // PATTERN += (kA.xyz * A).xyzx;
    temp.x = kAx * A;
    temp.y = kAy * A;
    temp.z = kAz * A;
    PATTERN.x += temp.x;
    PATTERN.y += temp.y;
    PATTERN.z += temp.z;
    PATTERN.w += temp.x;

    // PATTERN += (kE.xyw * E).xyxz;
    temp.x = kEx * E;
    temp.y = kEy * E;
    temp.z = kEw * E;
    PATTERN.x += temp.x;
    PATTERN.y += temp.y;
    PATTERN.z += temp.x;
    PATTERN.w += temp.z;

    // PATTERN.xw += kB.xw * B;
    PATTERN.x += kBx * B;
    PATTERN.w += kBw * B;

    // PATTERN.xz += kF.xz * F;
    PATTERN.x += kFx * F;
    PATTERN.z += kFz * F;

    // Determine which of four types of pixels we are on.
    int2 alternate;
    alternate.x = (x + firstRed.x) % 2;
    alternate.y = (y + firstRed.y) % 2;

    // in BGR sequence
    uchar3 pixelColor =
        (alternate.y == 0) ?
            ((alternate.x == 0) ?
                make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
                make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
            ((alternate.x == 0) ?
                make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
                make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));

    dst(y, x) = toDst<DstType>(pixelColor);
}

template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
{
    typedef typename TypeVec<uchar, cn>::vec_type dst_t;

    const dim3 block(32, 8);
    const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

    bindTexture(&sourceTex, src);

    MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
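
// [Editor's sketch, not part of the original commit] Host-side use, assuming
// d_bayer is a CV_8UC1 device mosaic and d_bgr a CV_8UC3 destination of the
// same size (both names are hypothetical):
//
//     const int2 firstRed = make_int2(0, 0);  // location of the first red sample
//     const int2 offset   = make_int2(0, 0);  // no ROI offset into the source
//     MHCdemosaic<3>(d_bayer, offset, d_bgr, firstRed, 0);
//
// Note the kernel skips the one-pixel image border, so callers that need the
// full frame must fill that border separately.
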
}}}

#endif /* CUDA_DISABLER */

@@ -48,6 +48,7 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/simd_functions.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
@@ -154,170 +155,28 @@ namespace arithm

namespace arithm
{
    struct VAdd4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vadd4(a, b);
        }

        __device__ __forceinline__ VAdd4() {}
        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
    };

    ////////////////////////////////////

    struct VAdd2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vadd2(a, b);
        }

        __device__ __forceinline__ VAdd2() {}
        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
    };

    ////////////////////////////////////
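
    // [Editor's note, not part of the original commit] vadd4/vadd2 come from
    // opencv2/gpu/device/simd_functions.hpp and add the byte (or halfword)
    // lanes of one 32-bit register independently, standing in for the
    // per-architecture inline PTX that used to live here. For example:
    //
    //     uint a = 0x01022030u;
    //     uint b = 0x01010101u;
    //     uint r = vadd4(a, b);   // 0x02032131u: four independent byte adds
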
@@ -336,13 +195,13 @@ namespace arithm

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
@@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device

namespace arithm
{
    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
    }

    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
    }
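
    // [Editor's note, not part of the original commit] The _v4/_v2 entry
    // points view the matrices as packed 32-bit words, so a caller is
    // expected to pick them only when the row data is 4-byte aligned and the
    // width in bytes divides evenly, along the lines of this hypothetical
    // dispatch:
    //
    //     if (widthInBytes % 4 == 0)
    //         addMat_v4((PtrStepSz<uint>) src1, (PtrStepSz<uint>) src2,
    //                   (PtrStepSz<uint>) dst, stream);   // 4 uchars per word
    //     else
    //         addMat<uchar, uchar>(src1, src2, dst, PtrStepb(), stream);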

    template <typename T, typename D>
    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
    {
@@ -543,170 +390,28 @@ namespace arithm

namespace arithm
{
    struct VSub4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vsub4(a, b);
        }

        __device__ __forceinline__ VSub4() {}
        __device__ __forceinline__ VSub4(const VSub4& other) {}
    };

    ////////////////////////////////////

    struct VSub2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vsub2(a, b);
        }

        __device__ __forceinline__ VSub2() {}
        __device__ __forceinline__ VSub2(const VSub2& other) {}
    };

    ////////////////////////////////////
@@ -725,13 +430,13 @@ namespace arithm

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

    template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
@@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device

namespace arithm
{
    void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
    }

    void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
    }

    template <typename T, typename D>
    void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
    {
@@ -1496,90 +1189,28 @@ namespace arithm

namespace arithm
{
    struct VAbsDiff4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vabsdiff4(a, b);
        }

        __device__ __forceinline__ VAbsDiff4() {}
        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
    };

    ////////////////////////////////////

    struct VAbsDiff2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vabsdiff2(a, b);
        }

        __device__ __forceinline__ VAbsDiff2() {}
        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
    };

    ////////////////////////////////////
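
    // [Editor's note, not part of the original commit] vabsdiff4 computes
    // |a_i - b_i| independently for each byte lane, e.g.:
    //
    //     uint a = 0x10FF0005u;
    //     uint b = 0x200000FFu;
    //     uint r = vabsdiff4(a, b);   // 0x10FF00FAu
    //
    // which is exactly what absdiff needs per pixel without unpacking the
    // four channels to 32 bits.
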
@@ -1611,13 +1242,13 @@ namespace arithm

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
@@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device

namespace arithm
{
    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
    }

    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
    }

    template <typename T>
    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
    {
@@ -1877,6 +1500,49 @@ namespace arithm

namespace arithm
{
    struct VCmpEq4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vcmpeq4(a, b);
        }

        __device__ __forceinline__ VCmpEq4() {}
        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
    };
    struct VCmpNe4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vcmpne4(a, b);
        }

        __device__ __forceinline__ VCmpNe4() {}
        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
    };
    struct VCmpLt4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vcmplt4(a, b);
        }

        __device__ __forceinline__ VCmpLt4() {}
        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
    };
    struct VCmpLe4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vcmple4(a, b);
        }

        __device__ __forceinline__ VCmpLe4() {}
        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
    };
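
    // [Editor's note, not part of the original commit] The vcmp*4 primitives
    // return a full byte mask per lane: 0xFF where the predicate holds, 0x00
    // where it does not, e.g.:
    //
    //     uint r = vcmpeq4(0x11223344u, 0x11003340u);   // 0xFF00FF00u
    //
    // so the packed result can be stored directly as four CV_8U comparison
    // outputs with no further conversion.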

    ////////////////////////////////////

    template <class Op, typename T>
    struct Cmp : binary_function<T, T, uchar>
    {
@@ -1890,6 +1556,21 @@ namespace arithm

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
    template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
    template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
    template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

    template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
    {
    };
@@ -1897,6 +1578,23 @@ namespace cv { namespace gpu { namespace device

namespace arithm
{
    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
    }
    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
    }
    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
    }
    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
    }

    template <template <typename> class Op, typename T>
    void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
    {
@@ -2303,44 +2001,11 @@ namespace arithm

namespace arithm
{
    struct VMin4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmin4(a, b);
        }

        __device__ __forceinline__ VMin4() {}
@@ -2349,40 +2014,11 @@ namespace arithm

    ////////////////////////////////////

    struct VMin2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmin2(a, b);
        }

        __device__ __forceinline__ VMin2() {}
@@ -2392,13 +2028,13 @@ namespace arithm

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

    template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
@@ -2415,14 +2051,14 @@ namespace cv { namespace gpu { namespace device

namespace arithm
{
    void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
    }

    void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
    }

    template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2430,12 +2066,6 @@ namespace arithm
        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
    }

    template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -2463,44 +2093,11 @@ namespace arithm

namespace arithm
{
    struct VMax4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmax4(a, b);
        }

        __device__ __forceinline__ VMax4() {}
@@ -2509,40 +2106,11 @@ namespace arithm

    ////////////////////////////////////

    struct VMax2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
            return vmax2(a, b);
        }

        __device__ __forceinline__ VMax2() {}
@@ -2552,13 +2120,13 @@ namespace arithm

namespace cv { namespace gpu { namespace device
{
    template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

    template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };
@@ -2575,14 +2143,14 @@ namespace cv { namespace gpu { namespace device

namespace arithm
{
    void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
    }

    void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
        transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
    }

    template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2590,12 +2158,6 @@ namespace arithm
        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
    }

    template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

@@ -1,934 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace surf
    {
        ////////////////////////////////////////////////////////////////////////
        // Global parameters

        // The maximum number of features (before subpixel interpolation) that memory is reserved for.
        __constant__ int c_max_candidates;
        // The maximum number of features that memory is reserved for.
        __constant__ int c_max_features;
        // The image size.
        __constant__ int c_img_rows;
        __constant__ int c_img_cols;
        // The number of octave layers.
        __constant__ int c_nOctaveLayers;
        // The Hessian threshold.
        __constant__ float c_hessianThreshold;

        // The current octave.
        __constant__ int c_octave;
        // The current layer size.
        __constant__ int c_layer_rows;
        __constant__ int c_layer_cols;

        void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
        }

        void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
        }

        ////////////////////////////////////////////////////////////////////////
        // Integral image texture

        texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
        texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
        texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);

        void bindImgTex(PtrStepSzb img)
        {
            bindTexture(&imgTex, img);
        }

        size_t bindSumTex(PtrStepSz<uint> sum)
        {
            size_t offset;
            cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
            cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step) );
            return offset / sizeof(uint);
        }
        size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
        {
            size_t offset;
            cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
            cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step) );
            return offset / sizeof(uint);
        }

        template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
        {
        #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
            typedef double real_t;
        #else
            typedef float real_t;
        #endif

            float ratio = (float)newSize / oldSize;

            real_t d = 0;

        #pragma unroll
            for (int k = 0; k < N; ++k)
            {
                int dx1 = __float2int_rn(ratio * src[k][0]);
                int dy1 = __float2int_rn(ratio * src[k][1]);
                int dx2 = __float2int_rn(ratio * src[k][2]);
                int dy2 = __float2int_rn(ratio * src[k][3]);

                real_t t = 0;
                t += tex2D(sumTex, x + dx1, y + dy1);
                t -= tex2D(sumTex, x + dx1, y + dy2);
                t -= tex2D(sumTex, x + dx2, y + dy1);
                t += tex2D(sumTex, x + dx2, y + dy2);

                d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
            }

            return (float)d;
        }
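        // The loop above relies on the standard integral-image identity: for an
        // integral image I, the sum of the source over the box (x1,y1)..(x2,y2) is
        //
        //     S = I(x2,y2) - I(x1,y2) - I(x2,y1) + I(x1,y1)
        //
        // which is exactly what the four tex2D(sumTex, ...) reads accumulate into t;
        // each box is then weighted by src[k][4] and normalized by its area.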

        ////////////////////////////////////////////////////////////////////////
        // Hessian

        __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
        __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
        __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };

        __host__ __device__ __forceinline__ int calcSize(int octave, int layer)
        {
            /* Wavelet size at first layer of first octave. */
            const int HAAR_SIZE0 = 9;

            /* Wavelet size increment between layers. This should be an even number,
               such that the wavelet sizes in an octave are either all even or all odd.
               This ensures that when looking for the neighbours of a sample, the layers
               above and below are aligned correctly. */
            const int HAAR_SIZE_INC = 6;

            return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
        }
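        // For example, with HAAR_SIZE0 = 9 and HAAR_SIZE_INC = 6:
        //     octave 0, layers 0..3 -> filter sizes  9, 15, 21, 27
        //     octave 1, layers 0..3 -> filter sizes 18, 30, 42, 54
        // i.e. each octave doubles the filter sizes of the previous one.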

        __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
        {
            // Determine the indices
            const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
            const int blockIdx_y = blockIdx.y % gridDim_y;
            const int blockIdx_z = blockIdx.y / gridDim_y;

            const int j = threadIdx.x + blockIdx.x * blockDim.x;
            const int i = threadIdx.y + blockIdx_y * blockDim.y;
            const int layer = blockIdx_z;

            const int size = calcSize(c_octave, layer);

            const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
            const int samples_j = 1 + ((c_img_cols - size) >> c_octave);

            // Ignore pixels where some of the kernel is outside the image
            const int margin = (size >> 1) >> c_octave;

            if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
            {
                const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
                const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
                const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));

                det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
                trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
            }
        }
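        // In the kernel above, dx, dy and dxy are box-filter approximations of the
        // second derivatives Dxx, Dyy and Dxy, and the determinant uses the SURF
        // paper's relative weight of 0.9 for the mixed term:
        //
        //     det(H) ~ Dxx * Dyy - (0.9 * Dxy)^2,   with 0.9^2 = 0.81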

        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
                                         int octave, int nOctaveLayers)
        {
            const int min_size = calcSize(octave, 0);
            const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
            const int max_samples_j = 1 + ((img_cols - min_size) >> octave);

            dim3 threads(16, 16);

            dim3 grid;
            grid.x = divUp(max_samples_j, threads.x);
            grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);

            icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
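        // The layer index is folded into grid.y because the kernel is launched with
        // a 2-D grid: with nOctaveLayers = 2 this packs the 2 + 2 border layers of
        // an octave into a single launch, and icvCalcLayerDetAndTrace recovers
        // (blockIdx_y, layer) from blockIdx.y with the '%' and '/' by gridDim_y.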

        ////////////////////////////////////////////////////////////////////////
        // NONMAX

        __constant__ float c_DM[5] = {0, 0, 9, 9, 1};

        struct WithMask
        {
            static __device__ bool check(int sum_i, int sum_j, int size)
            {
                float ratio = (float)size / 9.0f;

                float d = 0;

                int dx1 = __float2int_rn(ratio * c_DM[0]);
                int dy1 = __float2int_rn(ratio * c_DM[1]);
                int dx2 = __float2int_rn(ratio * c_DM[2]);
                int dy2 = __float2int_rn(ratio * c_DM[3]);

                float t = 0;
                t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
                t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
                t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
                t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);

                d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));

                return (d >= 0.5f);
            }
        };
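        // In effect, check() averages the mask over the (scaled) 9x9 support of the
        // candidate via the mask integral image and keeps the candidate only when at
        // least half of that support is unmasked (d >= 0.5f).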

        template <typename Mask>
        __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
                                             unsigned int* maxCounter)
        {
        #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

            extern __shared__ float N9[];

            // Decompose blockIdx.y into the row-block index and the layer index of the Hessian buffer.
            const int gridDim_y = gridDim.y / c_nOctaveLayers;
            const int blockIdx_y = blockIdx.y % gridDim_y;
            const int blockIdx_z = blockIdx.y / gridDim_y;

            const int layer = blockIdx_z + 1;

            const int size = calcSize(c_octave, layer);

            // Ignore pixels without a 3x3x3 neighbourhood in the layer above
            const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;

            const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
            const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;

            // Is this thread within the hessian buffer?
            const int zoff = blockDim.x * blockDim.y;
            const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
            N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
            N9[localLin       ] = det.ptr(c_layer_rows * (layer    ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
            N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
            __syncthreads();

            if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
            {
                float val0 = N9[localLin];

                if (val0 > c_hessianThreshold)
                {
                    // Coordinates for the start of the wavelet in the sum image. There
                    // is some integer division involved, so don't try to simplify this
                    // (cancel out sampleStep) without checking the result is the same
                    const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
                    const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;

                    if (Mask::check(sum_i, sum_j, size))
                    {
                        // Check to see if we have a max (in its 26 neighbours)
                        const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
                                          && val0 > N9[localLin     - blockDim.x - zoff]
                                          && val0 > N9[localLin + 1 - blockDim.x - zoff]
                                          && val0 > N9[localLin - 1              - zoff]
                                          && val0 > N9[localLin                  - zoff]
                                          && val0 > N9[localLin + 1              - zoff]
                                          && val0 > N9[localLin - 1 + blockDim.x - zoff]
                                          && val0 > N9[localLin     + blockDim.x - zoff]
                                          && val0 > N9[localLin + 1 + blockDim.x - zoff]

                                          && val0 > N9[localLin - 1 - blockDim.x]
                                          && val0 > N9[localLin     - blockDim.x]
                                          && val0 > N9[localLin + 1 - blockDim.x]
                                          && val0 > N9[localLin - 1             ]
                                          && val0 > N9[localLin + 1             ]
                                          && val0 > N9[localLin - 1 + blockDim.x]
                                          && val0 > N9[localLin     + blockDim.x]
                                          && val0 > N9[localLin + 1 + blockDim.x]

                                          && val0 > N9[localLin - 1 - blockDim.x + zoff]
                                          && val0 > N9[localLin     - blockDim.x + zoff]
                                          && val0 > N9[localLin + 1 - blockDim.x + zoff]
                                          && val0 > N9[localLin - 1              + zoff]
                                          && val0 > N9[localLin                  + zoff]
                                          && val0 > N9[localLin + 1              + zoff]
                                          && val0 > N9[localLin - 1 + blockDim.x + zoff]
                                          && val0 > N9[localLin     + blockDim.x + zoff]
                                          && val0 > N9[localLin + 1 + blockDim.x + zoff];

                        if (condmax)
                        {
                            unsigned int ind = atomicInc(maxCounter, (unsigned int)-1);

                            if (ind < c_max_candidates)
                            {
                                const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);

                                maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
                            }
                        }
                    }
                }
            }

        #endif
        }

        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
                                      int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
        {
            const int layer_rows = img_rows >> octave;
            const int layer_cols = img_cols >> octave;

            const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;

            dim3 threads(16, 16);

            dim3 grid;
            grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
            grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;

            const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);

            if (use_mask)
                icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
            else
                icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);

            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
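        // Each 16x16 block loads a full 16x16 tile of the determinant into shared
        // memory, but only its 14x14 interior threads test for maxima; the blocks
        // therefore advance by (threads.x - 2, threads.y - 2) so that neighbouring
        // tiles overlap by the one-pixel halo needed for the 3x3x3 comparison.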

        ////////////////////////////////////////////////////////////////////////
        // INTERPOLATION

        __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
                                               float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
                                               unsigned int* featureCounter)
        {
        #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

            const int4 maxPos = maxPosBuffer[blockIdx.x];

            const int j = maxPos.x - 1 + threadIdx.x;
            const int i = maxPos.y - 1 + threadIdx.y;
            const int layer = maxPos.z - 1 + threadIdx.z;

            __shared__ float N9[3][3][3];

            N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
            __syncthreads();

            if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
            {
                __shared__ float dD[3];

                // dx
                dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
                // dy
                dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
                // ds
                dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);

                __shared__ float H[3][3];

                // dxx
                H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
                // dxy
                H[0][1] = 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
                // dxs
                H[0][2] = 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
                // dyx = dxy
                H[1][0] = H[0][1];
                // dyy
                H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
                // dys
                H[1][2] = 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
                // dsx = dxs
                H[2][0] = H[0][2];
                // dsy = dys
                H[2][1] = H[1][2];
                // dss
                H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];

                __shared__ float x[3];

                if (solve3x3(H, dD, x))
                {
                    if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
                    {
                        // if the step is within the interpolation region, perform it

                        const int size = calcSize(c_octave, maxPos.z);

                        const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
                        const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;

                        const float center_i = sum_i + (float)(size - 1) / 2;
                        const float center_j = sum_j + (float)(size - 1) / 2;

                        const float px = center_j + x[0] * (1 << c_octave);
                        const float py = center_i + x[1] * (1 << c_octave);

                        const int ds = size - calcSize(c_octave, maxPos.z - 1);
                        const float psize = roundf(size + x[2] * ds);

                        /* The sampling intervals and wavelet sizes used for selecting an orientation
                           and building the keypoint descriptor are defined relative to 's' */
                        const float s = psize * 1.2f / 9.0f;

                        /* To find the dominant orientation, the gradients in x and y are
                           sampled in a circle of radius 6s using wavelets of size 4s.
                           We ensure the gradient wavelet size is even to ensure the
                           wavelet pattern is balanced and symmetric around its center */
                        const int grad_wav_size = 2 * __float2int_rn(2.0f * s);

                        // Reject the keypoint if the orientation wavelet does not fit inside the image
                        if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
                        {
                            // Get a new feature index.
                            unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);

                            if (ind < c_max_features)
                            {
                                featureX[ind] = px;
                                featureY[ind] = py;
                                featureLaplacian[ind] = maxPos.w;
                                featureOctave[ind] = c_octave;
                                featureSize[ind] = psize;
                                featureHessian[ind] = N9[1][1][1];
                            }
                        } // grad_wav_size check
                    } // If the subpixel interpolation worked
                }
            } // If this is thread 0.

        #endif
        }

        void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
                                        float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
                                        unsigned int* featureCounter)
        {
            dim3 threads;
            threads.x = 3;
            threads.y = 3;
            threads.z = 3;

            dim3 grid;
            grid.x = maxCounter;

            icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureOctave, featureSize, featureHessian, featureCounter);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
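        // The kernel above is the usual quadratic refinement around a detected
        // maximum: dD holds the negated gradient and H the Hessian of the
        // determinant response in (x, y, scale), so solve3x3 yields the subpixel
        // offset
        //
        //     x = -H^{-1} * grad(D)
        //
        // and the candidate is kept only while every component satisfies |x_k| <= 1.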

        ////////////////////////////////////////////////////////////////////////
        // Orientation

        #define ORI_SEARCH_INC 5
        #define ORI_WIN        60
        #define ORI_SAMPLES    113

__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};

        __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
        __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};

        __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
        {
            __shared__ float s_X[128];
            __shared__ float s_Y[128];
            __shared__ float s_angle[128];

            __shared__ float s_sumx[32 * 4];
            __shared__ float s_sumy[32 * 4];

            /* The sampling intervals and wavelet sizes used for selecting an orientation
               and building the keypoint descriptor are defined relative to 's' */
            const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;

            /* To find the dominant orientation, the gradients in x and y are
               sampled in a circle of radius 6s using wavelets of size 4s.
               We ensure the gradient wavelet size is even to ensure the
               wavelet pattern is balanced and symmetric around its center */
            const int grad_wav_size = 2 * __float2int_rn(2.0f * s);

            // Reject the keypoint if the orientation wavelet does not fit inside the image
            if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
                return;

            // Calc X, Y, angle and store it to shared memory
            const int tid = threadIdx.y * blockDim.x + threadIdx.x;

            float X = 0.0f, Y = 0.0f, angle = 0.0f;

            if (tid < ORI_SAMPLES)
            {
                const float margin = (float)(grad_wav_size - 1) / 2.0f;
                const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
                const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);

                if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
                    x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
                {
                    X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
                    Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);

                    angle = atan2f(Y, X);
                    if (angle < 0)
                        angle += 2.0f * CV_PI_F;
                    angle *= 180.0f / CV_PI_F;
                }
            }
            s_X[tid] = X;
            s_Y[tid] = Y;
            s_angle[tid] = angle;
            __syncthreads();

            float bestx = 0, besty = 0, best_mod = 0;

        #if __CUDA_ARCH__ >= 200
            #pragma unroll
        #endif
            for (int i = 0; i < 18; ++i)
            {
                const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;

                float sumx = 0.0f, sumy = 0.0f;
                int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
                {
                    sumx = s_X[threadIdx.x];
                    sumy = s_Y[threadIdx.x];
                }
                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
                {
                    sumx += s_X[threadIdx.x + 32];
                    sumy += s_Y[threadIdx.x + 32];
                }
                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
                {
                    sumx += s_X[threadIdx.x + 64];
                    sumy += s_Y[threadIdx.x + 64];
                }
                d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
                {
                    sumx += s_X[threadIdx.x + 96];
                    sumy += s_Y[threadIdx.x + 96];
                }

                plus<float> op;
                device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
                                   thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));

                const float temp_mod = sumx * sumx + sumy * sumy;
                if (temp_mod > best_mod)
                {
                    best_mod = temp_mod;
                    bestx = sumx;
                    besty = sumy;
                }

                __syncthreads();
            }

            if (threadIdx.x == 0)
            {
                s_X[threadIdx.y] = bestx;
                s_Y[threadIdx.y] = besty;
                s_angle[threadIdx.y] = best_mod;
            }
            __syncthreads();

            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                int bestIdx = 0;

                if (s_angle[1] > s_angle[bestIdx])
                    bestIdx = 1;
                if (s_angle[2] > s_angle[bestIdx])
                    bestIdx = 2;
                if (s_angle[3] > s_angle[bestIdx])
                    bestIdx = 3;

                float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
                if (kp_dir < 0)
                    kp_dir += 2.0f * CV_PI_F;
                kp_dir *= 180.0f / CV_PI_F;

                kp_dir = 360.0f - kp_dir;
                if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
                    kp_dir = 0.f;

                featureDir[blockIdx.x] = kp_dir;
            }
        }

        #undef ORI_SEARCH_INC
        #undef ORI_WIN
        #undef ORI_SAMPLES

        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
        {
            dim3 threads;
            threads.x = 32;
            threads.y = 4;

            dim3 grid;
            grid.x = nFeatures;

            icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
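        // The search enumerates all 360 / ORI_SEARCH_INC = 72 candidate directions
        // with a sliding ORI_WIN = 60 degree window: the 18 loop iterations times
        // the 4 threadIdx.y rows cover the 72 windows, each 32-thread row sums the
        // Haar responses whose angle falls in its window, and the window with the
        // largest sumx^2 + sumy^2 defines the dominant orientation.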

        ////////////////////////////////////////////////////////////////////////
        // Descriptors

        #define PATCH_SZ 20

        __constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
        {
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};

        struct WinReader
        {
            typedef uchar elem_type;

            __device__ __forceinline__ uchar operator ()(int i, int j) const
            {
                float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
                float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;

                return tex2D(imgTex, pixel_x, pixel_y);
            }

            float centerX;
            float centerY;
            float win_offset;
            float cos_dir;
            float sin_dir;
            int width;
            int height;
        };
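        // WinReader maps patch coordinates (i, j) into the image through a rotation
        // by the keypoint direction; written out, the transform in operator() is
        //
        //     pixel = center + R(dir) * (win_offset + j, win_offset + i)^T,
        //     R(dir) = [ cos_dir  sin_dir; -sin_dir  cos_dir ]
        //
        // so descriptor samples are taken in a frame aligned with the orientation.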

        __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
                                   float& dx, float& dy)
        {
            __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];

            dx = dy = 0.0f;

            WinReader win;

            win.centerX = featureX[blockIdx.x];
            win.centerY = featureY[blockIdx.x];

            // The sampling intervals and wavelet sizes used for selecting an orientation
            // and building the keypoint descriptor are defined relative to 's'
            const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;

            // Extract a window of pixels around the keypoint of size 20s
            const int win_size = (int)((PATCH_SZ + 1) * s);

            win.width = win.height = win_size;

            // Nearest neighbour version (faster)
            win.win_offset = -(win_size - 1.0f) / 2.0f;

            float descriptor_dir = 360.0f - featureDir[blockIdx.x];
            if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
                descriptor_dir = 0.f;
            descriptor_dir *= CV_PI_F / 180.0f;
            sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);

            const int tid = threadIdx.y * blockDim.x + threadIdx.x;

            const int xLoadInd = tid % (PATCH_SZ + 1);
            const int yLoadInd = tid / (PATCH_SZ + 1);

            if (yLoadInd < (PATCH_SZ + 1))
            {
                if (s > 1)
                {
                    AreaFilter<WinReader> filter(win, s, s);
                    s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
                }
                else
                {
                    LinearFilter<WinReader> filter(win);
                    s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
                }
            }

            __syncthreads();

            const int xPatchInd = threadIdx.x % 5;
            const int yPatchInd = threadIdx.x / 5;

            if (yPatchInd < 5)
            {
                const int xBlockInd = threadIdx.y % 4;
                const int yBlockInd = threadIdx.y / 4;

                const int xInd = xBlockInd * 5 + xPatchInd;
                const int yInd = yBlockInd * 5 + yPatchInd;

                const float dw = c_DW[yInd * PATCH_SZ + xInd];

                dx = (s_PATCH[yInd    ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd    ]) * dw;
                dy = (s_PATCH[yInd + 1][xInd    ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd    ][xInd + 1]) * dw;
            }
        }
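        // Work layout assumed by both descriptor kernels below: the 20x20 sample
        // patch is split into a 4x4 grid of 5x5 sub-regions (threadIdx.y selects
        // the sub-region, threadIdx.x the sample inside it), and every sub-region
        // later contributes one (sum dx, sum dy, sum |dx|, sum |dy|) quadruple:
        // 16 sub-regions x 4 values = the 64-element descriptor, split further by
        // sign into 8 values per sub-region for the extended 128-element variant.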

        __global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
        {
            __shared__ float smem[32 * 16];

            float* sRow = smem + threadIdx.y * 32;

            float dx, dy;
            calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);

            float dxabs = ::fabsf(dx);
            float dyabs = ::fabsf(dy);

            plus<float> op;

            reduce<32>(sRow, dx, threadIdx.x, op);
            reduce<32>(sRow, dy, threadIdx.x, op);
            reduce<32>(sRow, dxabs, threadIdx.x, op);
            reduce<32>(sRow, dyabs, threadIdx.x, op);

            float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;

            // write dx, dy, |dx|, |dy|
            if (threadIdx.x == 0)
                *descriptors_block = make_float4(dx, dy, dxabs, dyabs);
        }

        __global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
        {
            __shared__ float smem[32 * 16];

            float* sRow = smem + threadIdx.y * 32;

            float dx, dy;
            calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);

            float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;

            plus<float> op;

            float d1 = 0.0f;
            float d2 = 0.0f;
            float abs1 = 0.0f;
            float abs2 = 0.0f;

            if (dy >= 0)
            {
                d1 = dx;
                abs1 = ::fabsf(dx);
            }
            else
            {
                d2 = dx;
                abs2 = ::fabsf(dx);
            }

            reduce<32>(sRow, d1, threadIdx.x, op);
            reduce<32>(sRow, d2, threadIdx.x, op);
            reduce<32>(sRow, abs1, threadIdx.x, op);
            reduce<32>(sRow, abs2, threadIdx.x, op);

            // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
            if (threadIdx.x == 0)
                descriptors_block[0] = make_float4(d1, abs1, d2, abs2);

            if (dx >= 0)
            {
                d1 = dy;
                abs1 = ::fabsf(dy);
                d2 = 0.0f;
                abs2 = 0.0f;
            }
            else
            {
                d1 = 0.0f;
                abs1 = 0.0f;
                d2 = dy;
                abs2 = ::fabsf(dy);
            }

            reduce<32>(sRow, d1, threadIdx.x, op);
            reduce<32>(sRow, d2, threadIdx.x, op);
            reduce<32>(sRow, abs1, threadIdx.x, op);
            reduce<32>(sRow, abs2, threadIdx.x, op);

            // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
            if (threadIdx.x == 0)
                descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
        }

        template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
        {
            __shared__ float smem[BLOCK_DIM_X];
            __shared__ float s_len;

            // the base pointer depends only on the block: one descriptor per block
            float* descriptor_base = descriptors.ptr(blockIdx.x);

            // read in the unnormalized descriptor value and square it for the length reduction
            const float val = descriptor_base[threadIdx.x];

            float len = val * val;
            reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());

            if (threadIdx.x == 0)
                s_len = ::sqrtf(len);

            __syncthreads();

            // normalize and store in output
            descriptor_base[threadIdx.x] = val / s_len;
        }
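        // This is plain L2 normalization: len accumulates sum(val * val) over the
        // block's shared-memory reduction and every component is scaled as
        //
        //     v_i <- v_i / sqrt(sum_j v_j^2)
        //
        // so descriptors can be matched with dot products or Euclidean distances.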

        void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
        {
            // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D

            if (descriptors.cols == 64)
            {
                compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );

                normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );
            }
            else
            {
                compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );

                normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );
            }
        }
    } // namespace surf
}}} // namespace cv { namespace gpu { namespace device


#endif /* CUDA_DISABLER */
@@ -263,11 +263,8 @@ namespace

namespace arithm
{
    template <typename T, typename D>
    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

    template <typename T, typename D>
    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);

    template <typename T, typename D>
    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -345,62 +342,6 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
        }
    };

    typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    static const vfunc_t vfuncs4[4][4] =
    {
        {
            vadd4<unsigned int, unsigned int>,
            vadd4<unsigned int, int>,
            0,
            0
        },
        {
            vadd4<int, unsigned int>,
            vadd4<int, int>,
            0,
            0
        },
        {
            0,
            0,
            0,
            0
        },
        {
            0,
            0,
            0,
            0
        }
    };
    static const vfunc_t vfuncs2[4][4] =
    {
        {
            0,
            0,
            0,
            0
        },
        {
            0,
            0,
            0,
            0
        },
        {
            0,
            0,
            vadd2<unsigned int, unsigned int>,
            vadd2<unsigned int, int>
        },
        {
            0,
            0,
            vadd2<int, unsigned int>,
            vadd2<int, int>
        }
    };

    if (dtype < 0)
        dtype = src1.depth();

@@ -426,7 +367,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

    if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -434,31 +375,27 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
        if (isAllAligned)
        {
            const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
            const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];

            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }

            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }
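            // A worked example of when this fast path fires (as implied by the checks
            // above): for a CV_8U matrix whose data pointers are 32-byte aligned and
            // whose flattened column count is a multiple of 4, the rows are
            // reinterpreted as cols/4 packed uints and addMat_v4 adds four pixels per
            // machine word; CV_16U takes the _v2 variant with two pixels per word.
            // Misaligned or odd-width inputs fall through to the scalar addMat path.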
@@ -606,11 +543,8 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat

namespace arithm
{
    template <typename T, typename D>
    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

    template <typename T, typename D>
    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    void subMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    void subMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);

    template <typename T, typename D>
    void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -688,62 +622,6 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
        }
    };

    typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    static const vfunc_t vfuncs4[4][4] =
    {
        {
            vsub4<unsigned int, unsigned int>,
            vsub4<unsigned int, int>,
            0,
            0
        },
        {
            vsub4<int, unsigned int>,
            vsub4<int, int>,
            0,
            0
        },
        {
            0,
            0,
            0,
            0
        },
        {
            0,
            0,
            0,
            0
        }
    };
    static const vfunc_t vfuncs2[4][4] =
    {
        {
            0,
            0,
            0,
            0
        },
        {
            0,
            0,
            0,
            0
        },
        {
            0,
            0,
            vsub2<unsigned int, unsigned int>,
            vsub2<unsigned int, int>
        },
        {
            0,
            0,
            vsub2<int, unsigned int>,
            vsub2<int, int>
        }
    };

    if (dtype < 0)
        dtype = src1.depth();

@@ -769,7 +647,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

    if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -777,31 +655,27 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
        if (isAllAligned)
        {
            const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
            const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];

            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }

            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }
@@ -1585,11 +1459,8 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St

namespace arithm
{
    template <typename T>
    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

    template <typename T>
    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    void absDiffMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    void absDiffMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);

    template <typename T>
    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1610,20 +1481,6 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
        absDiffMat<float>,
        absDiffMat<double>
    };
    static const func_t vfuncs4[] =
    {
        vabsDiff4<unsigned int>,
        vabsDiff4<int>,
        0,
        0
    };
    static const func_t vfuncs2[] =
    {
        0,
        0,
        vabsDiff2<unsigned int>,
        vabsDiff2<int>
    };

    const int depth = src1.depth();
    const int cn = src1.channels();
@@ -1645,7 +1502,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

    if (depth < CV_32S)
    if (depth == CV_8U || depth == CV_16U)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -1653,31 +1510,27 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
        if (isAllAligned)
        {
            const func_t vfunc4 = vfuncs4[depth];
            const func_t vfunc2 = vfuncs2[depth];

            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
            if (depth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                              stream);

                return;
            }

            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
            else if (depth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                              stream);

                return;
            }
|
||||
@@ -1940,6 +1793,11 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)

namespace arithm
{
    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);

    template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1962,6 +1820,12 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
        {cmpMatEq<double> , cmpMatNe<double> , cmpMatLt<double> , cmpMatLe<double> }
    };

    typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
    static const func_v4_t funcs_v4[] =
    {
        cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
    };

    const int depth = src1.depth();
    const int cn = src1.channels();

@@ -1997,6 +1861,27 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
    PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

    if (depth == CV_8U && (src1_.cols & 3) == 0)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

        if (isAllAligned)
        {
            const int vcols = src1_.cols >> 2;

            funcs_v4[code](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                           PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                           PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                           stream);

            return;
        }
    }

    const func_t func = funcs[depth][code];

    func(src1_, src2_, dst_, stream);
@@ -2532,13 +2417,13 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st

namespace arithm
{
    template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    void minMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);

    template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    void maxMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    void maxMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
@@ -2558,20 +2443,6 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
        minMat<float>,
        minMat<double>
    };
    static const func_t vfuncs4[] =
    {
        vmin4<unsigned int>,
        vmin4<int>,
        0,
        0
    };
    static const func_t vfuncs2[] =
    {
        0,
        0,
        vmin2<unsigned int>,
        vmin2<int>
    };

    const int depth = src1.depth();
    const int cn = src1.channels();
@@ -2593,7 +2464,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

    if (depth < CV_32S)
    if (depth == CV_8U || depth == CV_16U)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2601,31 +2472,27 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
        if (isAllAligned)
        {
            const func_t vfunc4 = vfuncs4[depth];
            const func_t vfunc2 = vfuncs2[depth];

            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
            if (depth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }

            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
            else if (depth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }
@@ -2655,20 +2522,6 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
        maxMat<float>,
        maxMat<double>
    };
    static const func_t vfuncs4[] =
    {
        vmax4<unsigned int>,
        vmax4<int>,
        0,
        0
    };
    static const func_t vfuncs2[] =
    {
        0,
        0,
        vmax2<unsigned int>,
        vmax2<int>
    };

    const int depth = src1.depth();
    const int cn = src1.channels();
@@ -2690,7 +2543,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

    if (depth < CV_32S)
    if (depth == CV_8U || depth == CV_16U)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2698,31 +2551,27 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
        if (isAllAligned)
        {
            const func_t vfunc4 = vfuncs4[depth];
            const func_t vfunc2 = vfuncs2[depth];

            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
            if (depth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }

            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
            else if (depth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
                       stream);
                maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
                          stream);

                return;
            }

@@ -1,418 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::gpu;
|
||||
|
||||
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
|
||||
|
||||
cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }
|
||||
cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float, bool) { throw_nogpu(); }
|
||||
int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0;}
|
||||
void cv::gpu::SURF_GPU::uploadKeypoints(const std::vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, std::vector<float>&) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&, std::vector<float>&, bool) { throw_nogpu(); }
|
||||
void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }
|
||||
|
||||
#else /* !defined (HAVE_CUDA) */
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace surf
|
||||
{
|
||||
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
|
||||
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
|
||||
|
||||
void bindImgTex(PtrStepSzb img);
|
||||
size_t bindSumTex(PtrStepSz<unsigned int> sum);
|
||||
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
|
||||
|
||||
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
|
||||
int octave, int nOctaveLayer);
|
||||
|
||||
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
|
||||
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
|
||||
|
||||
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
|
||||
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
|
||||
unsigned int* featureCounter);
|
||||
|
||||
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
|
||||
|
||||
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
|
||||
}
|
||||
}}}
|
||||
|
||||
using namespace ::cv::gpu::device::surf;
|
||||
|
||||
namespace
|
||||
{
|
||||
int calcSize(int octave, int layer)
|
||||
{
|
||||
/* Wavelet size at first layer of first octave. */
|
||||
const int HAAR_SIZE0 = 9;
|
||||
|
||||
/* Wavelet size increment between layers. This should be an even number,
|
||||
such that the wavelet sizes in an octave are either all even or all odd.
|
||||
This ensures that when looking for the neighbours of a sample, the layers
|
||||
|
||||
above and below are aligned correctly. */
|
||||
const int HAAR_SIZE_INC = 6;
|
||||
|
||||
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
|
||||
}
|
||||
|
||||
class SURF_GPU_Invoker
|
||||
{
|
||||
public:
|
||||
SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
|
||||
surf_(surf),
|
||||
img_cols(img.cols), img_rows(img.rows),
|
||||
use_mask(!mask.empty())
|
||||
{
|
||||
CV_Assert(!img.empty() && img.type() == CV_8UC1);
|
||||
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
|
||||
CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
|
||||
|
||||
const int min_size = calcSize(surf_.nOctaves - 1, 0);
|
||||
CV_Assert(img_rows - min_size >= 0);
|
||||
CV_Assert(img_cols - min_size >= 0);
|
||||
|
||||
const int layer_rows = img_rows >> (surf_.nOctaves - 1);
|
||||
const int layer_cols = img_cols >> (surf_.nOctaves - 1);
|
||||
const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
|
||||
CV_Assert(layer_rows - 2 * min_margin > 0);
|
||||
CV_Assert(layer_cols - 2 * min_margin > 0);
|
||||
|
||||
maxFeatures = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
|
||||
maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
|
||||
|
||||
CV_Assert(maxFeatures > 0);
|
||||
|
||||
counters.create(1, surf_.nOctaves + 1, CV_32SC1);
|
||||
counters.setTo(Scalar::all(0));
|
||||
|
||||
loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
|
||||
|
||||
bindImgTex(img);
|
||||
|
||||
integralBuffered(img, surf_.sum, surf_.intBuffer);
|
||||
sumOffset = bindSumTex(surf_.sum);
|
||||
|
||||
if (use_mask)
|
||||
{
|
||||
min(mask, 1.0, surf_.mask1);
|
||||
integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
|
||||
maskOffset = bindMaskSumTex(surf_.maskSum);
|
||||
}
|
||||
}
|
||||
|
||||
void detectKeypoints(GpuMat& keypoints)
|
||||
{
|
||||
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
|
||||
ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
|
||||
|
||||
ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
|
||||
ensureSizeIsEnough(SURF_GPU::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
|
||||
keypoints.setTo(Scalar::all(0));
|
||||
|
||||
for (int octave = 0; octave < surf_.nOctaves; ++octave)
|
||||
{
|
||||
const int layer_rows = img_rows >> octave;
|
||||
const int layer_cols = img_cols >> octave;
|
||||
|
||||
loadOctaveConstants(octave, layer_rows, layer_cols);
|
||||
|
||||
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
|
||||
|
||||
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
|
||||
img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
|
||||
|
||||
unsigned int maxCounter;
|
||||
cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
|
||||
|
||||
if (maxCounter > 0)
|
||||
{
|
||||
icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer.ptr<int4>(), maxCounter,
|
||||
keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
|
||||
keypoints.ptr<int>(SURF_GPU::LAPLACIAN_ROW), keypoints.ptr<int>(SURF_GPU::OCTAVE_ROW),
|
||||
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::HESSIAN_ROW),
|
||||
counters.ptr<unsigned int>());
|
||||
}
|
||||
}
|
||||
unsigned int featureCounter;
|
||||
cudaSafeCall( cudaMemcpy(&featureCounter, counters.ptr<unsigned int>(), sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
||||
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
|
||||
|
||||
keypoints.cols = featureCounter;
|
||||
|
||||
if (surf_.upright)
|
||||
keypoints.row(SURF_GPU::ANGLE_ROW).setTo(Scalar::all(360.0 - 90.0));
|
||||
else
|
||||
findOrientation(keypoints);
|
||||
}
|
||||
|
||||
void findOrientation(GpuMat& keypoints)
|
||||
{
|
||||
const int nFeatures = keypoints.cols;
|
||||
if (nFeatures > 0)
|
||||
{
|
||||
icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
|
||||
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
|
||||
}
|
||||
}
|
||||
|
||||
void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
|
||||
{
|
||||
const int nFeatures = keypoints.cols;
|
||||
if (nFeatures > 0)
|
||||
{
|
||||
ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
|
||||
compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
|
||||
keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
SURF_GPU& surf_;
|
||||
|
||||
int img_cols, img_rows;
|
||||
|
||||
bool use_mask;
|
||||
|
||||
int maxCandidates;
|
||||
int maxFeatures;
|
||||
|
||||
size_t maskOffset;
|
||||
size_t sumOffset;
|
||||
|
||||
GpuMat counters;
|
||||
};
|
||||
}
|
||||
|
||||
cv::gpu::SURF_GPU::SURF_GPU()
|
||||
{
|
||||
hessianThreshold = 100;
|
||||
extended = true;
|
||||
nOctaves = 4;
|
||||
nOctaveLayers = 2;
|
||||
keypointsRatio = 0.01f;
|
||||
upright = false;
|
||||
}
|
||||
|
||||
cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
|
||||
{
|
||||
hessianThreshold = _threshold;
|
||||
extended = _extended;
|
||||
nOctaves = _nOctaves;
|
||||
nOctaveLayers = _nOctaveLayers;
|
||||
keypointsRatio = _keypointsRatio;
|
||||
upright = _upright;
|
||||
}
|
||||
|
||||
int cv::gpu::SURF_GPU::descriptorSize() const
|
||||
{
|
||||
return extended ? 128 : 64;
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU)
|
||||
{
|
||||
if (keypoints.empty())
|
||||
keypointsGPU.release();
|
||||
else
|
||||
{
|
||||
Mat keypointsCPU(SURF_GPU::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
|
||||
|
||||
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
|
||||
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
|
||||
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
|
||||
int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
|
||||
float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
|
||||
float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
|
||||
float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);
|
||||
|
||||
for (size_t i = 0, size = keypoints.size(); i < size; ++i)
|
||||
{
|
||||
const KeyPoint& kp = keypoints[i];
|
||||
kp_x[i] = kp.pt.x;
|
||||
kp_y[i] = kp.pt.y;
|
||||
kp_octave[i] = kp.octave;
|
||||
kp_size[i] = kp.size;
|
||||
kp_dir[i] = kp.angle;
|
||||
kp_hessian[i] = kp.response;
|
||||
kp_laplacian[i] = 1;
|
||||
}
|
||||
|
||||
keypointsGPU.upload(keypointsCPU);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints)
|
||||
{
|
||||
const int nFeatures = keypointsGPU.cols;
|
||||
|
||||
if (nFeatures == 0)
|
||||
keypoints.clear();
|
||||
else
|
||||
{
|
||||
CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
|
||||
|
||||
Mat keypointsCPU(keypointsGPU);
|
||||
|
||||
keypoints.resize(nFeatures);
|
||||
|
||||
float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
|
||||
float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
|
||||
int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
|
||||
int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
|
||||
float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
|
||||
float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
|
||||
float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);
|
||||
|
||||
for (int i = 0; i < nFeatures; ++i)
|
||||
{
|
||||
KeyPoint& kp = keypoints[i];
|
||||
kp.pt.x = kp_x[i];
|
||||
kp.pt.y = kp_y[i];
|
||||
kp.class_id = kp_laplacian[i];
|
||||
kp.octave = kp_octave[i];
|
||||
kp.size = kp_size[i];
|
||||
kp.angle = kp_dir[i];
|
||||
kp.response = kp_hessian[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors)
|
||||
{
|
||||
if (descriptorsGPU.empty())
|
||||
descriptors.clear();
|
||||
else
|
||||
{
|
||||
CV_Assert(descriptorsGPU.type() == CV_32F);
|
||||
|
||||
descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
|
||||
Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
|
||||
descriptorsGPU.download(descriptorsCPU);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
|
||||
{
|
||||
if (!img.empty())
|
||||
{
|
||||
SURF_GPU_Invoker surf(*this, img, mask);
|
||||
|
||||
surf.detectKeypoints(keypoints);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
|
||||
bool useProvidedKeypoints)
|
||||
{
|
||||
if (!img.empty())
|
||||
{
|
||||
SURF_GPU_Invoker surf(*this, img, mask);
|
||||
|
||||
if (!useProvidedKeypoints)
|
||||
surf.detectKeypoints(keypoints);
|
||||
else if (!upright)
|
||||
{
|
||||
surf.findOrientation(keypoints);
|
||||
}
|
||||
|
||||
surf.computeDescriptors(keypoints, descriptors, descriptorSize());
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
|
||||
{
|
||||
GpuMat keypointsGPU;
|
||||
|
||||
(*this)(img, mask, keypointsGPU);
|
||||
|
||||
downloadKeypoints(keypointsGPU, keypoints);
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints,
|
||||
GpuMat& descriptors, bool useProvidedKeypoints)
|
||||
{
|
||||
GpuMat keypointsGPU;
|
||||
|
||||
if (useProvidedKeypoints)
|
||||
uploadKeypoints(keypoints, keypointsGPU);
|
||||
|
||||
(*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
|
||||
|
||||
downloadKeypoints(keypointsGPU, keypoints);
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints,
|
||||
std::vector<float>& descriptors, bool useProvidedKeypoints)
|
||||
{
|
||||
GpuMat descriptorsGPU;
|
||||
|
||||
(*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
|
||||
|
||||
downloadDescriptors(descriptorsGPU, descriptors);
|
||||
}
|
||||
|
||||
void cv::gpu::SURF_GPU::releaseMemory()
|
||||
{
|
||||
sum.release();
|
||||
mask1.release();
|
||||
maskSum.release();
|
||||
intBuffer.release();
|
||||
det.release();
|
||||
trace.release();
|
||||
maxPosBuffer.release();
|
||||
}
|
||||
|
||||
#endif /* !defined (HAVE_CUDA) */
|
@@ -49,71 +49,6 @@ using namespace cv::gpu;
using namespace cvtest;
using namespace testing;

void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
    cout << "OS: Windows x64 \n" << endl;
# else
    cout << "OS: Windows x32 \n" << endl;
# endif
#elif defined linux
# if defined _LP64
    cout << "OS: Linux x64 \n" << endl;
# else
    cout << "OS: Linux x32 \n" << endl;
# endif
#elif defined __APPLE__
# if defined _LP64
    cout << "OS: Apple x64 \n" << endl;
# else
    cout << "OS: Apple x32 \n" << endl;
# endif
#endif
}

void printCudaInfo()
{
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
    cout << "OpenCV was built without CUDA support \n" << endl;
#else
    int driver;
    cudaDriverGetVersion(&driver);

    cout << "CUDA Driver version: " << driver << '\n';
    cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';

    cout << endl;

    cout << "GPU module was compiled for the following GPU archs:" << endl;
    cout << " BIN: " << CUDA_ARCH_BIN << '\n';
    cout << " PTX: " << CUDA_ARCH_PTX << '\n';

    cout << endl;

    int deviceCount = getCudaEnabledDeviceCount();
    cout << "CUDA device count: " << deviceCount << '\n';

    cout << endl;

    for (int i = 0; i < deviceCount; ++i)
    {
        DeviceInfo info(i);

        cout << "Device [" << i << "] \n";
        cout << "\t Name: " << info.name() << '\n';
        cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion() << '\n';
        cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
        cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n";
        cout << "\t Free memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n";
        if (!info.isCompatible())
            cout << "\t !!! This device is NOT compatible with current GPU module build \n";

        cout << endl;
    }
#endif
}

int main(int argc, char** argv)
{
    try
@@ -133,7 +68,6 @@ int main(int argc, char** argv)
            return 0;
        }

        printOsInfo();
        printCudaInfo();

        if (cmd.has("info"))

@@ -43,9 +43,25 @@

#ifdef HAVE_CUDA

using namespace cvtest;

#if defined(HAVE_XINE) || \
    defined(HAVE_GSTREAMER) || \
    defined(HAVE_QUICKTIME) || \
    defined(HAVE_AVFOUNDATION) || \
    defined(HAVE_FFMPEG) || \
    defined(WIN32) /* assume that we have ffmpeg */

#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
#else
#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
#endif

//////////////////////////////////////////////////////
// FGDStatModel

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

namespace cv
{
    template<> void Ptr<CvBGStatModel>::delete_obj()
@@ -130,9 +146,13 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, FGDStatModel, testing::Combine(
    testing::Values(std::string("768x576.avi")),
    testing::Values(Channels(3), Channels(4))));

#endif

//////////////////////////////////////////////////////
// MOG

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

namespace
{
    IMPLEMENT_PARAM_CLASS(UseGray, bool)
@@ -204,9 +224,13 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG, testing::Combine(
    testing::Values(LearningRate(0.0), LearningRate(0.01)),
    WHOLE_SUBMAT));

#endif

//////////////////////////////////////////////////////
// MOG2

#if BUILD_WITH_VIDEO_INPUT_SUPPORT

namespace
{
    IMPLEMENT_PARAM_CLASS(DetectShadow, bool)
@@ -320,46 +344,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG2, testing::Combine(
    testing::Values(DetectShadow(true), DetectShadow(false)),
    WHOLE_SUBMAT));

//////////////////////////////////////////////////////
// VIBE

PARAM_TEST_CASE(VIBE, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
{
};

GPU_TEST_P(VIBE, Accuracy)
{
    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
    cv::gpu::setDevice(devInfo.deviceID());
    const cv::Size size = GET_PARAM(1);
    const int type = GET_PARAM(2);
    const bool useRoi = GET_PARAM(3);

    const cv::Mat fullfg(size, CV_8UC1, cv::Scalar::all(255));

    cv::Mat frame = randomMat(size, type, 0.0, 100);
    cv::gpu::GpuMat d_frame = loadMat(frame, useRoi);

    cv::gpu::VIBE_GPU vibe;
    cv::gpu::GpuMat d_fgmask = createMat(size, CV_8UC1, useRoi);
    vibe.initialize(d_frame);

    for (int i = 0; i < 20; ++i)
        vibe(d_frame, d_fgmask);

    frame = randomMat(size, type, 160, 255);
    d_frame = loadMat(frame, useRoi);
    vibe(d_frame, d_fgmask);

    // now fgmask should be entirely foreground
    ASSERT_MAT_NEAR(fullfg, d_fgmask, 0);
}

INSTANTIATE_TEST_CASE_P(GPU_Video, VIBE, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4)),
    WHOLE_SUBMAT));
#endif

//////////////////////////////////////////////////////
// GMG

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

//////////////////////////////////////////////////////////////////////////
// StereoBM

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

///////////////////////////////////////////////////////////////////////////////////////////////////////
// cvtColor

@@ -2218,12 +2220,245 @@ GPU_TEST_P(CvtColor, BayerGR2BGR4)
    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst3(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 0);
}

GPU_TEST_P(CvtColor, BayerBG2Gray)
{
    if ((depth != CV_8U && depth != CV_16U) || useRoi)
        return;

    cv::Mat src = randomMat(size, depth);

    cv::gpu::GpuMat dst;
    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerBG2GRAY);

    cv::Mat dst_gold;
    cv::cvtColor(src, dst_gold, cv::COLOR_BayerBG2GRAY);

    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}

GPU_TEST_P(CvtColor, BayerGB2Gray)
{
    if ((depth != CV_8U && depth != CV_16U) || useRoi)
        return;

    cv::Mat src = randomMat(size, depth);

    cv::gpu::GpuMat dst;
    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGB2GRAY);

    cv::Mat dst_gold;
    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGB2GRAY);

    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}

GPU_TEST_P(CvtColor, BayerRG2Gray)
{
    if ((depth != CV_8U && depth != CV_16U) || useRoi)
        return;

    cv::Mat src = randomMat(size, depth);

    cv::gpu::GpuMat dst;
    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerRG2GRAY);

    cv::Mat dst_gold;
    cv::cvtColor(src, dst_gold, cv::COLOR_BayerRG2GRAY);

    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}

GPU_TEST_P(CvtColor, BayerGR2Gray)
{
    if ((depth != CV_8U && depth != CV_16U) || useRoi)
        return;

    cv::Mat src = randomMat(size, depth);

    cv::gpu::GpuMat dst;
    cv::gpu::cvtColor(loadMat(src, useRoi), dst, cv::COLOR_BayerGR2GRAY);

    cv::Mat dst_gold;
    cv::cvtColor(src, dst_gold, cv::COLOR_BayerGR2GRAY);

    EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}

INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
    WHOLE_SUBMAT));

///////////////////////////////////////////////////////////////////////////////////////////////////////
// Demosaicing

struct Demosaicing : testing::TestWithParam<cv::gpu::DeviceInfo>
{
    cv::gpu::DeviceInfo devInfo;

    virtual void SetUp()
    {
        devInfo = GetParam();

        cv::gpu::setDevice(devInfo.deviceID());
    }

    static void mosaic(const cv::Mat_<cv::Vec3b>& src, cv::Mat_<uchar>& dst, cv::Point firstRed)
    {
        dst.create(src.size());

        for (int y = 0; y < src.rows; ++y)
        {
            for (int x = 0; x < src.cols; ++x)
            {
                cv::Vec3b pix = src(y, x);

                cv::Point alternate;
                alternate.x = (x + firstRed.x) % 2;
                alternate.y = (y + firstRed.y) % 2;

                if (alternate.y == 0)
                {
                    if (alternate.x == 0)
                    {
                        // RG
                        // GB
                        dst(y, x) = pix[2];
                    }
                    else
                    {
                        // GR
                        // BG
                        dst(y, x) = pix[1];
                    }
                }
                else
                {
                    if (alternate.x == 0)
                    {
                        // GB
                        // RG
                        dst(y, x) = pix[1];
                    }
                    else
                    {
                        // BG
                        // GR
                        dst(y, x) = pix[0];
                    }
                }
            }
        }
    }
};

GPU_TEST_P(Demosaicing, BayerBG2BGR)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 1));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerBG2BGR);

    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}

GPU_TEST_P(Demosaicing, BayerGB2BGR)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 1));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGB2BGR);

    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}

GPU_TEST_P(Demosaicing, BayerRG2BGR)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 0));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerRG2BGR);

    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}

GPU_TEST_P(Demosaicing, BayerGR2BGR)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 0));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::COLOR_BayerGR2BGR);

    EXPECT_MAT_SIMILAR(img, dst, 2e-2);
}

GPU_TEST_P(Demosaicing, BayerBG2BGR_MHT)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 1));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerBG2BGR_MHT);

    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}

GPU_TEST_P(Demosaicing, BayerGB2BGR_MHT)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 1));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGB2BGR_MHT);

    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}

GPU_TEST_P(Demosaicing, BayerRG2BGR_MHT)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 0));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerRG2BGR_MHT);

    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}

GPU_TEST_P(Demosaicing, BayerGR2BGR_MHT)
{
    cv::Mat img = readImage("stereobm/aloe-L.png");

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 0));

    cv::gpu::GpuMat dst;
    cv::gpu::demosaicing(loadMat(src), dst, cv::gpu::COLOR_BayerGR2BGR_MHT);

    EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}

INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Demosaicing, ALL_DEVICES);

///////////////////////////////////////////////////////////////////////////////////////////////////////
// swapChannels

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

namespace
{
    IMPLEMENT_PARAM_CLASS(Border, int)

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

////////////////////////////////////////////////////////////////////////////////
// Merge

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

////////////////////////////////////////////////////////
// BilateralFilter

@@ -43,306 +43,7 @@

#ifdef HAVE_CUDA

namespace
{
    bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
    {
        const double maxPtDif = 1.0;
        const double maxSizeDif = 1.0;
        const double maxAngleDif = 2.0;
        const double maxResponseDif = 0.1;

        double dist = cv::norm(p1.pt - p2.pt);

        if (dist < maxPtDif &&
            fabs(p1.size - p2.size) < maxSizeDif &&
            abs(p1.angle - p2.angle) < maxAngleDif &&
            abs(p1.response - p2.response) < maxResponseDif &&
            p1.octave == p2.octave &&
            p1.class_id == p2.class_id)
        {
            return true;
        }

        return false;
    }

    struct KeyPointLess : std::binary_function<cv::KeyPoint, cv::KeyPoint, bool>
    {
        bool operator()(const cv::KeyPoint& kp1, const cv::KeyPoint& kp2) const
        {
            return kp1.pt.y < kp2.pt.y || (kp1.pt.y == kp2.pt.y && kp1.pt.x < kp2.pt.x);
        }
    };

    testing::AssertionResult assertKeyPointsEquals(const char* gold_expr, const char* actual_expr, std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
    {
        if (gold.size() != actual.size())
        {
            return testing::AssertionFailure() << "KeyPoints size mismatch\n"
                                               << "\"" << gold_expr << "\" : " << gold.size() << "\n"
                                               << "\"" << actual_expr << "\" : " << actual.size();
        }

        std::sort(actual.begin(), actual.end(), KeyPointLess());
        std::sort(gold.begin(), gold.end(), KeyPointLess());

        for (size_t i = 0; i < gold.size(); ++i)
        {
            const cv::KeyPoint& p1 = gold[i];
            const cv::KeyPoint& p2 = actual[i];

            if (!keyPointsEquals(p1, p2))
            {
                return testing::AssertionFailure() << "KeyPoints differ at " << i << "\n"
                                                   << "\"" << gold_expr << "\" vs \"" << actual_expr << "\" : \n"
                                                   << "pt : " << testing::PrintToString(p1.pt) << " vs " << testing::PrintToString(p2.pt) << "\n"
                                                   << "size : " << p1.size << " vs " << p2.size << "\n"
                                                   << "angle : " << p1.angle << " vs " << p2.angle << "\n"
                                                   << "response : " << p1.response << " vs " << p2.response << "\n"
                                                   << "octave : " << p1.octave << " vs " << p2.octave << "\n"
                                                   << "class_id : " << p1.class_id << " vs " << p2.class_id;
            }
        }

        return ::testing::AssertionSuccess();
    }

    #define ASSERT_KEYPOINTS_EQ(gold, actual) EXPECT_PRED_FORMAT2(assertKeyPointsEquals, gold, actual);

    int getMatchedPointsCount(std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
    {
        std::sort(actual.begin(), actual.end(), KeyPointLess());
        std::sort(gold.begin(), gold.end(), KeyPointLess());

        int validCount = 0;

        for (size_t i = 0; i < gold.size(); ++i)
        {
            const cv::KeyPoint& p1 = gold[i];
            const cv::KeyPoint& p2 = actual[i];

            if (keyPointsEquals(p1, p2))
                ++validCount;
        }

        return validCount;
    }

    int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, const std::vector<cv::KeyPoint>& keypoints2, const std::vector<cv::DMatch>& matches)
    {
        int validCount = 0;

        for (size_t i = 0; i < matches.size(); ++i)
        {
            const cv::DMatch& m = matches[i];

            const cv::KeyPoint& p1 = keypoints1[m.queryIdx];
            const cv::KeyPoint& p2 = keypoints2[m.trainIdx];

            if (keyPointsEquals(p1, p2))
                ++validCount;
        }

        return validCount;
    }
}

/////////////////////////////////////////////////////////////////////////////////////////////////
// SURF

namespace
{
    IMPLEMENT_PARAM_CLASS(SURF_HessianThreshold, double)
    IMPLEMENT_PARAM_CLASS(SURF_Octaves, int)
    IMPLEMENT_PARAM_CLASS(SURF_OctaveLayers, int)
    IMPLEMENT_PARAM_CLASS(SURF_Extended, bool)
    IMPLEMENT_PARAM_CLASS(SURF_Upright, bool)
}

PARAM_TEST_CASE(SURF, cv::gpu::DeviceInfo, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright)
{
    cv::gpu::DeviceInfo devInfo;
    double hessianThreshold;
    int nOctaves;
    int nOctaveLayers;
    bool extended;
    bool upright;

    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        hessianThreshold = GET_PARAM(1);
        nOctaves = GET_PARAM(2);
        nOctaveLayers = GET_PARAM(3);
        extended = GET_PARAM(4);
        upright = GET_PARAM(5);

        cv::gpu::setDevice(devInfo.deviceID());
    }
};

GPU_TEST_P(SURF, Detector)
{
    cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(image.empty());

    cv::gpu::SURF_GPU surf;
    surf.hessianThreshold = hessianThreshold;
    surf.nOctaves = nOctaves;
    surf.nOctaveLayers = nOctaveLayers;
    surf.extended = extended;
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
    {
        try
        {
            std::vector<cv::KeyPoint> keypoints;
            surf(loadMat(image), cv::gpu::GpuMat(), keypoints);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
    else
    {
        std::vector<cv::KeyPoint> keypoints;
        surf(loadMat(image), cv::gpu::GpuMat(), keypoints);

        cv::SURF surf_gold;
        surf_gold.hessianThreshold = hessianThreshold;
        surf_gold.nOctaves = nOctaves;
        surf_gold.nOctaveLayers = nOctaveLayers;
        surf_gold.extended = extended;
        surf_gold.upright = upright;

        std::vector<cv::KeyPoint> keypoints_gold;
        surf_gold(image, cv::noArray(), keypoints_gold);

        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();

        EXPECT_GT(matchedRatio, 0.95);
    }
}

GPU_TEST_P(SURF, Detector_Masked)
{
    cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(image.empty());

    cv::Mat mask(image.size(), CV_8UC1, cv::Scalar::all(1));
    mask(cv::Range(0, image.rows / 2), cv::Range(0, image.cols / 2)).setTo(cv::Scalar::all(0));

    cv::gpu::SURF_GPU surf;
    surf.hessianThreshold = hessianThreshold;
    surf.nOctaves = nOctaves;
    surf.nOctaveLayers = nOctaveLayers;
    surf.extended = extended;
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
    {
        try
        {
            std::vector<cv::KeyPoint> keypoints;
            surf(loadMat(image), loadMat(mask), keypoints);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
    else
    {
        std::vector<cv::KeyPoint> keypoints;
        surf(loadMat(image), loadMat(mask), keypoints);

        cv::SURF surf_gold;
        surf_gold.hessianThreshold = hessianThreshold;
        surf_gold.nOctaves = nOctaves;
        surf_gold.nOctaveLayers = nOctaveLayers;
        surf_gold.extended = extended;
        surf_gold.upright = upright;

        std::vector<cv::KeyPoint> keypoints_gold;
        surf_gold(image, mask, keypoints_gold);

        ASSERT_EQ(keypoints_gold.size(), keypoints.size());
        int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
        double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();

        EXPECT_GT(matchedRatio, 0.95);
    }
}

GPU_TEST_P(SURF, Descriptor)
{
    cv::Mat image = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(image.empty());

    cv::gpu::SURF_GPU surf;
    surf.hessianThreshold = hessianThreshold;
    surf.nOctaves = nOctaves;
    surf.nOctaveLayers = nOctaveLayers;
    surf.extended = extended;
    surf.upright = upright;
    surf.keypointsRatio = 0.05f;

    cv::SURF surf_gold;
    surf_gold.hessianThreshold = hessianThreshold;
    surf_gold.nOctaves = nOctaves;
    surf_gold.nOctaveLayers = nOctaveLayers;
    surf_gold.extended = extended;
    surf_gold.upright = upright;

    if (!supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS))
    {
        try
        {
            std::vector<cv::KeyPoint> keypoints;
            cv::gpu::GpuMat descriptors;
            surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors);
        }
        catch (const cv::Exception& e)
        {
            ASSERT_EQ(CV_StsNotImplemented, e.code);
        }
    }
    else
    {
        std::vector<cv::KeyPoint> keypoints;
        surf_gold(image, cv::noArray(), keypoints);

        cv::gpu::GpuMat descriptors;
        surf(loadMat(image), cv::gpu::GpuMat(), keypoints, descriptors, true);

        cv::Mat descriptors_gold;
        surf_gold(image, cv::noArray(), keypoints, descriptors_gold, true);

        cv::BFMatcher matcher(cv::NORM_L2);
        std::vector<cv::DMatch> matches;
        matcher.match(descriptors_gold, cv::Mat(descriptors), matches);

        int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches);
        double matchedRatio = static_cast<double>(matchedCount) / keypoints.size();

        EXPECT_GT(matchedRatio, 0.6);
    }
}

INSTANTIATE_TEST_CASE_P(GPU_Features2D, SURF, testing::Combine(
    ALL_DEVICES,
    testing::Values(SURF_HessianThreshold(100.0), SURF_HessianThreshold(500.0), SURF_HessianThreshold(1000.0)),
    testing::Values(SURF_Octaves(3), SURF_Octaves(4)),
    testing::Values(SURF_OctaveLayers(2), SURF_OctaveLayers(3)),
    testing::Values(SURF_Extended(false), SURF_Extended(true)),
    testing::Values(SURF_Upright(false), SURF_Upright(true))));
using namespace cvtest;

/////////////////////////////////////////////////////////////////////////////////////////////////
// FAST

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

namespace
{
    IMPLEMENT_PARAM_CLASS(KSize, cv::Size)

@@ -44,6 +44,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

////////////////////////////////////////////////////////////////////////////////
// SetTo

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

///////////////////////////////////////////////////////////////////////////////////////////////////////
// HoughLines

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

///////////////////////////////////////////////////////////////////////////////////////////////////////
// Integral

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

//#define DUMP

struct HOG : testing::TestWithParam<cv::gpu::DeviceInfo>, cv::gpu::HOGDescriptor

@@ -43,6 +43,8 @@

#if defined(HAVE_CUDA) && defined(HAVE_OPENGL)

using namespace cvtest;

/////////////////////////////////////////////
// Buffer

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

//////////////////////////////////////////////////////
// BroxOpticalFlow

@@ -76,11 +76,10 @@
#include "opencv2/imgproc.hpp"
#include "opencv2/video.hpp"
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_test.hpp"
#include "opencv2/gpu.hpp"
#include "opencv2/nonfree.hpp"
#include "opencv2/legacy.hpp"

#include "utility.hpp"
#include "interpolation.hpp"
#include "main_test_nvidia.h"
#endif

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

////////////////////////////////////////////////////////
// pyrDown

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

///////////////////////////////////////////////////////////////////
// Gold implementation

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

///////////////////////////////////////////////////////////////////
// Gold implementation

@@ -44,6 +44,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

#if CUDA_VERSION >= 5000

struct Async : testing::TestWithParam<cv::gpu::DeviceInfo>

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

namespace
{
    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)

@@ -43,6 +43,8 @@

#ifdef HAVE_CUDA

using namespace cvtest;

namespace
{
    cv::Mat createTransfomMatrix(cv::Size srcSize, double angle)

@@ -1,407 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        Intel License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "test_precomp.hpp"

#ifdef HAVE_CUDA

using namespace std;
using namespace cv;
using namespace cv::gpu;
using namespace cvtest;
using namespace testing;
using namespace testing::internal;

//////////////////////////////////////////////////////////////////////
// random generators

int randomInt(int minVal, int maxVal)
{
    RNG& rng = TS::ptr()->get_rng();
    return rng.uniform(minVal, maxVal);
}

double randomDouble(double minVal, double maxVal)
{
    RNG& rng = TS::ptr()->get_rng();
    return rng.uniform(minVal, maxVal);
}

Size randomSize(int minVal, int maxVal)
{
    return Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
}

Scalar randomScalar(double minVal, double maxVal)
{
    return Scalar(randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal));
}

Mat randomMat(Size size, int type, double minVal, double maxVal)
{
    return randomMat(TS::ptr()->get_rng(), size, type, minVal, maxVal, false);
}
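
// Usage sketch (annotation, not part of the original file): the helpers above
// draw from the shared cvtest RNG, so every test gets reproducible inputs.
// A hypothetical accuracy test would build its data like this:
//
//     cv::Size size = randomSize(100, 200);    // both sides in [100, 200)
//     cv::Mat src = randomMat(size, CV_8UC3);  // header defaults: [0, 255]
//     cv::Scalar val = randomScalar(0.0, 255.0);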

//////////////////////////////////////////////////////////////////////
// GpuMat create

GpuMat createMat(Size size, int type, bool useRoi)
{
    Size size0 = size;

    if (useRoi)
    {
        size0.width += randomInt(5, 15);
        size0.height += randomInt(5, 15);
    }

    GpuMat d_m(size0, type);

    if (size0 != size)
        d_m = d_m(Rect((size0.width - size.width) / 2, (size0.height - size.height) / 2, size.width, size.height));

    return d_m;
}

GpuMat loadMat(const Mat& m, bool useRoi)
{
    GpuMat d_m = createMat(m.size(), m.type(), useRoi);
    d_m.upload(m);
    return d_m;
}
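
// Design note (annotation, not part of the original file): with useRoi=true,
// createMat over-allocates by 5-14 pixels per side and returns a centered
// sub-view, so d_m.step exceeds the row width. Tests that run on such views
// catch kernels that wrongly assume continuous device memory; loadMat(m, true)
// uploads m into exactly that kind of padded view.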

//////////////////////////////////////////////////////////////////////
// Image load

Mat readImage(const std::string& fileName, int flags)
{
    return imread(TS::ptr()->get_data_path() + fileName, flags);
}

Mat readImageType(const std::string& fname, int type)
{
    Mat src = readImage(fname, CV_MAT_CN(type) == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
    if (CV_MAT_CN(type) == 4)
    {
        Mat temp;
        cvtColor(src, temp, COLOR_BGR2BGRA);
        swap(src, temp);
    }
    src.convertTo(src, CV_MAT_DEPTH(type), CV_MAT_DEPTH(type) == CV_32F ? 1.0 / 255.0 : 1.0);
    return src;
}
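
// Annotation (not part of the original file): readImageType folds three common
// conversions into one call - grayscale vs. color read based on the requested
// channel count, BGR->BGRA expansion for 4-channel types, and depth conversion
// with [0,1] scaling when CV_32F is requested. For example, a hypothetical
// readImageType("img.png", CV_32FC1) yields a single-channel float image in [0,1].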

//////////////////////////////////////////////////////////////////////
// Gpu devices

bool supportFeature(const DeviceInfo& info, FeatureSet feature)
{
    return TargetArchs::builtWith(feature) && info.supports(feature);
}
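
// Annotation (not part of the original file): both conditions must hold - the
// binary has to contain code for an architecture with the feature
// (TargetArchs::builtWith) AND the runtime device has to report it
// (info.supports) - hence the conjunction rather than either check alone.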

DeviceManager& DeviceManager::instance()
{
    static DeviceManager obj;
    return obj;
}

void DeviceManager::load(int i)
{
    devices_.clear();
    devices_.reserve(1);

    std::ostringstream msg;

    if (i < 0 || i >= getCudaEnabledDeviceCount())
    {
        msg << "Incorrect device number - " << i;
        throw runtime_error(msg.str());
    }

    DeviceInfo info(i);

    if (!info.isCompatible())
    {
        msg << "Device " << i << " [" << info.name() << "] is NOT compatible with current GPU module build";
        throw runtime_error(msg.str());
    }

    devices_.push_back(info);
}

void DeviceManager::loadAll()
{
    int deviceCount = getCudaEnabledDeviceCount();

    devices_.clear();
    devices_.reserve(deviceCount);

    for (int i = 0; i < deviceCount; ++i)
    {
        DeviceInfo info(i);
        if (info.isCompatible())
        {
            devices_.push_back(info);
        }
    }
}
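
// Usage sketch (hypothetical runner code, not from the diff): the test main
// either pins a single device or enumerates all compatible ones before the
// ALL_DEVICES parameter generator is evaluated:
//
//     int device = -1; // would be parsed from the command line
//     if (device < 0)
//         DeviceManager::instance().loadAll();
//     else
//         DeviceManager::instance().load(device); // throws on a bad index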

//////////////////////////////////////////////////////////////////////
// Additional assertion

namespace
{
    template <typename T, typename OutT> std::string printMatValImpl(const Mat& m, Point p)
    {
        const int cn = m.channels();

        std::ostringstream ostr;
        ostr << "(";

        p.x /= cn;

        ostr << static_cast<OutT>(m.at<T>(p.y, p.x * cn));
        for (int c = 1; c < m.channels(); ++c)
        {
            ostr << ", " << static_cast<OutT>(m.at<T>(p.y, p.x * cn + c));
        }
        ostr << ")";

        return ostr.str();
    }

    std::string printMatVal(const Mat& m, Point p)
    {
        typedef std::string (*func_t)(const Mat& m, Point p);

        static const func_t funcs[] =
        {
            printMatValImpl<uchar, int>, printMatValImpl<schar, int>, printMatValImpl<ushort, int>, printMatValImpl<short, int>,
            printMatValImpl<int, int>, printMatValImpl<float, float>, printMatValImpl<double, double>
        };

        return funcs[m.depth()](m, p);
    }
}

void minMaxLocGold(const Mat& src, double* minVal_, double* maxVal_, Point* minLoc_, Point* maxLoc_, const Mat& mask)
{
    if (src.depth() != CV_8S)
    {
        minMaxLoc(src, minVal_, maxVal_, minLoc_, maxLoc_, mask);
        return;
    }

    // OpenCV's minMaxLoc doesn't support CV_8S type
    double minVal = numeric_limits<double>::max();
    Point minLoc(-1, -1);

    double maxVal = -numeric_limits<double>::max();
    Point maxLoc(-1, -1);

    for (int y = 0; y < src.rows; ++y)
    {
        const schar* src_row = src.ptr<schar>(y);
        const uchar* mask_row = mask.empty() ? 0 : mask.ptr<uchar>(y);

        for (int x = 0; x < src.cols; ++x)
        {
            if (!mask_row || mask_row[x])
            {
                schar val = src_row[x];

                if (val < minVal)
                {
                    minVal = val;
                    minLoc = cv::Point(x, y);
                }

                if (val > maxVal)
                {
                    maxVal = val;
                    maxLoc = cv::Point(x, y);
                }
            }
        }
    }

    if (minVal_) *minVal_ = minVal;
    if (maxVal_) *maxVal_ = maxVal;

    if (minLoc_) *minLoc_ = minLoc;
    if (maxLoc_) *maxLoc_ = maxLoc;
}
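
// Annotation (not part of the original file): printMatVal indexes funcs[] by
// Mat::depth(), which ranges over CV_8U..CV_64F (0..6) and thus matches the
// seven entries; integer types are widened to int so chars print as numbers.
// minMaxLocGold exists because cv::minMaxLoc rejects CV_8S input, so the CV_8S
// case is scanned by hand with the same mask semantics.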

Mat getMat(InputArray arr)
{
    if (arr.kind() == _InputArray::GPU_MAT)
    {
        Mat m;
        arr.getGpuMat().download(m);
        return m;
    }

    return arr.getMat();
}

AssertionResult assertMatNear(const char* expr1, const char* expr2, const char* eps_expr, InputArray m1_, InputArray m2_, double eps)
{
    Mat m1 = getMat(m1_);
    Mat m2 = getMat(m2_);

    if (m1.size() != m2.size())
    {
        return AssertionFailure() << "Matrices \"" << expr1 << "\" and \"" << expr2 << "\" have different sizes : \""
                                  << expr1 << "\" [" << PrintToString(m1.size()) << "] vs \""
                                  << expr2 << "\" [" << PrintToString(m2.size()) << "]";
    }

    if (m1.type() != m2.type())
    {
        return AssertionFailure() << "Matrices \"" << expr1 << "\" and \"" << expr2 << "\" have different types : \""
                                  << expr1 << "\" [" << PrintToString(MatType(m1.type())) << "] vs \""
                                  << expr2 << "\" [" << PrintToString(MatType(m2.type())) << "]";
    }

    Mat diff;
    absdiff(m1.reshape(1), m2.reshape(1), diff);

    double maxVal = 0.0;
    Point maxLoc;
    minMaxLocGold(diff, 0, &maxVal, 0, &maxLoc);

    if (maxVal > eps)
    {
        return AssertionFailure() << "The max difference between matrices \"" << expr1 << "\" and \"" << expr2
                                  << "\" is " << maxVal << " at (" << maxLoc.y << ", " << maxLoc.x / m1.channels() << ")"
                                  << ", which exceeds \"" << eps_expr << "\", where \""
                                  << expr1 << "\" at (" << maxLoc.y << ", " << maxLoc.x / m1.channels() << ") evaluates to " << printMatVal(m1, maxLoc) << ", \""
                                  << expr2 << "\" at (" << maxLoc.y << ", " << maxLoc.x / m1.channels() << ") evaluates to " << printMatVal(m2, maxLoc) << ", \""
                                  << eps_expr << "\" evaluates to " << eps;
    }

    return AssertionSuccess();
}
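
// Annotation (not part of the original file): the comparison reshapes both
// matrices to a single channel, so maxLoc.x indexes interleaved channel
// columns; the message divides by m1.channels() to report the pixel column,
// and printMatVal re-expands the location to print the full pixel tuple.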

double checkSimilarity(InputArray m1, InputArray m2)
{
    Mat diff;
    matchTemplate(getMat(m1), getMat(m2), diff, CV_TM_CCORR_NORMED);
    return std::abs(diff.at<float>(0, 0) - 1.f);
}
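
// Annotation (not part of the original file): matching two equal-size images
// with CV_TM_CCORR_NORMED yields a single 1x1 response - their normalized
// cross-correlation - so the return value is 0 for identical inputs and grows
// with dissimilarity.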

//////////////////////////////////////////////////////////////////////
// Helper structs for value-parameterized tests

vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
{
    vector<MatType> v;

    v.reserve((depth_end - depth_start + 1) * (cn_end - cn_start + 1));

    for (int depth = depth_start; depth <= depth_end; ++depth)
    {
        for (int cn = cn_start; cn <= cn_end; ++cn)
        {
            v.push_back(MatType(CV_MAKE_TYPE(depth, cn)));
        }
    }

    return v;
}

const vector<MatType>& all_types()
{
    static vector<MatType> v = types(CV_8U, CV_64F, 1, 4);

    return v;
}
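
// Usage sketch (hypothetical values, not from the diff):
//
//     types(CV_8U, CV_32F, 1, 3)  // 6 depths x 3 channel counts = 18 types
//     all_types()                 // 7 depths x 4 channel counts = 28 types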

void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
{
    (*os) << info.name();
}

void PrintTo(const UseRoi& useRoi, std::ostream* os)
{
    if (useRoi)
        (*os) << "sub matrix";
    else
        (*os) << "whole matrix";
}

void PrintTo(const Inverse& inverse, std::ostream* os)
{
    if (inverse)
        (*os) << "inverse";
    else
        (*os) << "direct";
}

//////////////////////////////////////////////////////////////////////
// Other

void dumpImage(const std::string& fileName, const Mat& image)
{
    imwrite(TS::ptr()->get_data_path() + fileName, image);
}

void showDiff(InputArray gold_, InputArray actual_, double eps)
{
    Mat gold = getMat(gold_);
    Mat actual = getMat(actual_);

    Mat diff;
    absdiff(gold, actual, diff);
    threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);

    namedWindow("gold", WINDOW_NORMAL);
    namedWindow("actual", WINDOW_NORMAL);
    namedWindow("diff", WINDOW_NORMAL);

    imshow("gold", gold);
    imshow("actual", actual);
    imshow("diff", diff);

    waitKey();
}
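
// Annotation (not part of the original file): showDiff is a manual debugging
// aid - pixels whose absolute difference exceeds eps are thresholded to white
// in the "diff" window, so regressions show up as bright regions next to the
// "gold" and "actual" views.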

#endif // HAVE_CUDA

@@ -1,331 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        Intel License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_TEST_UTILITY_HPP__
#define __OPENCV_GPU_TEST_UTILITY_HPP__

#include "opencv2/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/ts.hpp"

//////////////////////////////////////////////////////////////////////
// random generators

int randomInt(int minVal, int maxVal);
double randomDouble(double minVal, double maxVal);
cv::Size randomSize(int minVal, int maxVal);
cv::Scalar randomScalar(double minVal, double maxVal);
cv::Mat randomMat(cv::Size size, int type, double minVal = 0.0, double maxVal = 255.0);

//////////////////////////////////////////////////////////////////////
// GpuMat create

cv::gpu::GpuMat createMat(cv::Size size, int type, bool useRoi = false);
cv::gpu::GpuMat loadMat(const cv::Mat& m, bool useRoi = false);

//////////////////////////////////////////////////////////////////////
// Image load

//! read image from testdata folder
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);

//! read image from testdata folder and convert it to specified type
cv::Mat readImageType(const std::string& fname, int type);

//////////////////////////////////////////////////////////////////////
// Gpu devices

//! return true if the device supports the specified feature and the gpu module was built with support for it.
bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);

class DeviceManager
{
public:
    static DeviceManager& instance();

    void load(int i);
    void loadAll();

    const std::vector<cv::gpu::DeviceInfo>& values() const { return devices_; }

private:
    std::vector<cv::gpu::DeviceInfo> devices_;
};

#define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values())
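
// Usage sketch (hypothetical instantiation, not from the diff): ALL_DEVICES
// feeds every loaded device into a value-parameterized suite, e.g.
//
//     INSTANTIATE_TEST_CASE_P(GpuArithm, Add, testing::Combine(
//         ALL_DEVICES, DIFFERENT_SIZES, ALL_TYPES, WHOLE_SUBMAT));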

//////////////////////////////////////////////////////////////////////
// Additional assertion

void minMaxLocGold(const cv::Mat& src, double* minVal_, double* maxVal_ = 0, cv::Point* minLoc_ = 0, cv::Point* maxLoc_ = 0, const cv::Mat& mask = cv::Mat());

cv::Mat getMat(cv::InputArray arr);

testing::AssertionResult assertMatNear(const char* expr1, const char* expr2, const char* eps_expr, cv::InputArray m1, cv::InputArray m2, double eps);

#define EXPECT_MAT_NEAR(m1, m2, eps) EXPECT_PRED_FORMAT3(assertMatNear, m1, m2, eps)
#define ASSERT_MAT_NEAR(m1, m2, eps) ASSERT_PRED_FORMAT3(assertMatNear, m1, m2, eps)
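
// Usage sketch (hypothetical test body, not from the diff): either side may be
// a cv::Mat or a cv::gpu::GpuMat, since assertMatNear downloads via getMat():
//
//     cv::gpu::GpuMat dst;
//     cv::gpu::add(loadMat(src1), loadMat(src2), dst);
//     EXPECT_MAT_NEAR(cpu_gold, dst, 0.0);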

#define EXPECT_SCALAR_NEAR(s1, s2, eps) \
    { \
        EXPECT_NEAR(s1[0], s2[0], eps); \
        EXPECT_NEAR(s1[1], s2[1], eps); \
        EXPECT_NEAR(s1[2], s2[2], eps); \
        EXPECT_NEAR(s1[3], s2[3], eps); \
    }
#define ASSERT_SCALAR_NEAR(s1, s2, eps) \
    { \
        ASSERT_NEAR(s1[0], s2[0], eps); \
        ASSERT_NEAR(s1[1], s2[1], eps); \
        ASSERT_NEAR(s1[2], s2[2], eps); \
        ASSERT_NEAR(s1[3], s2[3], eps); \
    }

#define EXPECT_POINT2_NEAR(p1, p2, eps) \
    { \
        EXPECT_NEAR(p1.x, p2.x, eps); \
        EXPECT_NEAR(p1.y, p2.y, eps); \
    }
#define ASSERT_POINT2_NEAR(p1, p2, eps) \
    { \
        ASSERT_NEAR(p1.x, p2.x, eps); \
        ASSERT_NEAR(p1.y, p2.y, eps); \
    }

#define EXPECT_POINT3_NEAR(p1, p2, eps) \
    { \
        EXPECT_NEAR(p1.x, p2.x, eps); \
        EXPECT_NEAR(p1.y, p2.y, eps); \
        EXPECT_NEAR(p1.z, p2.z, eps); \
    }
#define ASSERT_POINT3_NEAR(p1, p2, eps) \
    { \
        ASSERT_NEAR(p1.x, p2.x, eps); \
        ASSERT_NEAR(p1.y, p2.y, eps); \
        ASSERT_NEAR(p1.z, p2.z, eps); \
    }

double checkSimilarity(cv::InputArray m1, cv::InputArray m2);

#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
    { \
        ASSERT_EQ(mat1.type(), mat2.type()); \
        ASSERT_EQ(mat1.size(), mat2.size()); \
        EXPECT_LE(checkSimilarity(mat1, mat2), eps); \
    }
#define ASSERT_MAT_SIMILAR(mat1, mat2, eps) \
    { \
        ASSERT_EQ(mat1.type(), mat2.type()); \
        ASSERT_EQ(mat1.size(), mat2.size()); \
        ASSERT_LE(checkSimilarity(mat1, mat2), eps); \
    }
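
// Annotation (not part of the original file): *_MAT_NEAR bounds the worst
// single-pixel difference, while *_MAT_SIMILAR bounds a global correlation
// score - the latter tolerates many tiny deviations (e.g. different rounding
// in a resize kernel) that would trip a strict per-pixel epsilon.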

//////////////////////////////////////////////////////////////////////
// Helper structs for value-parameterized tests

#define GPU_TEST_P(test_case_name, test_name) \
  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
      : public test_case_name { \
   public: \
    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
    virtual void TestBody(); \
   private: \
    void UnsafeTestBody(); \
    static int AddToRegistry() { \
      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
          GetTestCasePatternHolder<test_case_name>(\
              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
                  #test_case_name, \
                  #test_name, \
                  new ::testing::internal::TestMetaFactory< \
                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
      return 0; \
    } \
    static int gtest_registering_dummy_; \
    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
  }; \
  int GTEST_TEST_CLASS_NAME_(test_case_name, \
                             test_name)::gtest_registering_dummy_ = \
      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() \
  { \
    try \
    { \
      UnsafeTestBody(); \
    } \
    catch (...) \
    { \
      cv::gpu::resetDevice(); \
      throw; \
    } \
  } \
  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::UnsafeTestBody()

#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
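
// Usage sketch (hypothetical fixture; names are illustrative, not from the
// diff). GPU_TEST_P differs from gtest's TEST_P only in the try/catch wrapper:
// on any failure it calls cv::gpu::resetDevice() so later tests start from a
// clean CUDA context.
//
//     PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, UseRoi)
//     {
//         cv::gpu::DeviceInfo devInfo;
//         cv::Size size;
//         int type;
//         bool useRoi;
//
//         virtual void SetUp()
//         {
//             devInfo = GET_PARAM(0);
//             size    = GET_PARAM(1);
//             type    = GET_PARAM(2);
//             useRoi  = GET_PARAM(3);
//             cv::gpu::setDevice(devInfo.deviceID());
//         }
//     };
//
//     GPU_TEST_P(Threshold, Accuracy)
//     {
//         cv::Mat src = randomMat(size, type);
//         // ... run the GPU op and compare with EXPECT_MAT_NEAR ...
//     }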

namespace cv { namespace gpu
{
    void PrintTo(const DeviceInfo& info, std::ostream* os);
}}

#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113))
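
// Annotation (assumption about intent): 128x128 is a well-aligned size, while
// 113x113 is odd and prime, so the pair presumably exercises both the fast
// aligned path and the ragged-edge path of a kernel.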

// Depth

using perf::MatDepth;

#define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_8S), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32S), MatDepth(CV_32F), MatDepth(CV_64F))

#define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)),   \
                                    std::make_pair(MatDepth(CV_8U), MatDepth(CV_16U)),  \
                                    std::make_pair(MatDepth(CV_8U), MatDepth(CV_16S)),  \
                                    std::make_pair(MatDepth(CV_8U), MatDepth(CV_32S)),  \
                                    std::make_pair(MatDepth(CV_8U), MatDepth(CV_32F)),  \
                                    std::make_pair(MatDepth(CV_8U), MatDepth(CV_64F)),  \
                                                                                        \
                                    std::make_pair(MatDepth(CV_16U), MatDepth(CV_16U)), \
                                    std::make_pair(MatDepth(CV_16U), MatDepth(CV_32S)), \
                                    std::make_pair(MatDepth(CV_16U), MatDepth(CV_32F)), \
                                    std::make_pair(MatDepth(CV_16U), MatDepth(CV_64F)), \
                                                                                        \
                                    std::make_pair(MatDepth(CV_16S), MatDepth(CV_16S)), \
                                    std::make_pair(MatDepth(CV_16S), MatDepth(CV_32S)), \
                                    std::make_pair(MatDepth(CV_16S), MatDepth(CV_32F)), \
                                    std::make_pair(MatDepth(CV_16S), MatDepth(CV_64F)), \
                                                                                        \
                                    std::make_pair(MatDepth(CV_32S), MatDepth(CV_32S)), \
                                    std::make_pair(MatDepth(CV_32S), MatDepth(CV_32F)), \
                                    std::make_pair(MatDepth(CV_32S), MatDepth(CV_64F)), \
                                                                                        \
                                    std::make_pair(MatDepth(CV_32F), MatDepth(CV_32F)), \
                                    std::make_pair(MatDepth(CV_32F), MatDepth(CV_64F)), \
                                                                                        \
                                    std::make_pair(MatDepth(CV_64F), MatDepth(CV_64F)))
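
// Annotation (not part of the original file): DEPTH_PAIRS enumerates only
// widening (src, dst) combinations - the destination depth is never narrower
// than the source - which is why e.g. (CV_32F, CV_8U) is deliberately absent.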

// Type

using perf::MatType;

//! return vector with types from specified range.
std::vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end);

//! return vector with all types (depth: CV_8U-CV_64F, channels: 1-4).
const std::vector<MatType>& all_types();

#define ALL_TYPES testing::ValuesIn(all_types())
#define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))

// ROI

class UseRoi
{
public:
    inline UseRoi(bool val = false) : val_(val) {}

    inline operator bool() const { return val_; }

private:
    bool val_;
};

void PrintTo(const UseRoi& useRoi, std::ostream* os);

#define WHOLE_SUBMAT testing::Values(UseRoi(false), UseRoi(true))

// Direct/Inverse

class Inverse
{
public:
    inline Inverse(bool val = false) : val_(val) {}

    inline operator bool() const { return val_; }

private:
    bool val_;
};

void PrintTo(const Inverse& inverse, std::ostream* os);

#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))

// Param class

#define IMPLEMENT_PARAM_CLASS(name, type) \
    class name \
    { \
    public: \
        name ( type arg = type ()) : val_(arg) {} \
        operator type () const {return val_;} \
    private: \
        type val_; \
    }; \
    inline void PrintTo( name param, std::ostream* os) \
    { \
        *os << #name << "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
    }

IMPLEMENT_PARAM_CLASS(Channels, int)

#define ALL_CHANNELS testing::Values(Channels(1), Channels(2), Channels(3), Channels(4))
#define IMAGE_CHANNELS testing::Values(Channels(1), Channels(3), Channels(4))
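
// Usage sketch (hypothetical wrapper, not from the diff): IMPLEMENT_PARAM_CLASS
// mints a named wrapper so plain values print readably in test names:
//
//     IMPLEMENT_PARAM_CLASS(KernelSize, int)
//     // testing::Values(KernelSize(3), KernelSize(5)) then prints as
//     // "KernelSize(3)" / "KernelSize(5)" instead of a bare 3 and 5.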

// Flags and enums

CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)

CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)

CV_ENUM(BorderType, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
#define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP))

CV_FLAGS(WarpFlags, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::WARP_INVERSE_MAP)
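
// Annotation (assumption based on the ts module helpers): CV_ENUM generates a
// wrapper whose PrintTo emits the symbolic name (e.g. "NORM_L2" instead of 4),
// and CV_FLAGS does the same for OR-combinable values such as
// cv::INTER_LINEAR | cv::WARP_INVERSE_MAP.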

//////////////////////////////////////////////////////////////////////
// Other

void dumpImage(const std::string& fileName, const cv::Mat& image);
void showDiff(cv::InputArray gold, cv::InputArray actual, double eps);

#endif // __OPENCV_GPU_TEST_UTILITY_HPP__