Merge branch '2.4'
This commit is contained in:
@@ -491,6 +491,26 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat&
|
||||
//! converts image from one color space to another
|
||||
CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
|
||||
|
||||
enum
|
||||
{
|
||||
// Bayer Demosaicing (Malvar, He, and Cutler)
|
||||
COLOR_BayerBG2BGR_MHT = 256,
|
||||
COLOR_BayerGB2BGR_MHT = 257,
|
||||
COLOR_BayerRG2BGR_MHT = 258,
|
||||
COLOR_BayerGR2BGR_MHT = 259,
|
||||
|
||||
COLOR_BayerBG2RGB_MHT = COLOR_BayerRG2BGR_MHT,
|
||||
COLOR_BayerGB2RGB_MHT = COLOR_BayerGR2BGR_MHT,
|
||||
COLOR_BayerRG2RGB_MHT = COLOR_BayerBG2BGR_MHT,
|
||||
COLOR_BayerGR2RGB_MHT = COLOR_BayerGB2BGR_MHT,
|
||||
|
||||
COLOR_BayerBG2GRAY_MHT = 260,
|
||||
COLOR_BayerGB2GRAY_MHT = 261,
|
||||
COLOR_BayerRG2GRAY_MHT = 262,
|
||||
COLOR_BayerGR2GRAY_MHT = 263
|
||||
};
|
||||
CV_EXPORTS void demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn = -1, Stream& stream = Stream::Null());
|
||||
|
||||
//! swap channels
|
||||
//! dstOrder - Integer array describing how channel values are permutated. The n-th entry
|
||||
//! of the array contains the number of the channel that is stored in the n-th channel of
|
||||
@@ -894,9 +914,11 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels
|
||||
//! Calculates histogram for 8u one channel image
|
||||
//! Output hist will have one row, 256 cols and CV32SC1 type.
|
||||
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
|
||||
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
|
||||
|
||||
//! normalizes the grayscale image brightness and contrast by normalizing its histogram
|
||||
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
|
||||
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
|
||||
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
|
||||
|
||||
//////////////////////////////// StereoBM_GPU ////////////////////////////////
|
||||
@@ -1386,82 +1408,6 @@ private:
|
||||
friend class CascadeClassifier_GPU_LBP;
|
||||
};
|
||||
|
||||
////////////////////////////////// SURF //////////////////////////////////////////
|
||||
|
||||
class CV_EXPORTS SURF_GPU
|
||||
{
|
||||
public:
|
||||
enum KeypointLayout
|
||||
{
|
||||
X_ROW = 0,
|
||||
Y_ROW,
|
||||
LAPLACIAN_ROW,
|
||||
OCTAVE_ROW,
|
||||
SIZE_ROW,
|
||||
ANGLE_ROW,
|
||||
HESSIAN_ROW,
|
||||
ROWS_COUNT
|
||||
};
|
||||
|
||||
//! the default constructor
|
||||
SURF_GPU();
|
||||
//! the full constructor taking all the necessary parameters
|
||||
explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4,
|
||||
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
|
||||
|
||||
//! returns the descriptor size in float's (64 or 128)
|
||||
int descriptorSize() const;
|
||||
|
||||
//! upload host keypoints to device memory
|
||||
static void uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU);
|
||||
//! download keypoints from device to host memory
|
||||
static void downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints);
|
||||
|
||||
//! download descriptors from device to host memory
|
||||
static void downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors);
|
||||
|
||||
//! finds the keypoints using fast hessian detector used in SURF
|
||||
//! supports CV_8UC1 images
|
||||
//! keypoints will have nFeature cols and 6 rows
|
||||
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
|
||||
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
|
||||
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
|
||||
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
|
||||
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
|
||||
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
|
||||
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
|
||||
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints);
|
||||
//! finds the keypoints and computes their descriptors.
|
||||
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
|
||||
void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
|
||||
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void releaseMemory();
|
||||
|
||||
// SURF parameters
|
||||
double hessianThreshold;
|
||||
int nOctaves;
|
||||
int nOctaveLayers;
|
||||
bool extended;
|
||||
bool upright;
|
||||
|
||||
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
|
||||
float keypointsRatio;
|
||||
|
||||
GpuMat sum, mask1, maskSum, intBuffer;
|
||||
|
||||
GpuMat det, trace;
|
||||
|
||||
GpuMat maxPosBuffer;
|
||||
};
|
||||
|
||||
////////////////////////////////// FAST //////////////////////////////////////////
|
||||
|
||||
class CV_EXPORTS FAST_GPU
|
||||
@@ -2129,41 +2075,6 @@ private:
|
||||
GpuMat bgmodelUsedModes_; //keep track of number of modes per pixel
|
||||
};
|
||||
|
||||
/*!
|
||||
* The class implements the following algorithm:
|
||||
* "ViBe: A universal background subtraction algorithm for video sequences"
|
||||
* O. Barnich and M. Van D Roogenbroeck
|
||||
* IEEE Transactions on Image Processing, 20(6) :1709-1724, June 2011
|
||||
*/
|
||||
class CV_EXPORTS VIBE_GPU
|
||||
{
|
||||
public:
|
||||
//! the default constructor
|
||||
explicit VIBE_GPU(unsigned long rngSeed = 1234567);
|
||||
|
||||
//! re-initiaization method
|
||||
void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null());
|
||||
|
||||
//! the update operator
|
||||
void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null());
|
||||
|
||||
//! releases all inner buffers
|
||||
void release();
|
||||
|
||||
int nbSamples; // number of samples per pixel
|
||||
int reqMatches; // #_min
|
||||
int radius; // R
|
||||
int subsamplingFactor; // amount of random subsampling
|
||||
|
||||
private:
|
||||
Size frameSize_;
|
||||
|
||||
unsigned long rngSeed_;
|
||||
GpuMat randStates_;
|
||||
|
||||
GpuMat samples_;
|
||||
};
|
||||
|
||||
/**
|
||||
* Background Subtractor module. Takes a series of images and returns a sequence of mask (8UC1)
|
||||
* images of the same size, where 255 indicates Foreground and 0 represents Background.
|
||||
|
910
modules/gpu/include/opencv2/gpu/device/simd_functions.hpp
Normal file
910
modules/gpu/include/opencv2/gpu/device/simd_functions.hpp
Normal file
@@ -0,0 +1,910 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2013, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* Neither the name of NVIDIA Corporation nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
|
||||
#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
/*
|
||||
This header file contains inline functions that implement intra-word SIMD
|
||||
operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient
|
||||
emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
|
||||
to make the code portable across all GPUs supported by CUDA. The following
|
||||
functions are currently implemented:
|
||||
|
||||
vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b
|
||||
vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b
|
||||
vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b|
|
||||
vavg2(a,b) per-halfword unsigned average: (a + b) / 2
|
||||
vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2
|
||||
vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0
|
||||
vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0
|
||||
vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0
|
||||
vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0
|
||||
vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0
|
||||
vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0
|
||||
vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0
|
||||
vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0
|
||||
vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0
|
||||
vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0
|
||||
vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0
|
||||
vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0
|
||||
vmax2(a,b) per-halfword unsigned maximum: max(a, b)
|
||||
vmin2(a,b) per-halfword unsigned minimum: min(a, b)
|
||||
|
||||
vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b
|
||||
vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b
|
||||
vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b|
|
||||
vavg4(a,b) per-byte unsigned average: (a + b) / 2
|
||||
vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2
|
||||
vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0
|
||||
vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0
|
||||
vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0
|
||||
vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0
|
||||
vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0
|
||||
vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0
|
||||
vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0
|
||||
vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0
|
||||
vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0
|
||||
vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0
|
||||
vsetne4(a,b) per-byte unsigned comparison: a != b ? 1: 0
|
||||
vcmpne4(a,b) per-byte unsigned comparison: a != b ? 0xff: 0
|
||||
vmax4(a,b) per-byte unsigned maximum: max(a, b)
|
||||
vmin4(a,b) per-byte unsigned minimum: min(a, b)
|
||||
*/
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
// 2
|
||||
|
||||
static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s;
|
||||
s = a ^ b; // sum bits
|
||||
r = a + b; // actual sum
|
||||
s = s ^ r; // determine carry-ins for each bit position
|
||||
s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
|
||||
r = r - s; // subtract out carry-out from low word
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s;
|
||||
s = a ^ b; // sum bits
|
||||
r = a - b; // actual sum
|
||||
s = s ^ r; // determine carry-ins for each bit position
|
||||
s = s & 0x00010000; // borrow to high word
|
||||
r = r + s; // compensate for borrow from low word
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s, t, u, v;
|
||||
s = a & 0x0000ffff; // extract low halfword
|
||||
r = b & 0x0000ffff; // extract low halfword
|
||||
u = ::max(r, s); // maximum of low halfwords
|
||||
v = ::min(r, s); // minimum of low halfwords
|
||||
s = a & 0xffff0000; // extract high halfword
|
||||
r = b & 0xffff0000; // extract high halfword
|
||||
t = ::max(r, s); // maximum of high halfwords
|
||||
s = ::min(r, s); // minimum of high halfwords
|
||||
r = u | t; // maximum of both halfwords
|
||||
s = v | s; // minimum of both halfwords
|
||||
r = r - s; // |a - b| = max(a,b) - min(a,b);
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, s;
|
||||
|
||||
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
|
||||
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
|
||||
s = a ^ b;
|
||||
r = a & b;
|
||||
s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
|
||||
s = s >> 1;
|
||||
s = r + s;
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
|
||||
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
|
||||
unsigned int s;
|
||||
s = a ^ b;
|
||||
r = a | b;
|
||||
s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
|
||||
s = s >> 1;
|
||||
r = r - s;
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
unsigned int c;
|
||||
r = a ^ b; // 0x0000 if a == b
|
||||
c = r | 0x80008000; // set msbs, to catch carry out
|
||||
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
|
||||
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
|
||||
c = r & ~c; // msb = 1, if r was 0x0000
|
||||
r = c >> 15; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vseteq2(a, b);
|
||||
c = r << 16; // convert bool
|
||||
r = c - r; // into mask
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
r = a ^ b; // 0x0000 if a == b
|
||||
c = r | 0x80008000; // set msbs, to catch carry out
|
||||
r = r ^ c; // extract msbs, msb = 1 if r < 0x8000
|
||||
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
|
||||
c = r & ~c; // msb = 1, if r was 0x0000
|
||||
r = c >> 15; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(b));
|
||||
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
|
||||
c = c & 0x80008000; // msb = carry-outs
|
||||
r = c >> 15; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetge2(a, b);
|
||||
c = r << 16; // convert bool
|
||||
r = c - r; // into mask
|
||||
#else
|
||||
asm("not.b32 %0, %0;" : "+r"(b));
|
||||
c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
|
||||
c = c & 0x80008000; // msb = carry-outs
|
||||
r = c >> 15; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(b));
|
||||
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
|
||||
c = c & 0x80008000; // msbs = carry-outs
|
||||
r = c >> 15; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetgt2(a, b);
|
||||
c = r << 16; // convert bool
|
||||
r = c - r; // into mask
|
||||
#else
|
||||
asm("not.b32 %0, %0;" : "+r"(b));
|
||||
c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
|
||||
c = c & 0x80008000; // msbs = carry-outs
|
||||
r = c >> 15; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
|
||||
c = c & 0x80008000; // msb = carry-outs
|
||||
r = c >> 15; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetle2(a, b);
|
||||
c = r << 16; // convert bool
|
||||
r = c - r; // into mask
|
||||
#else
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
|
||||
c = c & 0x80008000; // msb = carry-outs
|
||||
r = c >> 15; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
|
||||
c = c & 0x80008000; // msb = carry-outs
|
||||
r = c >> 15; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetlt2(a, b);
|
||||
c = r << 16; // convert bool
|
||||
r = c - r; // into mask
|
||||
#else
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
|
||||
c = c & 0x80008000; // msb = carry-outs
|
||||
r = c >> 15; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
unsigned int c;
|
||||
r = a ^ b; // 0x0000 if a == b
|
||||
c = r | 0x80008000; // set msbs, to catch carry out
|
||||
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
|
||||
c = r | c; // msb = 1, if r was not 0x0000
|
||||
c = c & 0x80008000; // extract msbs
|
||||
r = c >> 15; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetne2(a, b);
|
||||
c = r << 16; // convert bool
|
||||
r = c - r; // into mask
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
r = a ^ b; // 0x0000 if a == b
|
||||
c = r | 0x80008000; // set msbs, to catch carry out
|
||||
c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
|
||||
c = r | c; // msb = 1, if r was not 0x0000
|
||||
c = c & 0x80008000; // extract msbs
|
||||
r = c >> 15; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s, t, u;
|
||||
r = a & 0x0000ffff; // extract low halfword
|
||||
s = b & 0x0000ffff; // extract low halfword
|
||||
t = ::max(r, s); // maximum of low halfwords
|
||||
r = a & 0xffff0000; // extract high halfword
|
||||
s = b & 0xffff0000; // extract high halfword
|
||||
u = ::max(r, s); // maximum of high halfwords
|
||||
r = t | u; // combine halfword maximums
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s, t, u;
|
||||
r = a & 0x0000ffff; // extract low halfword
|
||||
s = b & 0x0000ffff; // extract low halfword
|
||||
t = ::min(r, s); // minimum of low halfwords
|
||||
r = a & 0xffff0000; // extract high halfword
|
||||
s = b & 0xffff0000; // extract high halfword
|
||||
u = ::min(r, s); // minimum of high halfwords
|
||||
r = t | u; // combine halfword minimums
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
// 4
|
||||
|
||||
static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s, t;
|
||||
s = a ^ b; // sum bits
|
||||
r = a & 0x7f7f7f7f; // clear msbs
|
||||
t = b & 0x7f7f7f7f; // clear msbs
|
||||
s = s & 0x80808080; // msb sum bits
|
||||
r = r + t; // add without msbs, record carry-out in msbs
|
||||
r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out
|
||||
#endif /* __CUDA_ARCH__ >= 300 */
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s, t;
|
||||
s = a ^ ~b; // inverted sum bits
|
||||
r = a | 0x80808080; // set msbs
|
||||
t = b & 0x7f7f7f7f; // clear msbs
|
||||
s = s & 0x80808080; // inverted msb sum bits
|
||||
r = r - t; // subtract w/o msbs, record inverted borrows in msb
|
||||
r = r ^ s; // combine inverted msb sum bits and borrows
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, s;
|
||||
|
||||
// HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
|
||||
// (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
|
||||
s = a ^ b;
|
||||
r = a & b;
|
||||
s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
|
||||
s = s >> 1;
|
||||
s = r + s;
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
// HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
|
||||
// (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
|
||||
unsigned int c;
|
||||
c = a ^ b;
|
||||
r = a | b;
|
||||
c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
|
||||
c = c >> 1;
|
||||
r = r - c;
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
unsigned int c;
|
||||
r = a ^ b; // 0x00 if a == b
|
||||
c = r | 0x80808080; // set msbs, to catch carry out
|
||||
r = r ^ c; // extract msbs, msb = 1 if r < 0x80
|
||||
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
|
||||
c = r & ~c; // msb = 1, if r was 0x00
|
||||
r = c >> 7; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, t;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vseteq4(a, b);
|
||||
t = r << 8; // convert bool
|
||||
r = t - r; // to mask
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
t = a ^ b; // 0x00 if a == b
|
||||
r = t | 0x80808080; // set msbs, to catch carry out
|
||||
t = t ^ r; // extract msbs, msb = 1 if t < 0x80
|
||||
r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
|
||||
r = t & ~r; // msb = 1, if t was 0x00
|
||||
t = r >> 7; // build mask
|
||||
t = r - t; // from
|
||||
r = t | r; // msbs
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
|
||||
c = c & 0x80808080; // msb = carry-outs
|
||||
r = c >> 7; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetle4(a, b);
|
||||
c = r << 8; // convert bool
|
||||
r = c - r; // to mask
|
||||
#else
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2
|
||||
c = c & 0x80808080; // msbs = carry-outs
|
||||
r = c >> 7; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
|
||||
c = c & 0x80808080; // msb = carry-outs
|
||||
r = c >> 7; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetlt4(a, b);
|
||||
c = r << 8; // convert bool
|
||||
r = c - r; // to mask
|
||||
#else
|
||||
asm("not.b32 %0, %0;" : "+r"(a));
|
||||
c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down]
|
||||
c = c & 0x80808080; // msbs = carry-outs
|
||||
r = c >> 7; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(b));
|
||||
c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2
|
||||
c = c & 0x80808080; // msb = carry-outs
|
||||
r = c >> 7; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, s;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetge4(a, b);
|
||||
s = r << 8; // convert bool
|
||||
r = s - r; // to mask
|
||||
#else
|
||||
asm ("not.b32 %0,%0;" : "+r"(b));
|
||||
r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2
|
||||
r = r & 0x80808080; // msb = carry-outs
|
||||
s = r >> 7; // build mask
|
||||
s = r - s; // from
|
||||
r = s | r; // msbs
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int c;
|
||||
asm("not.b32 %0, %0;" : "+r"(b));
|
||||
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
|
||||
c = c & 0x80808080; // msb = carry-outs
|
||||
r = c >> 7; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetgt4(a, b);
|
||||
c = r << 8; // convert bool
|
||||
r = c - r; // to mask
|
||||
#else
|
||||
asm("not.b32 %0, %0;" : "+r"(b));
|
||||
c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down]
|
||||
c = c & 0x80808080; // msb = carry-outs
|
||||
r = c >> 7; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
unsigned int c;
|
||||
r = a ^ b; // 0x00 if a == b
|
||||
c = r | 0x80808080; // set msbs, to catch carry out
|
||||
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
|
||||
c = r | c; // msb = 1, if r was not 0x00
|
||||
c = c & 0x80808080; // extract msbs
|
||||
r = c >> 7; // convert to bool
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r, c;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
r = vsetne4(a, b);
|
||||
c = r << 8; // convert bool
|
||||
r = c - r; // to mask
|
||||
#else
|
||||
// inspired by Alan Mycroft's null-byte detection algorithm:
|
||||
// null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
|
||||
r = a ^ b; // 0x00 if a == b
|
||||
c = r | 0x80808080; // set msbs, to catch carry out
|
||||
c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
|
||||
c = r | c; // msb = 1, if r was not 0x00
|
||||
c = c & 0x80808080; // extract msbs
|
||||
r = c >> 7; // convert
|
||||
r = c - r; // msbs to
|
||||
r = c | r; // mask
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s;
|
||||
s = vcmpge4(a, b); // mask = 0xff if a >= b
|
||||
r = a ^ b; //
|
||||
s = (r & s) ^ b; // select a when a >= b, else select b => max(a,b)
|
||||
r = s ^ r; // select a when b >= a, else select b => min(a,b)
|
||||
r = s - r; // |a - b| = max(a,b) - min(a,b);
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s;
|
||||
s = vcmpge4(a, b); // mask = 0xff if a >= b
|
||||
r = a & s; // select a when b >= a
|
||||
s = b & ~s; // select b when b < a
|
||||
r = r | s; // combine byte selections
|
||||
#endif
|
||||
|
||||
return r; // byte-wise unsigned maximum
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
|
||||
{
|
||||
unsigned int r = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#elif __CUDA_ARCH__ >= 200
|
||||
asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
|
||||
#else
|
||||
unsigned int s;
|
||||
s = vcmpge4(b, a); // mask = 0xff if a >= b
|
||||
r = a & s; // select a when b >= a
|
||||
s = b & ~s; // select b when b < a
|
||||
r = r | s; // combine byte selections
|
||||
#endif
|
||||
|
||||
return r;
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__
|
@@ -7,7 +7,7 @@
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
|
Reference in New Issue
Block a user