fixes for gpu module:
- fixed printCudaDeviceInfo for new CC
- fixed some compilation errors and warnings
- removed unset command from CMake script
- removed unused std imports
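The warning fixes in the matcher hunks below all follow one pattern: local variables that shadowed a function parameter of the same name (trainIdx, imgIdx, distance, nMatches) are renamed. A minimal sketch of the pattern, with hypothetical types and names that are not taken from the commit:

    #include <vector>

    struct Match { int queryIdx; int trainIdx; float distance; };

    // Before the fix, the locals were called 'trainIdx' and 'distance' and
    // shadowed the parameters of the same name, which compilers report as a
    // shadowing warning; prefixing the locals with '_' silences it without
    // changing behaviour.
    static void convert(const std::vector<int>& trainIdx,
                        const std::vector<float>& distance,
                        std::vector<Match>& matches)
    {
        for (size_t queryIdx = 0; queryIdx < trainIdx.size(); ++queryIdx)
        {
            int _trainIdx = trainIdx[queryIdx];   // was: int trainIdx = ...;
            if (_trainIdx == -1)
                continue;

            float _distance = distance[queryIdx]; // was: float distance = ...;

            Match m = { (int)queryIdx, _trainIdx, _distance };
            matches.push_back(m);
        }
    }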
@@ -268,14 +268,14 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     const float* distance_ptr = distance.ptr<float>();

     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
     {
-        int trainIdx = *trainIdx_ptr;
+        int _trainIdx = *trainIdx_ptr;

-        if (trainIdx == -1)
+        if (_trainIdx == -1)
             continue;

-        float distance = *distance_ptr;
+        float _distance = *distance_ptr;

-        DMatch m(queryIdx, trainIdx, 0, distance);
+        DMatch m(queryIdx, _trainIdx, 0, _distance);

         matches.push_back(m);
     }
@@ -413,16 +413,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     const float* distance_ptr = distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
     {
-        int trainIdx = *trainIdx_ptr;
+        int _trainIdx = *trainIdx_ptr;

-        if (trainIdx == -1)
+        if (_trainIdx == -1)
             continue;

-        int imgIdx = *imgIdx_ptr;
+        int _imgIdx = *imgIdx_ptr;

-        float distance = *distance_ptr;
+        float _distance = *distance_ptr;

-        DMatch m(queryIdx, trainIdx, imgIdx, distance);
+        DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);

         matches.push_back(m);
     }
@@ -548,13 +548,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
         for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            if (trainIdx != -1)
+            if (_trainIdx != -1)
             {
-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;

-                DMatch m(queryIdx, trainIdx, 0, distance);
+                DMatch m(queryIdx, _trainIdx, 0, _distance);

                 curMatches.push_back(m);
             }
@@ -667,15 +667,15 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx,
         for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
        {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            if (trainIdx != -1)
+            if (_trainIdx != -1)
             {
-                int imgIdx = *imgIdx_ptr;
+                int _imgIdx = *imgIdx_ptr;

-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;

-                DMatch m(queryIdx, trainIdx, imgIdx, distance);
+                DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);

                 curMatches.push_back(m);
             }
@@ -852,25 +852,25 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
         const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
         const float* distance_ptr = distance.ptr<float>(queryIdx);

-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);

-        if (nMatches == 0)
+        if (nMatched == 0)
         {
             if (!compactResult)
                 matches.push_back(vector<DMatch>());
             continue;
         }

-        matches.push_back(vector<DMatch>(nMatches));
+        matches.push_back(vector<DMatch>(nMatched));
         vector<DMatch>& curMatches = matches.back();

-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            float distance = *distance_ptr;
+            float _distance = *distance_ptr;

-            DMatch m(queryIdx, trainIdx, 0, distance);
+            DMatch m(queryIdx, _trainIdx, 0, _distance);

             curMatches[i] = m;
         }
@@ -990,9 +990,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
         const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
         const float* distance_ptr = distance.ptr<float>(queryIdx);

-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);

-        if (nMatches == 0)
+        if (nMatched == 0)
         {
             if (!compactResult)
                 matches.push_back(vector<DMatch>());
@@ -1001,9 +1001,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx

         matches.push_back(vector<DMatch>());
         vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(nMatches);
+        curMatches.reserve(nMatched);

-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
         {
             int _trainIdx = *trainIdx_ptr;
             int _imgIdx = *imgIdx_ptr;
@@ -622,7 +622,7 @@ private:
     }

     // copy data structures on gpu
-    stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) ));
+    stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
     trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
     nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
     leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));
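The (int) cast added above recurs in several hunks of this commit: cv::Mat dimensions and GpuMat step values are size_t expressions on 64-bit builds, while the cv::Mat constructor and the kernel arguments take int, so the implicit narrowing triggers conversion warnings. A minimal sketch of the pattern, with a hypothetical helper that is not part of the commit:

    #include <cstddef>
    #include <vector>

    struct Stage { int first; int ntrees; float threshold; };

    // stages.size() * sizeof(Stage) is a size_t; passing it where an int is
    // expected warns about possible loss of data on 64-bit targets. The
    // explicit cast states that the byte count is known to fit in an int.
    static int stageBytes(const std::vector<Stage>& stages)
    {
        return (int)(stages.size() * sizeof(Stage));
    }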
@@ -497,6 +497,7 @@ namespace cv { namespace gpu { namespace device
     void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
     {
+        (void) flags;
         dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
         dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
@@ -529,4 +530,4 @@ namespace cv { namespace gpu { namespace device
     }
 } } }

-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
@@ -47,6 +47,7 @@

 #if !defined CUDA_DISABLER

 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>

 #include "opencv2/gpu/device/common.hpp"
@@ -148,4 +149,4 @@ namespace cv { namespace gpu { namespace device
 }}}


-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
@@ -42,7 +42,9 @@

 #if !defined CUDA_DISABLER

 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>

 #include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/emulation.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
@@ -1509,4 +1511,4 @@ namespace cv { namespace gpu { namespace device
 }}}


-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
@@ -295,7 +295,7 @@ namespace cv { namespace gpu { namespace device
         int grid = divUp(workAmount, block);
         cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
         Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
-        lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
+        lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
     }
 }
 }}}
@@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device
         static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
         {
             float angle = ::atan2f(y_data, x_data);
-            angle += (angle < 0) * 2.0 * CV_PI;
+            angle += (angle < 0) * 2.0f * CV_PI_F;
             dst[y * dst_step + x] = scale * angle;
         }
     };
@@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
         grid.x = divUp(x.cols, threads.x);
         grid.y = divUp(x.rows, threads.y);

-        const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
+        const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;

         cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
             x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
@@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace device
         grid.x = divUp(mag.cols, threads.x);
         grid.y = divUp(mag.rows, threads.y);

-        const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
+        const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;

         polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
             angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
@@ -214,4 +214,4 @@ namespace cv { namespace gpu { namespace device
     } // namespace mathfunc
 }}} // namespace cv { namespace gpu { namespace device

-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
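The CV_PI changes in the hunks above and below share one motivation: CV_PI is a double constant, so an expression such as 2.0 * CV_PI is evaluated in double precision inside the kernel and then narrowed back to float, which both triggers conversion warnings and costs extra instructions on the GPU, while 2.0f * CV_PI_F keeps the whole computation in single precision. A minimal CUDA sketch of the idea; the macro value is illustrative rather than copied from the OpenCV headers:

    #ifndef CV_PI_F
    #define CV_PI_F 3.14159265f   // illustrative single-precision constant
    #endif

    // The whole expression stays in float: no implicit promotion to double and
    // no double-to-float narrowing warning when the result is stored.
    __device__ __forceinline__ float wrapAngle(float angle)
    {
        return angle + (angle < 0) * 2.0f * CV_PI_F;   // was: ... * 2.0 * CV_PI
    }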
@@ -164,40 +164,40 @@ namespace cv { namespace gpu { namespace device
         r = ::fmin(r, 2.5f);

-        v[1].x = arrow_x + r * ::cosf(theta - CV_PI / 2.0f);
-        v[1].y = arrow_y + r * ::sinf(theta - CV_PI / 2.0f);
+        v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
+        v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);

-        v[4].x = arrow_x + r * ::cosf(theta + CV_PI / 2.0f);
-        v[4].y = arrow_y + r * ::sinf(theta + CV_PI / 2.0f);
+        v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
+        v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);

         int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;

-        color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+        color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
         vertex_data[indx++] = v[0].x * xscale;
         vertex_data[indx++] = v[0].y * yscale;
         vertex_data[indx++] = v[0].z;

-        color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+        color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
         vertex_data[indx++] = v[1].x * xscale;
         vertex_data[indx++] = v[1].y * yscale;
         vertex_data[indx++] = v[1].z;

-        color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+        color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
         vertex_data[indx++] = v[2].x * xscale;
         vertex_data[indx++] = v[2].y * yscale;
         vertex_data[indx++] = v[2].z;

-        color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+        color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
         vertex_data[indx++] = v[3].x * xscale;
         vertex_data[indx++] = v[3].y * yscale;
         vertex_data[indx++] = v[3].z;

-        color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+        color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
         vertex_data[indx++] = v[4].x * xscale;
         vertex_data[indx++] = v[4].y * yscale;
         vertex_data[indx++] = v[4].z;

-        color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+        color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
         vertex_data[indx++] = v[5].x * xscale;
         vertex_data[indx++] = v[5].y * yscale;
         vertex_data[indx++] = v[5].z;
@@ -217,4 +217,4 @@ namespace cv { namespace gpu { namespace device
     }
 }}}

-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
@@ -42,7 +42,6 @@

 #if !defined CUDA_DISABLER

 #include <stdio.h>
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
@@ -57,8 +56,6 @@
 #define BORDER_SIZE 5
 #define MAX_KSIZE_HALF 100

-using namespace std;
-
 namespace cv { namespace gpu { namespace device { namespace optflow_farneback
 {
     __constant__ float c_g[8];
@@ -267,7 +267,7 @@ namespace cv { namespace gpu { namespace device
     }
     __device__ __forceinline__ float4 abs_(const float4& a)
     {
-        return fabs(a);
+        return abs(a);
     }

     template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
@@ -681,4 +681,4 @@ namespace cv { namespace gpu { namespace device
     }
 }}}

-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
@@ -508,4 +508,4 @@ namespace cv { namespace gpu { namespace device
 }}} // namespace cv { namespace gpu { namespace device


-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
@@ -454,7 +454,7 @@ namespace cv { namespace gpu { namespace device
     grid.x = divUp(cols, threads.x << 1);
     grid.y = divUp(rows, threads.y);

-    int elem_step = u.step/sizeof(T);
+    int elem_step = (int)(u.step / sizeof(T));

     for(int t = 0; t < iters; ++t)
     {
@@ -638,7 +638,7 @@ namespace cv { namespace gpu { namespace device
         kp_dir *= 180.0f / CV_PI_F;

         kp_dir = 360.0f - kp_dir;
-        if (abs(kp_dir - 360.f) < FLT_EPSILON)
+        if (::fabsf(kp_dir - 360.f) < FLT_EPSILON)
             kp_dir = 0.f;

         featureDir[blockIdx.x] = kp_dir;
@@ -1003,4 +1003,4 @@ namespace cv { namespace gpu { namespace device
 }}} // namespace cv { namespace gpu { namespace device


-#endif /* CUDA_DISABLER */
+#endif /* CUDA_DISABLER */
@@ -85,7 +85,7 @@ namespace cv

     namespace device
     {
-        using pcl::gpu::TextureBinder;
+        using cv::gpu::TextureBinder;
     }
 }
@@ -125,9 +125,6 @@ int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& ma
     CV_Assert(img.type() == CV_8UC1);
     CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));

-    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
-
     int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());

     ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
@@ -148,9 +145,6 @@ int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
 {
     using namespace cv::gpu::device::fast;

-    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
-
     if (count_ == 0)
         return 0;
@@ -68,9 +68,6 @@ void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image,
     CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
     CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));

-    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
-
     ensureSizeIsEnough(image.size(), CV_32F, eig_);

     if (useHarrisDetector)
@@ -45,8 +45,6 @@
 #include <vector>
 #include "NCV.hpp"

-using namespace std;
-

 //==============================================================================
 //
@@ -55,16 +53,16 @@ using namespace std;
 //==============================================================================


-static void stdDebugOutput(const string &msg)
+static void stdDebugOutput(const std::string &msg)
 {
-    cout << msg;
+    std::cout << msg;
 }


 static NCVDebugOutputHandler *debugOutputHandler = stdDebugOutput;


-void ncvDebugOutput(const string &msg)
+void ncvDebugOutput(const std::string &msg)
 {
     debugOutputHandler(msg);
 }
@@ -288,7 +288,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
     do \
     { \
         cudaError_t res = cudacall; \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
     } while (0)
@@ -296,7 +296,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
     do \
     { \
         cudaError_t res = cudaGetLastError(); \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
     } while (0)
@@ -1,205 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_GPU_DEVICE_BLOCK_HPP__
-#define __OPENCV_GPU_DEVICE_BLOCK_HPP__
-
-namespace cv { namespace gpu { namespace device
-{
-    struct Block
-    {
-        static __device__ __forceinline__ unsigned int id()
-        {
-            return blockIdx.x;
-        }
-
-        static __device__ __forceinline__ unsigned int stride()
-        {
-            return blockDim.x * blockDim.y * blockDim.z;
-        }
-
-        static __device__ __forceinline__ void sync()
-        {
-            __syncthreads();
-        }
-
-        static __device__ __forceinline__ int flattenedThreadId()
-        {
-            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
-        }
-
-        template<typename It, typename T>
-        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
-        {
-            int STRIDE = stride();
-            It t = beg + flattenedThreadId();
-
-            for(; t < end; t += STRIDE)
-                *t = value;
-        }
-
-        template<typename OutIt, typename T>
-        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
-        {
-            int STRIDE = stride();
-            int tid = flattenedThreadId();
-            value += tid;
-
-            for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
-                *t = value;
-        }
-
-        template<typename InIt, typename OutIt>
-        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
-        {
-            int STRIDE = stride();
-            InIt t = beg + flattenedThreadId();
-            OutIt o = out + (t - beg);
-
-            for(; t < end; t += STRIDE, o += STRIDE)
-                *o = *t;
-        }
-
-        template<typename InIt, typename OutIt, class UnOp>
-        static __device__ __forceinline__ void transfrom(InIt beg, InIt end, OutIt out, UnOp op)
-        {
-            int STRIDE = stride();
-            InIt t = beg + flattenedThreadId();
-            OutIt o = out + (t - beg);
-
-            for(; t < end; t += STRIDE, o += STRIDE)
-                *o = op(*t);
-        }
-
-        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
-        static __device__ __forceinline__ void transfrom(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
-        {
-            int STRIDE = stride();
-            InIt1 t1 = beg1 + flattenedThreadId();
-            InIt2 t2 = beg2 + flattenedThreadId();
-            OutIt o = out + (t1 - beg1);
-
-            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
-                *o = op(*t1, *t2);
-        }
-
-        template<int CTA_SIZE, typename T, class BinOp>
-        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
-        {
-            int tid = flattenedThreadId();
-            T val = buffer[tid];
-
-            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
-            if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
-            if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
-            if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
-
-            if (tid < 32)
-            {
-                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
-                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
-                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
-                if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
-                if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
-                if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
-            }
-        }
-
-        template<int CTA_SIZE, typename T, class BinOp>
-        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
-        {
-            int tid = flattenedThreadId();
-            T val = buffer[tid] = init;
-            __syncthreads();
-
-            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
-            if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
-            if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
-            if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
-
-            if (tid < 32)
-            {
-                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
-                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
-                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
-                if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
-                if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
-                if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
-            }
-            __syncthreads();
-            return buffer[0];
-        }
-
-        template <typename T, class BinOp>
-        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
-        {
-            int ftid = flattenedThreadId();
-            int sft = stride();
-
-            if (sft < n)
-            {
-                for (unsigned int i = sft + ftid; i < n; i += sft)
-                    data[ftid] = op(data[ftid], data[i]);
-
-                __syncthreads();
-
-                n = sft;
-            }
-
-            while (n > 1)
-            {
-                unsigned int half = n/2;
-
-                if (ftid < half)
-                    data[ftid] = op(data[ftid], data[n - ftid - 1]);
-
-                __syncthreads();
-
-                n = n - half;
-            }
-        }
-    };
-}}}
-
-#endif /* __OPENCV_GPU_DEVICE_BLOCK_HPP__ */