fixed #1279
This commit is contained in:
parent
ed801d3e65
commit
dfaa8af6ee
@ -566,9 +566,6 @@ namespace cv { namespace gpu { namespace surf
|
|||||||
|
|
||||||
float* s_sum_row = s_sum + threadIdx.y * 32;
|
float* s_sum_row = s_sum + threadIdx.y * 32;
|
||||||
|
|
||||||
//reduceSum32(s_sum_row, sumx);
|
|
||||||
//reduceSum32(s_sum_row, sumy);
|
|
||||||
|
|
||||||
warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
|
warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
|
||||||
warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
|
warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
|
||||||
|
|
||||||
|
@ -46,13 +46,13 @@
|
|||||||
#include "internal_shared.hpp"
|
#include "internal_shared.hpp"
|
||||||
#include "saturate_cast.hpp"
|
#include "saturate_cast.hpp"
|
||||||
|
|
||||||
#ifndef __CUDA_ARCH__
|
#ifndef __CUDA_ARCH__
|
||||||
#define __CUDA_ARCH__ 0
|
#define __CUDA_ARCH__ 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define OPENCV_GPU_LOG_WARP_SIZE (5)
|
#define OPENCV_GPU_LOG_WARP_SIZE (5)
|
||||||
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
|
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
|
||||||
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
|
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
|
||||||
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
|
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
|
||||||
|
|
||||||
#if defined(_WIN64) || defined(__LP64__)
|
#if defined(_WIN64) || defined(__LP64__)
|
||||||
@ -65,15 +65,15 @@
|
|||||||
|
|
||||||
namespace cv { namespace gpu { namespace device
|
namespace cv { namespace gpu { namespace device
|
||||||
{
|
{
|
||||||
template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b)
|
template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b)
|
||||||
{
|
{
|
||||||
T temp = a;
|
T temp = a;
|
||||||
a = b;
|
a = b;
|
||||||
b = temp;
|
b = temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
// warp-synchronous 32 elements reduction
|
// warp-synchronous 32 elements reduction
|
||||||
template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, volatile T& partial_reduction, int tid, Op op)
|
template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, Op op)
|
||||||
{
|
{
|
||||||
data[tid] = partial_reduction;
|
data[tid] = partial_reduction;
|
||||||
|
|
||||||
@ -88,7 +88,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
// warp-synchronous 16 elements reduction
|
// warp-synchronous 16 elements reduction
|
||||||
template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, volatile T& partial_reduction, int tid, Op op)
|
template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, Op op)
|
||||||
{
|
{
|
||||||
data[tid] = partial_reduction;
|
data[tid] = partial_reduction;
|
||||||
|
|
||||||
@ -102,7 +102,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
}
|
}
|
||||||
|
|
||||||
// warp-synchronous reduction
|
// warp-synchronous reduction
|
||||||
template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, volatile T& partial_reduction, int tid, Op op)
|
template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, Op op)
|
||||||
{
|
{
|
||||||
if (tid < n)
|
if (tid < n)
|
||||||
data[tid] = partial_reduction;
|
data[tid] = partial_reduction;
|
||||||
|
@ -109,9 +109,11 @@ int main(int argc, char** argv)
|
|||||||
cvtest::TS::ptr()->init("gpu");
|
cvtest::TS::ptr()->init("gpu");
|
||||||
testing::InitGoogleTest(&argc, argv);
|
testing::InitGoogleTest(&argc, argv);
|
||||||
|
|
||||||
//cv::CommandLineParser parser(argc, (const char**)argv);
|
const char* keys ="{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level }";
|
||||||
|
|
||||||
std::string outputLevel = "none";//parser.get<std::string>("nvtest_output_level", "none");
|
cv::CommandLineParser parser(argc, (const char**)argv, keys);
|
||||||
|
|
||||||
|
std::string outputLevel = parser.get<std::string>("nvtest_output_level", "none");
|
||||||
|
|
||||||
if (outputLevel == "none")
|
if (outputLevel == "none")
|
||||||
nvidiaTestOutputLevel = OutputLevelNone;
|
nvidiaTestOutputLevel = OutputLevelNone;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user