This commit is contained in:
Vladislav Vinogradov
2011-08-08 05:59:09 +00:00
parent ed801d3e65
commit dfaa8af6ee
3 changed files with 17 additions and 18 deletions

View File

@@ -566,9 +566,6 @@ namespace cv { namespace gpu { namespace surf
float* s_sum_row = s_sum + threadIdx.y * 32;
//reduceSum32(s_sum_row, sumx);
//reduceSum32(s_sum_row, sumy);
warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());

View File

@@ -46,13 +46,13 @@
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 0
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 0
#endif
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
#if defined(_WIN64) || defined(__LP64__)
@@ -65,15 +65,15 @@
namespace cv { namespace gpu { namespace device
{
template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b)
{
T temp = a;
a = b;
b = temp;
// Exchange the contents of a and b; usable from both host and device code
// (plain value rotation through a temporary, so T only needs to be copyable).
template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b)
{
    T tmp = b;
    b = a;
    a = tmp;
}
// warp-synchronous 32 elements reduction
template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, volatile T& partial_reduction, int tid, Op op)
template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, Op op)
{
data[tid] = partial_reduction;
@@ -88,7 +88,7 @@ namespace cv { namespace gpu { namespace device
}
// warp-synchronous 16 elements reduction
template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, volatile T& partial_reduction, int tid, Op op)
template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, Op op)
{
data[tid] = partial_reduction;
@@ -102,7 +102,7 @@ namespace cv { namespace gpu { namespace device
}
// warp-synchronous reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, volatile T& partial_reduction, int tid, Op op)
template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, Op op)
{
if (tid < n)
data[tid] = partial_reduction;