From e7f6c4b7efc6fa174cc71eeb2c51a32d1e588d89 Mon Sep 17 00:00:00 2001 From: Marina Kolpakova Date: Wed, 20 Jun 2012 05:41:16 +0000 Subject: [PATCH] scan operations are moved in separate header --- cmake/OpenCVDetectCUDA.cmake | 10 +- modules/gpu/include/opencv2/gpu/gpu.hpp | 4 - modules/gpu/perf/perf_imgproc.cpp | 6 +- modules/gpu/src/cuda/element_operations.cu | 8 +- modules/gpu/src/cuda/matrix_reductions.cu | 2 +- modules/gpu/src/cuda/resize.cu | 365 +----------------- modules/gpu/src/cuda/split_merge.cu | 36 +- modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu | 136 +++---- modules/gpu/src/nvidia/core/NCV.cu | 16 +- modules/gpu/src/opencv2/gpu/device/common.hpp | 14 +- .../src/opencv2/gpu/device/datamov_utils.hpp | 20 +- .../src/opencv2/gpu/device/dynamic_smem.hpp | 2 +- .../gpu/src/opencv2/gpu/device/emulation.hpp | 18 +- .../gpu/src/opencv2/gpu/device/funcattrib.hpp | 8 +- .../gpu/src/opencv2/gpu/device/functional.hpp | 56 ++- modules/gpu/src/opencv2/gpu/device/limits.hpp | 2 +- .../src/opencv2/gpu/device/saturate_cast.hpp | 112 +++--- modules/gpu/src/opencv2/gpu/device/scan.hpp | 166 ++++++++ .../src/opencv2/gpu/device/static_check.hpp | 22 +- .../gpu/src/opencv2/gpu/device/transform.hpp | 2 +- .../src/opencv2/gpu/device/type_traits.hpp | 22 +- .../gpu/src/opencv2/gpu/device/utility.hpp | 32 +- .../src/opencv2/gpu/device/vec_distance.hpp | 8 +- .../gpu/src/opencv2/gpu/device/vec_math.hpp | 6 +- .../gpu/src/opencv2/gpu/device/vec_traits.hpp | 14 +- modules/gpu/src/opencv2/gpu/device/warp.hpp | 14 +- modules/gpu/src/resize.cpp | 42 -- modules/gpu/test/test_resize.cpp | 39 -- 28 files changed, 462 insertions(+), 720 deletions(-) create mode 100644 modules/gpu/src/opencv2/gpu/device/scan.hpp diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index 84b95a430..9ece3a059 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -1,8 +1,8 @@ if(${CMAKE_VERSION} VERSION_LESS "2.8.3") message(STATUS WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.) - return() + return() endif() - + find_package(CUDA 4.1) if(CUDA_FOUND) @@ -23,7 +23,7 @@ if(CUDA_FOUND) else() set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") endif() - + set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}") @@ -89,8 +89,8 @@ if(CUDA_FOUND) set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only) endif() - # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1) - set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG}) + # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1) + set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG}) string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) CUDA_COMPILE(${VAR} ${ARGN}) set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_}) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index ad11240f3..370c2c4d4 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -629,10 +629,6 @@ CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, doubl //! 
Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); -//! resizes the image -//! Supports INTER_AREA -CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx=0, double fy=0, int interpolation = INTER_AREA, Stream& stream = Stream::Null()); - //! warps the image using affine transformation //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 78ca5bf0e..ab7fb4240 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -118,10 +118,10 @@ GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale) INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine( ALL_DEVICES, - testing::Values(perf::sz1080p, cv::Size(4096, 2048)), - testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_8UC4), + testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/), + testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), - MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/), + MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)), testing::Values(Scale(0.2),Scale(0.1),Scale(0.05)))); ////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index f447bf414..9667280e4 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -1253,7 +1253,7 @@ namespace cv { namespace gpu { namespace device { const T val; - __host__ explicit CompareScalar(T val) : val(val) {} + __host__ explicit CompareScalar(T val_) : val(val_) {} __device__ __forceinline__ uchar operator()(T src) const { @@ -1266,7 +1266,7 @@ namespace cv { namespace gpu { namespace device { const TYPE_VEC(T, 2) val; - __host__ explicit CompareScalar(TYPE_VEC(T, 2) val) : val(val) {} + __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {} __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const { @@ -1281,7 +1281,7 @@ namespace cv { namespace gpu { namespace device { const TYPE_VEC(T, 3) val; - __host__ explicit CompareScalar(TYPE_VEC(T, 3) val) : val(val) {} + __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {} __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const { @@ -1297,7 +1297,7 @@ namespace cv { namespace gpu { namespace device { const TYPE_VEC(T, 4) val; - __host__ explicit CompareScalar(TYPE_VEC(T, 4) val) : val(val) {} + __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {} __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const { diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu index 633b7ee5c..a0be65c3c 100644 --- a/modules/gpu/src/cuda/matrix_reductions.cu +++ b/modules/gpu/src/cuda/matrix_reductions.cu @@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device struct Mask8U { - explicit Mask8U(PtrStepb mask): mask(mask) {} + explicit Mask8U(PtrStepb mask_): mask(mask_) {} __device__ __forceinline__ bool operator()(int y, int x) const { diff 
--git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu index fb7aefd51..c0f9d587a 100644 --- a/modules/gpu/src/cuda/resize.cu +++ b/modules/gpu/src/cuda/resize.cu @@ -46,7 +46,8 @@ #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/filters.hpp" -# include +#include +#include namespace cv { namespace gpu { namespace device { @@ -285,367 +286,5 @@ namespace cv { namespace gpu { namespace device typedef float scan_line_type; }; -// template -// __global__ void resize_area_scan(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, DevMem2D_ buffer) -// { -// typedef typename scan_traits::scan_line_type W; -// extern __shared__ W line[]; - -// const int x = threadIdx.x; -// const int y = blockIdx.x; - -// if (y >= src.rows) return; - -// int offset = 1; - -// line[2 * x + 0] = src(y, 2 * x + 0); -// line[2 * x + 1] = src(y, 2 * x + 1); - -// __syncthreads();//??? -// // reduction -// for (int d = blockDim.x; d > 0; d >>= 1) -// { -// __syncthreads(); -// if (x < d) -// { -// int ai = 2 * x * offset -1 + 1 * offset; -// int bi = 2 * x * offset -1 + 2 * offset; -// line[bi] += line[ai]; -// } - -// offset *= 2; -// } - -// __syncthreads(); -// // convolution -// if (x == 0) { line[(blockDim.x << 1) - 1] = 0; printf("offset: %d!!!!!!!!!!!!!\n", fx);} - -// for (int d = 1; d < (blockDim.x << 1); d *= 2) -// { -// offset >>= 1; - -// __syncthreads(); -// if (x < d) -// { -// int ai = offset * 2 * x + 1 * offset - 1; -// int bi = offset * 2 * x + 2 * offset - 1; - -// W t = line[ai]; -// line[ai] = line[bi]; -// line[bi] += t; -// } -// } -// __syncthreads(); - -// // calculate sum -// int start = 0; -// int out_idx = 0; -// int end = start + fx; -// while (start < (blockDim.x << 1) && end < (blockDim.x << 1)) -// { -// buffer(y, out_idx) = saturate_cast((line[end] - line[start]) / fx); -// start = end; -// end = start + fx; -// out_idx++; -// } - -// } - - template - __device__ void scan_y(DevMem2D_::scan_line_type> buffer,int fx, int fy, DevMem2D_ dst, - typename scan_traits::scan_line_type* line, int g_base) - { - typedef typename scan_traits::scan_line_type W; - - const int y = threadIdx.x; - const int x = blockIdx.x; - - float scale = 1.f / (fx * fy); - - if (x >= buffer.cols) return; - - int offset = 1; - line[2 * y + 0] = buffer((g_base * fy) + 2 * y + 1, x); - - if (y != (blockDim.x -1) ) - line[2 * y + 1] = buffer((g_base * fy) + 2 * y + 2, x); - else - line[2 * y + 1] = 0; - - __syncthreads(); - - // reduction - for (int d = blockDim.x; d > 0; d >>= 1) - { - __syncthreads(); - if (y < d) - { - int ai = 2 * y * offset -1 + 1 * offset; - int bi = 2 * y * offset -1 + 2 * offset; - line[bi] += line[ai]; - } - - offset *= 2; - } - - __syncthreads(); - // convolution - if (y == 0) line[(blockDim.x << 1) - 1] = (W)buffer(0, x); - - for (int d = 1; d < (blockDim.x << 1); d *= 2) - { - offset >>= 1; - - __syncthreads(); - if (y < d) - { - int ai = offset * 2 * y + 1 * offset - 1; - int bi = offset * 2 * y + 2 * offset - 1; - - - W t = line[ai]; - line[ai] = line[bi]; - line[bi] += t; - } - } - __syncthreads(); - - if (y < dst.rows) - { - W start = (y == 0)? 
(W)0:line[y * fy -1]; - W end = line[y * fy + fy - 1]; - dst(g_base + y ,x) = saturate_cast((end - start) * scale); - } - } - - template - __device__ void scan_x(const DevMem2D_ src, int fx, int fy, DevMem2D_::scan_line_type> buffer, - typename scan_traits::scan_line_type* line, int g_base) - { - typedef typename scan_traits::scan_line_type W; - - const int x = threadIdx.x; - const int y = blockIdx.x; - - float scale = 1.f / (fx * fy); - - if (y >= src.rows) return; - - int offset = 1; - - line[2 * x + 0] = (W)src(y, (g_base * fx) + 2 * x + 1); - - if (x != (blockDim.x -1) ) - line[2 * x + 1] = (W)src(y, (g_base * fx) + 2 * x + 2); - else - line[2 * x + 1] = 0; - - __syncthreads(); - - // reduction - for (int d = blockDim.x; d > 0; d >>= 1) - { - __syncthreads(); - if (x < d) - { - int ai = 2 * x * offset -1 + 1 * offset; - int bi = 2 * x * offset -1 + 2 * offset; - line[bi] += line[ai]; - } - - offset *= 2; - } - - __syncthreads(); - // convolution - if (x == 0) line[(blockDim.x << 1) - 1] = (W)src(y, 0); - - for (int d = 1; d < (blockDim.x << 1); d *= 2) - { - offset >>= 1; - - __syncthreads(); - if (x < d) - { - int ai = offset * 2 * x + 1 * offset - 1; - int bi = offset * 2 * x + 2 * offset - 1; - - W t = line[ai]; - line[ai] = line[bi]; - line[bi] += t; - } - } - __syncthreads(); - - if (x < buffer.cols) - { - W start = (x == 0)? (W)0:line[x * fx -1]; - W end = line[x * fx + fx - 1]; - buffer(y, g_base + x) =(end - start); - } - } - - enum ScanKind { exclusive, inclusive } ; - - template - __device__ __forceinline__ T scan_warp ( volatile T *ptr , const unsigned int idx = threadIdx.x ) - { - const unsigned int lane = idx & 31; - - if ( lane >= 1) ptr [idx ] = ptr [idx - 1] + ptr [idx]; - if ( lane >= 2) ptr [idx ] = ptr [idx - 2] + ptr [idx]; - if ( lane >= 4) ptr [idx ] = ptr [idx - 4] + ptr [idx]; - if ( lane >= 8) ptr [idx ] = ptr [idx - 8] + ptr [idx]; - if ( lane >= 16) ptr [idx ] = ptr [idx - 16] + ptr [idx]; - - if( Kind == inclusive ) - return ptr [idx ]; - else - return (lane > 0) ? ptr [idx - 1] : 0; - } - - template - __device__ __forceinline__ T scan_block( volatile T *ptr) - { - const unsigned int idx = threadIdx.x; - const unsigned int lane = idx & 31; - const unsigned int warp = idx >> 5; - - T val = scan_warp ( ptr , idx ); - __syncthreads (); - - if( lane == 31 ) - ptr [ warp ] = ptr [idx ]; - - __syncthreads (); - - if( warp == 0 ) - scan_warp( ptr , idx ); - - __syncthreads (); - - if ( warp > 0) - val = ptr [warp -1] + val; - - __syncthreads (); - - ptr[idx] = val; - - __syncthreads (); - - return val ; - } - - template - __global__ void resise_scan_fast_x(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, int thred_lines, int stride) - { - extern __shared__ W sbuf[]; - - const unsigned int tid = threadIdx. x; - - // load line-block on shared memory - int y = blockIdx.x / thred_lines; - int input_stride = (blockIdx.x % thred_lines) * stride; - int x = input_stride + tid; - - // store global data in shared memory - if (x < src.cols && y < src.rows) - sbuf[tid] = src(y, x); - else - sbuf[tid] = 0; - __syncthreads(); - - scan_block(sbuf); - - float scale = __fdividef(1.f, fx); - int out_stride = input_stride / fx; - int count = blockDim.x / fx; - - if (tid < count) - { - int start_idx = (tid == 0)? 0 : tid * fx - 1; - int end_idx = tid * fx + fx - 1; - - W start = (tid == 0)? 
(W)0:sbuf[start_idx]; - W end = sbuf[end_idx]; - - dst(y, out_stride + tid) = (end - start); - } - } - - template - __global__ void resise_scan_fast_y(const DevMem2D_ src, DevMem2D_ dst, int fx, int fy, int thred_lines, int stride) - { - extern __shared__ W sbuf[]; - - const unsigned int tid = threadIdx. x; - - // load line-block on shared memory - int x = blockIdx.x / thred_lines; - - int global_stride = (blockIdx.x % thred_lines) * stride; - int y = global_stride + tid; - - // store global data in shared memory - if (x < src.cols && y < src.rows) - sbuf[tid] = src(y, x); - else - sbuf[tid] = 0; - - __syncthreads(); - scan_block(sbuf); - - float scale = __fdividef(1.f, fx * fy); - int out_stride = global_stride / fx; - int count = blockDim.x / fx; - - if (tid < count) - { - int start_idx = (tid == 0)? 0 : tid * fx - 1; - int end_idx = tid * fx + fx - 1; - - W start = (tid == 0)? (W)0:sbuf[start_idx]; - W end = sbuf[end_idx]; - - dst(out_stride + tid, x) = saturate_cast((end - start) * scale); - } - } - - template - void resize_area_gpu(const DevMem2Db src, DevMem2Db dst,float fx, float fy, - int interpolation, DevMem2Df buffer, cudaStream_t stream) - { - (void)interpolation; - - int iscale_x = round(fx); - int iscale_y = round(fy); - - int warps = 4; - const int threads = 32 * warps; - int input_stride = threads / iscale_x; - - int thred_lines = divUp(src.cols, input_stride * iscale_x); - int blocks = src.rows * thred_lines; - - typedef typename scan_traits::scan_line_type smem_type; - - resise_scan_fast_x<<>> - (src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x); - - input_stride = threads / iscale_y; - thred_lines = divUp(src.rows, input_stride * iscale_y); - blocks = dst.cols * thred_lines; - - resise_scan_fast_y<<>> - (buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template void resize_area_gpu(DevMem2Db src, DevMem2Db dst, float fx, float fy, int interpolation, DevMem2Df buffer, cudaStream_t stream); - } // namespace imgproc }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/split_merge.cu b/modules/gpu/src/cuda/split_merge.cu index a9c08448e..6aa98bdff 100644 --- a/modules/gpu/src/cuda/split_merge.cu +++ b/modules/gpu/src/cuda/split_merge.cu @@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device template static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); - mergeC2_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + mergeC2_<<>>( src[0].data, src[0].step, src[1].data, src[1].step, dst.rows, dst.cols, dst.data, dst.step); @@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device template static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); - mergeC3_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + mergeC3_<<>>( src[0].data, src[0].step, src[1].data, src[1].step, src[2].data, src[2].step, @@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device template static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(dst.cols, blockDim.x), 
divUp(dst.rows, blockDim.y)); - mergeC4_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + mergeC4_<<>>( src[0].data, src[0].step, src[1].data, src[1].step, src[2].data, src[2].step, @@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device template static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); - splitC2_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + splitC2_<<>>( src.data, src.step, src.rows, src.cols, dst[0].data, dst[0].step, dst[1].data, dst[1].step); @@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device template static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); - splitC3_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + splitC3_<<>>( src.data, src.step, src.rows, src.cols, dst[0].data, dst[0].step, dst[1].data, dst[1].step, @@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device template static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); - splitC4_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + splitC4_<<>>( src.data, src.step, src.rows, src.cols, dst[0].data, dst[0].step, dst[1].data, dst[1].step, diff --git a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu index 574de533d..6ade899c3 100644 --- a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu +++ b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu @@ -1,7 +1,7 @@ /*M/////////////////////////////////////////////////////////////////////////////////////// // -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. 
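The split_merge.cu hunks above rename the host-side dim3 locals from blockDim/gridDim to block/grid; inside device code those names are CUDA built-ins, so reusing them for host variables invites confusion. Below is a minimal sketch of the launch pattern all six wrappers share (illustration only, not part of the patch: copy2D and launchCopy2D are hypothetical names, and divUp is the helper this patch touches in common.hpp).

    #include <cuda_runtime.h>

    // divUp as defined in opencv2/gpu/device/common.hpp in this patch
    static int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    // Hypothetical kernel; here blockDim/gridDim are the CUDA built-ins.
    __global__ void copy2D(const unsigned char* src, size_t sstep,
                           unsigned char* dst, size_t dstep, int rows, int cols)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < cols && y < rows)
            dst[y * dstep + x] = src[y * sstep + x];
    }

    // Host wrapper in the style of mergeC2_/splitC2_: the locals are now
    // named block/grid, and divUp rounds the grid up to cover every pixel.
    void launchCopy2D(const unsigned char* src, size_t sstep,
                      unsigned char* dst, size_t dstep, int rows, int cols,
                      cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

        copy2D<<<grid, block, 0, stream>>>(src, sstep, dst, dstep, rows, cols);

        if (stream == 0)
            cudaDeviceSynchronize(); // same blocking convention as the wrappers
    }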
@@ -129,9 +129,9 @@ texture tex_diffusivity_y; __global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len) { const int pos = blockIdx.x*blockDim.x + threadIdx.x; - + if(pos >= len) return; - + d_res[pos] = d_op1[pos] + d_op2[pos]; } @@ -265,7 +265,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo /////////////////////////////////////////////////////////////////////////////// template __forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p) -{ +{ //position within shared memory array const int ijs = js * PSOR_PITCH + is; //mirror reflection across borders @@ -299,7 +299,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js, ///\param h number of rows in global memory array ///\param p global memory array pitch in floats /////////////////////////////////////////////////////////////////////////////// -template +template __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p) { const int i = threadIdx.x + 2; @@ -381,7 +381,7 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i /// \param gamma (in) gamma in Brox model (edge importance) /////////////////////////////////////////////////////////////////////////////// -__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, +__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v, @@ -532,16 +532,16 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin // Red-Black SOR ///////////////////////////////////////////////////////////////////////////////////////// -template __global__ void sor_pass(float *new_du, - float *new_dv, - const float *g_inv_denominator_u, +template __global__ void sor_pass(float *new_du, + float *new_dv, + const float *g_inv_denominator_u, const float *g_inv_denominator_v, - const float *g_numerator_u, - const float *g_numerator_v, - const float *g_numerator_dudv, - float omega, - int width, - int height, + const float *g_numerator_u, + const float *g_numerator_v, + const float *g_numerator_dudv, + float omega, + int width, + int height, int stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -604,7 +604,7 @@ template __global__ void sor_pass(float *new_du, if((i+j)%2 == isBlack) { // update du - float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) - + float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) - u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv); du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u; @@ -644,7 +644,7 @@ void InitTextures() initTexture2D(tex_I1); initTexture2D(tex_fine); // for downsampling initTexture2D(tex_coarse); // for prolongation - + initTexture2D(tex_Ix); initTexture2D(tex_Ixx); initTexture2D(tex_Ix0); @@ -725,7 +725,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, const Ncv32u kSourceHeight = frame0.height(); ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT); - ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && + 
ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT); ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED); @@ -780,7 +780,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned); // temporary storage - SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, + SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat)); // image derivatives @@ -800,7 +800,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, { const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f}; - ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, + ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, cudaMemcpyHostToDevice), NCV_CUDA_ERROR); InitTextures(); @@ -827,10 +827,10 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, size_t src_width_in_bytes = kSourceWidth * sizeof(float); size_t src_pitch_in_bytes = frame0.pitch(); - ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), + ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); - ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), + ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); } @@ -876,11 +876,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, NcvRect32u dstROI (0, 0, level_width, level_height); // frame 0 - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI, level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) ); // frame 1 - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI, level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) ); } @@ -956,14 +956,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, dim3 dThreads(32, 6); const int kPitchTex = kLevelStride * sizeof(float); - + NcvSize32u srcSize(kLevelWidth, kLevelHeight); Ncv32u nSrcStep = kLevelStride * sizeof(float); NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight); // Ix0 ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI, - nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); // Iy0 ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI, @@ -987,8 +987,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, // Ixy ncvAssertReturnNcvStat( 
nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI, - nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); - + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); @@ -1017,21 +1017,21 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, { //compute coefficients prepare_sor_stage_1_tex<<>> - (diffusivity_x.ptr(), - diffusivity_y.ptr(), - denom_u.ptr(), - denom_v.ptr(), - num_dudv.ptr(), - num_u.ptr(), - num_v.ptr(), - kLevelWidth, - kLevelHeight, - kLevelStride, - alpha, + (diffusivity_x.ptr(), + diffusivity_y.ptr(), + denom_u.ptr(), + denom_v.ptr(), + num_dudv.ptr(), + num_u.ptr(), + num_v.ptr(), + kLevelWidth, + kLevelHeight, + kLevelStride, + alpha, gamma); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - + ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); @@ -1043,7 +1043,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, prepare_sor_stage_2<<>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - + // linear system coefficients ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); @@ -1055,26 +1055,26 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - + //solve linear system for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration) { float omega = 1.99f; - + ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); sor_pass<0><<>> - (du_new.ptr(), - dv_new.ptr(), - denom_u.ptr(), + (du_new.ptr(), + dv_new.ptr(), + denom_u.ptr(), denom_v.ptr(), - num_u.ptr(), - num_v.ptr(), - num_dudv.ptr(), - omega, - kLevelWidth, - kLevelHeight, + num_u.ptr(), + num_v.ptr(), + num_dudv.ptr(), + omega, + kLevelWidth, + kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); @@ -1083,16 +1083,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); sor_pass<1><<>> - (du.ptr(), - dv.ptr(), - denom_u.ptr(), + (du.ptr(), + dv.ptr(), + denom_u.ptr(), denom_v.ptr(), - num_u.ptr(), - num_v.ptr(), - num_dudv.ptr(), - omega, - kLevelWidth, - kLevelHeight, + num_u.ptr(), + num_v.ptr(), + num_dudv.ptr(), + omega, + kLevelWidth, + 
kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); @@ -1120,19 +1120,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8)); dim3 p_threads(32, 8); - - NcvSize32u srcSize (kLevelWidth, kLevelHeight); + + NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight); NcvSize32u dstSize (nw, nh); NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight); NcvRect32u dstROI (0, 0, nw, nh); - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI, ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) ); ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI, ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) ); ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream); @@ -1148,11 +1148,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn( cudaMemcpy2DAsync - (uOut.ptr(), uOut.pitch(), ptrU->ptr(), + (uOut.ptr(), uOut.pitch(), ptrU->ptr(), kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); ncvAssertCUDAReturn( cudaMemcpy2DAsync - (vOut.ptr(), vOut.pitch(), ptrV->ptr(), + (vOut.ptr(), vOut.pitch(), ptrV->ptr(), kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); diff --git a/modules/gpu/src/nvidia/core/NCV.cu b/modules/gpu/src/nvidia/core/NCV.cu index ba4e08e90..d877b585c 100644 --- a/modules/gpu/src/nvidia/core/NCV.cu +++ b/modules/gpu/src/nvidia/core/NCV.cu @@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType, //=================================================================== -NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment) +NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_) : currentSize(0), _maxSize(0), @@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment) begin(NULL), end(NULL), _memType(NCVMemoryTypeNone), - _alignment(alignment), + _alignment(alignment_), bReusesMemory(false) { - NcvBool bProperAlignment = (alignment & (alignment-1)) == 0; + NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0; ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2"); } -NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr) +NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr) : currentSize(0), _maxSize(0), allocBegin(NULL), _memType(memT), - _alignment(alignment) + _alignment(alignment_) { - NcvBool bProperAlignment = (alignment & (alignment-1)) == 0; + NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0; ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment 
not power of 2"); ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type"); @@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const //=================================================================== -NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment) +NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_) : currentSize(0), _maxSize(0), _memType(memT), - _alignment(alignment) + _alignment(alignment_) { ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", ); } diff --git a/modules/gpu/src/opencv2/gpu/device/common.hpp b/modules/gpu/src/opencv2/gpu/device/common.hpp index 24a447b81..56bcc1af8 100644 --- a/modules/gpu/src/opencv2/gpu/device/common.hpp +++ b/modules/gpu/src/opencv2/gpu/device/common.hpp @@ -64,7 +64,7 @@ #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) #endif -namespace cv { namespace gpu +namespace cv { namespace gpu { void error(const char *error_string, const char *file, const int line, const char *func); @@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int #ifdef __CUDACC__ -namespace cv { namespace gpu -{ - __host__ __device__ __forceinline__ int divUp(int total, int grain) - { - return (total + grain - 1) / grain; +namespace cv { namespace gpu +{ + __host__ __device__ __forceinline__ int divUp(int total, int grain) + { + return (total + grain - 1) / grain; } - namespace device + namespace device { typedef unsigned char uchar; typedef unsigned short ushort; diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp index bd5c49fbe..a0961f1ac 100644 --- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp +++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp @@ -45,7 +45,7 @@ #include "common.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200 @@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device { __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; } }; - - #else // __CUDA_ARCH__ >= 200 - #if defined(_WIN64) || defined(__LP64__) + #else // __CUDA_ARCH__ >= 200 + + #if defined(_WIN64) || defined(__LP64__) // 64-bit register modifier for inlined asm #define OPENCV_GPU_ASM_PTR "l" - #else + #else // 32-bit register modifier for inlined asm #define OPENCV_GPU_ASM_PTR "r" #endif @@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \ } \ }; - + OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8) OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8) OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8) OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h) OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h) OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r) - OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r) - OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f) - OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d) + OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r) + OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f) + OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d) #undef OPENCV_GPU_DEFINE_FORCE_GLOB #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B #undef OPENCV_GPU_ASM_PTR - + #endif // __CUDA_ARCH__ >= 200 }}} // namespace cv { namespace gpu { namespace device diff 
--git a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp index 7ce6994fd..4d073c5f8 100644 --- a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp +++ b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp @@ -44,7 +44,7 @@ #define __OPENCV_GPU_DYNAMIC_SMEM_HPP__ namespace cv { namespace gpu { namespace device -{ +{ template struct DynamicSharedMem { __device__ __forceinline__ operator T*() diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp index 1fd3d9f06..9b4de6c1a 100644 --- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp @@ -45,21 +45,21 @@ #include "warp_reduce.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { struct Emulation { - static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer) - { + static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer) + { #if __CUDA_ARCH__ >= 200 - (void)cta_buffer; - return __ballot(predicate); + (void)cta_buffer; + return __ballot(predicate); #else - int tid = threadIdx.x; - cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; - return warp_reduce(cta_buffer); + int tid = threadIdx.x; + cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; + return warp_reduce(cta_buffer); #endif - } + } }; }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp b/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp index 4be6dd337..984b567f7 100644 --- a/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp +++ b/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp @@ -46,14 +46,14 @@ #include -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - template + template void printFuncAttrib(Func& func) { cudaFuncAttributes attrs; - cudaFuncGetAttributes(&attrs, func); + cudaFuncGetAttributes(&attrs, func); printf("=== Function stats ===\n"); printf("Name: \n"); @@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device printf("ptxVersion = %d\n", attrs.ptxVersion); printf("binaryVersion = %d\n", attrs.binaryVersion); printf("\n"); - fflush(stdout); + fflush(stdout); } }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp index 435fe65f3..32e6d0e37 100644 --- a/modules/gpu/src/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp @@ -48,7 +48,7 @@ #include "vec_traits.hpp" #include "type_traits.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { // Function Objects @@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device template struct bit_not : unary_function { - __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType v) const + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType v) const { return ~v; } @@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device // Generalized Identity Operations template struct identity : unary_function { - __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType x) const + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType x) const { return x; } @@ -278,7 +278,7 @@ namespace cv 
{ namespace gpu { namespace device template struct project1st : binary_function { - __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return lhs; } @@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device template struct project2nd : binary_function { - __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return rhs; } @@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device template struct maximum : binary_function { - __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return lhs < rhs ? rhs : lhs; } @@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device template struct minimum : binary_function { - __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return lhs < rhs ? lhs : rhs; } @@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR - template struct hypot_sqr_func : binary_function + template struct hypot_sqr_func : binary_function { __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType src1, typename TypeTraits::ParameterType src2) const { return src1 * src1 + src2 * src2; } + __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function(){} + __device__ __forceinline__ hypot_sqr_func() : binary_function(){} }; // Saturate Cast Functor @@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device { return (src > thresh) * maxVal; } + __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other) : unary_function(), thresh(other.thresh), maxVal(other.maxVal){} @@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device { return (src <= thresh) * maxVal; } + __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other) : unary_function(), thresh(other.thresh), maxVal(other.maxVal){} @@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {} __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x) const - { - return !pred(x); + { + return !pred(x); } + __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function(){} + __device__ __forceinline__ unary_negate() : unary_function(){} + const Predicate pred; }; + template __host__ __device__ __forceinline__ unary_negate not1(const Predicate& pred) { return unary_negate(pred); @@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device { explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) 
{} - __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x, typename TypeTraits::ParameterType y) const - { - return !pred(x,y); + __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x, + typename TypeTraits::ParameterType y) const + { + return !pred(x,y); } + __device__ __forceinline__ binary_negate(const binary_negate& other) + : binary_function(){} + + __device__ __forceinline__ binary_negate() : + binary_function(){} const Predicate pred; }; + template __host__ __device__ __forceinline__ binary_negate not2(const BinaryPredicate& pred) { return binary_negate(pred); } - template struct binder1st : unary_function + template struct binder1st : unary_function { __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {} @@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device return op(arg1, a); } + __device__ __forceinline__ binder1st(const binder1st& other) : + unary_function(){} + const Op op; const typename Op::first_argument_type arg1; }; + template __host__ __device__ __forceinline__ binder1st bind1st(const Op& op, const T& x) { return binder1st(op, typename Op::first_argument_type(x)); } - template struct binder2nd : unary_function + template struct binder2nd : unary_function { __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {} @@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device return op(a, arg2); } + __device__ __forceinline__ binder2nd(const binder2nd& other) : + unary_function(), op(other.op), arg2(other.arg2){} + const Op op; const typename Op::second_argument_type arg2; }; + template __host__ __device__ __forceinline__ binder2nd bind2nd(const Op& op, const T& x) { return binder2nd(op, typename Op::second_argument_type(x)); } // Functor Traits - template struct IsUnaryFunction { typedef char Yes; @@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device { enum { shift = UnOpShift::shift }; }; - + template struct BinOpShift { enum { shift = 1 }; }; template struct BinOpShift { enum { shift = 4 }; }; template struct BinOpShift { enum { shift = 2 }; }; diff --git a/modules/gpu/src/opencv2/gpu/device/limits.hpp b/modules/gpu/src/opencv2/gpu/device/limits.hpp index 396e9a310..f5dd39bdc 100644 --- a/modules/gpu/src/opencv2/gpu/device/limits.hpp +++ b/modules/gpu/src/opencv2/gpu/device/limits.hpp @@ -46,7 +46,7 @@ #include #include "common.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template struct numeric_limits { diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp index d9fa5ce0c..37204cc41 100644 --- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp @@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } template<> __device__ __forceinline__ uchar saturate_cast(schar v) - { - return (uchar) ::max((int)v, 0); + { + return (uchar) ::max((int)v, 0); } template<> __device__ __forceinline__ uchar saturate_cast(ushort v) - { - return (uchar) ::min((uint)v, (uint)UCHAR_MAX); + { + return (uchar) ::min((uint)v, (uint)UCHAR_MAX); } template<> __device__ __forceinline__ uchar saturate_cast(int v) - { - return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); + { + return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } template<> __device__ __forceinline__ uchar saturate_cast(uint v) - { - return (uchar) ::min(v, (uint)UCHAR_MAX); + { + return (uchar) ::min(v, (uint)UCHAR_MAX); } template<> __device__ __forceinline__ uchar saturate_cast(short v) - { - return saturate_cast((uint)v); + { + return saturate_cast((uint)v); } template<> __device__ __forceinline__ uchar saturate_cast(float v) - { - int iv = __float2int_rn(v); - return saturate_cast(iv); + { + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ uchar saturate_cast(double v) { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); @@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ schar saturate_cast(uchar v) - { - return (schar) ::min((int)v, SCHAR_MAX); + { + return (schar) ::min((int)v, SCHAR_MAX); } template<> __device__ __forceinline__ schar saturate_cast(ushort v) - { - return (schar) ::min((uint)v, (uint)SCHAR_MAX); + { + return (schar) ::min((uint)v, (uint)SCHAR_MAX); } template<> __device__ __forceinline__ schar saturate_cast(int v) { return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } template<> __device__ __forceinline__ schar saturate_cast(short v) - { - return saturate_cast((int)v); + { + return saturate_cast((int)v); } template<> __device__ __forceinline__ schar saturate_cast(uint v) - { - return (schar) ::min(v, (uint)SCHAR_MAX); + { + return (schar) ::min(v, (uint)SCHAR_MAX); } template<> __device__ __forceinline__ schar saturate_cast(float v) - { - int iv = __float2int_rn(v); - return saturate_cast(iv); + { + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ schar saturate_cast(double v) - { + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); @@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ ushort saturate_cast(schar v) - { - return (ushort) ::max((int)v, 0); + { + return (ushort) ::max((int)v, 0); } template<> __device__ __forceinline__ ushort saturate_cast(short v) - { - return (ushort) ::max((int)v, 0); + { + return (ushort) ::max((int)v, 0); } template<> __device__ __forceinline__ ushort saturate_cast(int v) - { - return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); + { + return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); } template<> __device__ __forceinline__ ushort saturate_cast(uint v) - { - return (ushort) ::min(v, (uint)USHRT_MAX); + { + return (ushort) ::min(v, (uint)USHRT_MAX); } template<> __device__ __forceinline__ ushort saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ ushort saturate_cast(double v) - { + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); @@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ short saturate_cast(ushort v) - { - return (short) ::min((int)v, SHRT_MAX); + { + return (short) ::min((int)v, SHRT_MAX); } template<> __device__ __forceinline__ short saturate_cast(int v) { return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } template<> __device__ __forceinline__ short saturate_cast(uint v) - { - return (short) ::min(v, (uint)SHRT_MAX); + { + return (short) ::min(v, (uint)SHRT_MAX); } template<> __device__ __forceinline__ short saturate_cast(float v) - { - int iv = __float2int_rn(v); - return saturate_cast(iv); + { + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ short saturate_cast(double v) - { + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); #endif } - template<> __device__ __forceinline__ int saturate_cast(float v) - { - return __float2int_rn(v); + template<> __device__ __forceinline__ int saturate_cast(float v) + { + return __float2int_rn(v); } - template<> __device__ __forceinline__ int saturate_cast(double v) + template<> __device__ __forceinline__ int saturate_cast(double v) { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 return __double2int_rn(v); @@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ uint saturate_cast(float v) - { - return __float2uint_rn(v); + { + return __float2uint_rn(v); } - template<> __device__ __forceinline__ uint saturate_cast(double v) - { + template<> __device__ __forceinline__ uint saturate_cast(double v) + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 return __double2uint_rn(v); #else diff --git a/modules/gpu/src/opencv2/gpu/device/scan.hpp b/modules/gpu/src/opencv2/gpu/device/scan.hpp new file mode 100644 index 000000000..b55ff4181 --- /dev/null +++ b/modules/gpu/src/opencv2/gpu/device/scan.hpp @@ -0,0 +1,166 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_SCAN_HPP__
+#define __OPENCV_GPU_SCAN_HPP__
+
+    enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
+
+    template <ScanKind Kind, typename T, typename F> struct WarpScan
+    {
+        __device__ __forceinline__ WarpScan() {}
+        __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
+
+        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+        {
+            const unsigned int lane = idx & 31;
+            F op;
+
+            if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+            if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+            if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+            if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+            if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+
+            if( Kind == INCLUSIVE )
+                return ptr [idx];
+            else
+                return (lane > 0) ? ptr [idx - 1] : 0;
+        }
+
+        __device__ __forceinline__ unsigned int index(const unsigned int tid)
+        {
+            return tid;
+        }
+
+        __device__ __forceinline__ void init(volatile T *ptr){}
+
+        static const int warp_offset = 0;
+
+        typedef WarpScan<Kind, T, F> merge;
+    };
+
+    template <ScanKind Kind, typename T, typename F> struct WarpScanNoComp
+    {
+        __device__ __forceinline__ WarpScanNoComp() {}
+        __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
+
+        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+        {
+            const unsigned int lane = threadIdx.x & 31;
+            F op;
+
+            ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+            ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+
+            if( Kind == INCLUSIVE )
+                return ptr [idx];
+            else
+                return (lane > 0) ? ptr [idx - 1] : 0;
+        }
+
+        __device__ __forceinline__ unsigned int index(const unsigned int tid)
+        {
+            return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
+        }
+
+        __device__ __forceinline__ void init(volatile T *ptr)
+        {
+            ptr[threadIdx.x] = 0;
+        }
+
+        static const int warp_smem_stride = 32 + 16 + 1;
+        static const int warp_offset      = 16;
+        static const int warp_log         = 5;
+        static const int warp_mask        = 31;
+
+        typedef WarpScanNoComp<Kind, T, F> merge;
+    };
+
+    template <ScanKind Kind, typename T, typename Sc, typename F> struct BlockScan
+    {
+        __device__ __forceinline__ BlockScan() {}
+        __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
+
+        __device__ __forceinline__ T operator()(volatile T *ptr)
+        {
+            const unsigned int tid  = threadIdx.x;
+            const unsigned int lane = tid & warp_mask;
+            const unsigned int warp = tid >> warp_log;
+
+            Sc scan;
+            typename Sc::merge merge_scan;
+            const unsigned int idx = scan.index(tid);
+
+            T val = scan(ptr, idx);
+            __syncthreads ();
+
+            if( warp == 0)
+                scan.init(ptr);
+            __syncthreads ();
+
+            if( lane == 31 )
+                ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
+            __syncthreads ();
+
+            if( warp == 0 )
+                merge_scan(ptr, idx);
+            __syncthreads();
+
+            if ( warp > 0)
+                val = ptr [scan.warp_offset + warp - 1] + val;
+            __syncthreads ();
+
+            ptr[idx] = val;
+            __syncthreads ();
+
+            return val ;
+        }
+
+        static const int warp_log  = 5;
+        static const int warp_mask = 31;
+    };
+
+#endif
\ No newline at end of file
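The new header is deliberately self-contained and sits outside any namespace, so a kernel can use it directly. A minimal usage sketch follows (illustration only, not part of the patch: the kernel name and the fixed 256-thread, one-dimensional launch are assumptions); it computes an inclusive per-block prefix sum by pairing WarpScan with BlockScan and the plus<> functor from functional.hpp.

    #include "opencv2/gpu/device/scan.hpp"
    #include "opencv2/gpu/device/functional.hpp" // for cv::gpu::device::plus<>

    // Hypothetical kernel: inclusive prefix sum of up to 256 floats per block.
    // Assumes blockDim.x == 256 so every thread participates in the scan.
    __global__ void block_prefix_sum(const float* in, float* out, int n)
    {
        __shared__ float buf[256];

        const int tid = threadIdx.x;
        buf[tid] = (tid < n) ? in[tid] : 0.0f; // zero-pad the tail
        __syncthreads();

        // WarpScan handles each 32-lane segment with the Hillis-Steele steps
        // above; BlockScan stitches the per-warp totals together by running
        // a second warp-level scan over them.
        typedef cv::gpu::device::plus<float> Add;
        typedef WarpScan<INCLUSIVE, float, Add> Warp;
        BlockScan<INCLUSIVE, float, Warp, Add> scan;

        float val = scan(buf); // each thread receives its inclusive prefix

        if (tid < n)
            out[tid] = val;
    }

The WarpScanNoComp variant drops the per-step lane comparisons; its index() maps each warp onto a padded shared-memory segment (warp_smem_stride = 32 + 16 + 1) so the unguarded reads below a warp's first element land in a 16-element apron that init() zero-fills rather than in another warp's data.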
+
+        __device__ __forceinline__ unsigned int index(const unsigned int tid)
+        {
+            return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
+        }
+
+        __device__ __forceinline__ void init(volatile T *ptr)
+        {
+            ptr[threadIdx.x] = 0;
+        }
+
+        static const int warp_smem_stride = 32 + 16 + 1;
+        static const int warp_offset      = 16;
+        static const int warp_log         = 5;
+        static const int warp_mask        = 31;
+
+        typedef WarpScanNoComp<Kind, T, F> merge;
+    };
+
+    template <ScanKind Kind, typename T, typename Sc, typename F> struct BlockScan
+    {
+        __device__ __forceinline__ BlockScan() {}
+        __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
+
+        __device__ __forceinline__ T operator()(volatile T *ptr)
+        {
+            const unsigned int tid  = threadIdx.x;
+            const unsigned int lane = tid & warp_mask;
+            const unsigned int warp = tid >> warp_log;
+
+            Sc scan;
+            typename Sc::merge merge_scan;
+            const unsigned int idx = scan.index(tid);
+
+            T val = scan(ptr, idx);
+            __syncthreads ();
+
+            if( warp == 0)
+                scan.init(ptr);
+            __syncthreads ();
+
+            if( lane == 31 )
+                ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
+            __syncthreads ();
+
+            if( warp == 0 )
+                merge_scan(ptr, idx);
+            __syncthreads();
+
+            if ( warp > 0)
+                val = ptr [scan.warp_offset + warp - 1] + val;
+            __syncthreads ();
+
+            ptr[idx] = val;
+            __syncthreads ();
+
+            return val ;
+        }
+
+        static const int warp_log  = 5;
+        static const int warp_mask = 31;
+    };
+
+#endif
\ No newline at end of file
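
How the pieces compose: the Sc scanner does a warp-synchronous Hillis-Steele scan of each warp in place, lane 31 of every warp parks its warp total at ptr[scan.warp_offset + warp], warp 0 rescans those totals with Sc::merge, and every other warp adds its predecessor's running total. A minimal sketch of an inclusive integer sum scan over one 256-thread block, assuming the template parameter order shown above (Kind, T, Sc, F) and an illustrative plus functor:

// Illustrative sketch -- launch with exactly 256 threads per block.
#include "scan.hpp"

struct PlusInt
{
    __device__ __forceinline__ int operator()(int a, int b) const { return a + b; }
};

__global__ void scan_demo(const int* in, int* out, int n)   // n <= 256
{
    // One slot per thread; with WarpScan (warp_offset == 0) the per-warp
    // totals reuse the low slots of the same buffer after the warp pass.
    __shared__ int smem[256];

    const int tid = threadIdx.x;
    smem[tid] = (tid < n) ? in[tid] : 0;   // pad with the operation's identity
    __syncthreads();

    BlockScan<INCLUSIVE, int, WarpScan<INCLUSIVE, int, PlusInt>, PlusInt> scan;
    const int prefix = scan(smem);         // inclusive prefix sum for this thread

    if (tid < n)
        out[tid] = prefix;
}

WarpScanNoComp trades shared memory for speed: it pads each warp's segment (warp_smem_stride = 32 + 16 + 1) and offsets indices by 16 so the five update steps run without lane comparisons, assuming the apron in front of each segment has been zeroed.
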
diff --git a/modules/gpu/src/opencv2/gpu/device/static_check.hpp b/modules/gpu/src/opencv2/gpu/device/static_check.hpp
index 7f6aafe4c..178c0f707 100644
--- a/modules/gpu/src/opencv2/gpu/device/static_check.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/static_check.hpp
@@ -43,27 +43,27 @@
 #ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
 #define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
 
-#if defined(__CUDACC__)
-    #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
+#if defined(__CUDACC__)
+    #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
 #else
     #define __OPENCV_GPU_HOST_DEVICE__
-#endif
+#endif
 
-namespace cv { namespace gpu
-{
+namespace cv { namespace gpu
+{
     namespace device
     {
         template<bool expr> struct Static {};
-
-        template<> struct Static<true>
-        {
-            __OPENCV_GPU_HOST_DEVICE__ static void check() {};
+
+        template<> struct Static<true>
+        {
+            __OPENCV_GPU_HOST_DEVICE__ static void check() {};
         };
-    }
+    }
 
     using ::cv::gpu::device::Static;
 }}
 
 #undef __OPENCV_GPU_HOST_DEVICE__
 
-#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
+#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
\ No newline at end of file
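
Static<expr> is the classic pre-C++11 compile-time assertion: only the Static<true> specialization defines check(), so instantiating Static<false>::check() is a compile error. A minimal sketch (the function is illustrative, not part of this patch) of the pattern, mirroring the StaticAssert<...>::check() guards in utility.hpp below:

// Illustrative sketch.
#include "static_check.hpp"

template <int n, typename T>
void copy_fixed(const T* src, T* dst)
{
    // Compiles only when 1 <= n <= 512; note the parentheses, required
    // because the expression contains '>'.
    cv::gpu::Static<(n >= 1 && n <= 512)>::check();

    for (int i = 0; i < n; ++i)
        dst[i] = src[i];
}
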
diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp
index a0e79dfa1..d6b3cf9d5 100644
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -47,7 +47,7 @@
 #include "utility.hpp"
 #include "detail/transform_detail.hpp"
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
     template <typename T, typename D, typename UnOp, typename Mask>
     static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
diff --git a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
index 93c7f1b84..e3684df86 100644
--- a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
@@ -45,11 +45,11 @@
 
 #include "detail/type_traits_detail.hpp"
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
     template <typename T> struct IsSimpleParameter
     {
-        enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
+        enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
             type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
     };
 
@@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device
         enum { isVolatile       = type_traits_detail::UnVolatile<T>::value };
 
         enum { isReference      = type_traits_detail::ReferenceTraits<U>::value };
-        enum { isPointer        = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<U>::type>::value };
+        enum { isPointer        = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<U>::type>::value };
 
-        enum { isUnsignedInt    = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
-        enum { isSignedInt      = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
-        enum { isIntegral       = type_traits_detail::IsIntegral<UnqualifiedType>::value };
-        enum { isFloat          = type_traits_detail::IsFloat<UnqualifiedType>::value };
-        enum { isArith          = isIntegral || isFloat };
-        enum { isVec            = type_traits_detail::IsVec<UnqualifiedType>::value };
-
-        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
+        enum { isUnsignedInt    = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
+        enum { isSignedInt      = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
+        enum { isIntegral       = type_traits_detail::IsIntegral<UnqualifiedType>::value };
+        enum { isFloat          = type_traits_detail::IsFloat<UnqualifiedType>::value };
+        enum { isArith          = isIntegral || isFloat };
+        enum { isVec            = type_traits_detail::IsVec<UnqualifiedType>::value };
+
+        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
             T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
     };
 }}}
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp
index cb4db80a0..78d82e33a 100644
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -47,17 +47,17 @@
 #include "datamov_utils.hpp"
 #include "detail/utility_detail.hpp"
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
-    #define OPENCV_GPU_LOG_WARP_SIZE        (5)
-    #define OPENCV_GPU_WARP_SIZE            (1 << OPENCV_GPU_LOG_WARP_SIZE)
+    #define OPENCV_GPU_LOG_WARP_SIZE        (5)
+    #define OPENCV_GPU_WARP_SIZE            (1 << OPENCV_GPU_LOG_WARP_SIZE)
     #define OPENCV_GPU_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
     #define OPENCV_GPU_MEM_BANKS            (1 << OPENCV_GPU_LOG_MEM_BANKS)
 
     ///////////////////////////////////////////////////////////////////////////////
     // swap
 
-    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
+    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
     {
         const T temp = a;
         a = b;
@@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device
     {
         explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
         __host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}
-
+
         __device__ __forceinline__ bool operator()(int y, int x) const
-        {
+        {
             return mask.ptr(y)[x] != 0;
         }
 
@@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device
 
     struct SingleMaskChannels
     {
-        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
+        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
             : mask(mask_), channels(channels_) {}
         __host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
             :mask(mask_.mask), channels(mask_.channels){}
-
+
         __device__ __forceinline__ bool operator()(int y, int x) const
-        {
+        {
             return mask.ptr(y)[x / channels] != 0;
         }
 
@@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
         {
             curMask = maskCollection[z];
         }
-
+
         __device__ __forceinline__ bool operator()(int y, int x) const
         {
             uchar val;
@@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device
         utility_detail::ReductionDispatcher<n <= 64>::reduce(data, partial_reduction, tid, op);
     }
 
-    template <int n, typename T, typename V, typename Pred>
+    template <int n, typename T, typename V, typename Pred>
     __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
     {
         StaticAssert<n >= 8 && n <= 512>::check();
         utility_detail::PredValReductionDispatcher<n <= 64>::reduce(myData, myVal, sdata, sval, tid, pred);
     }
 
-    template <int n, typename T, typename V1, typename V2, typename Pred>
+    template <int n, typename T, typename V1, typename V2, typename Pred>
     __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
     {
         StaticAssert<n >= 8 && n <= 512>::check();
         utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
     }
-
+
     ///////////////////////////////////////////////////////////////////////////////
     // Solve linear system
 
@@ -212,17 +212,17 @@ namespace cv { namespace gpu { namespace device
         {
             double invdet = 1.0 / det;
 
-            x[0] = saturate_cast<T>(invdet *
+            x[0] = saturate_cast<T>(invdet *
                 (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
                  A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
                  A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));
 
-            x[1] = saturate_cast<T>(invdet *
+            x[1] = saturate_cast<T>(invdet *
                 (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
                  b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
                  A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));
 
-            x[2] = saturate_cast<T>(invdet *
+            x[2] = saturate_cast<T>(invdet *
                 (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
                  A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
                  b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
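
SingleMask and SingleMaskChannels follow the module's mask-functor convention: any type exposing bool operator()(int y, int x) const can be passed where a Mask template parameter is expected. A minimal sketch (the kernel and the pass-through functor are illustrative, not part of this patch):

// Illustrative sketch.
struct AlwaysPassMask
{
    __device__ __forceinline__ bool operator()(int, int) const { return true; }
};

template <typename T, typename Mask>
__global__ void set_masked(T* dst, size_t step, int rows, int cols, T value, Mask mask)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // step is the row pitch in bytes, as with PtrStepb above.
    if (x < cols && y < rows && mask(y, x))
        *((T*)((char*)dst + y * step) + x) = value;
}
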
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
index a1ead9f52..113f3dd51 100644
--- a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
@@ -47,7 +47,7 @@
 #include "functional.hpp"
 #include "detail/vec_distance_detail.hpp"
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
     template <typename T> struct L1Dist
     {
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
     };
 
     // calc distance between two vectors in global memory
-    template <int THREAD_DIM, typename Dist, typename T1, typename T2>
+    template <int THREAD_DIM, typename Dist, typename T1, typename T2>
     __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
     {
         for (int i = tid; i < len; i += THREAD_DIM)
@@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device
 
     // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
     template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
     __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
-    {
+    {
         vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
-
+
         dist.reduceAll<THREAD_DIM>(smem, tid);
     }
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
index 833abcbc3..67e064a38 100644
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@@ -47,7 +47,7 @@
 #include "vec_traits.hpp"
 #include "functional.hpp"
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
     namespace vec_math_detail
     {
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
     }
 
     namespace vec_math_detail
-    {
+    {
         template <typename T1, typename T2> struct BinOpTraits
         {
             typedef int argument_type;
@@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device
 #undef OPENCV_GPU_IMPLEMENT_VEC_OP
 #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
 }}} // namespace cv { namespace gpu { namespace device
-
+
 #endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
index 955fe0d4b..9e309fbe0 100644
--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@@ -45,7 +45,7 @@
 
 #include "common.hpp"
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
     template<typename T, int N> struct TypeVec;
 
@@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device
 
 #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
 
-    template<> struct VecTraits<char>
-    {
+    template<> struct VecTraits<char>
+    {
         typedef char elem_type;
-        enum {cn=1};
+        enum {cn=1};
         static __device__ __host__ __forceinline__ char all(char v) {return v;}
         static __device__ __host__ __forceinline__ char make(char x) {return x;}
         static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
     };
-    template<> struct VecTraits<schar>
-    {
+    template<> struct VecTraits<schar>
+    {
         typedef schar elem_type;
-        enum {cn=1};
+        enum {cn=1};
         static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
         static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
         static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
diff --git a/modules/gpu/src/opencv2/gpu/device/warp.hpp b/modules/gpu/src/opencv2/gpu/device/warp.hpp
index 0ac67f47a..59b8568c7 100644
--- a/modules/gpu/src/opencv2/gpu/device/warp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/warp.hpp
@@ -43,7 +43,7 @@
 #ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
 #define __OPENCV_GPU_DEVICE_WARP_HPP__
 
-namespace cv { namespace gpu { namespace device
+namespace cv { namespace gpu { namespace device
 {
     struct Warp
     {
@@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device
 
         template<typename It, typename T>
         static __device__ __forceinline__ void fill(It beg, It end, const T& value)
-        {
+        {
             for(It t = beg + laneId(); t < end; t += STRIDE)
                 *t = value;
-        }
+        }
 
         template<typename InIt, typename OutIt>
         static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
-        {
+        {
             for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
                 *out = *t;
             return out;
-        }
+        }
 
         template<typename InIt, typename OutIt, class UnOp>
         static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
@@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
         {
             unsigned int lane = laneId();
 
-            InIt1 t1 = beg1 + lane;
+            InIt1 t1 = beg1 + lane;
             InIt2 t2 = beg2 + lane;
             for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
                 *out = op(*t1, *t2);
@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
         template <typename OutIt, typename T>
         static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
         {
-            unsigned int lane = laneId();
+            unsigned int lane = laneId();
             value += lane;
 
             for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
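
A minimal sketch of the warp-cooperative primitives above: a single warp fills and then numbers a shared buffer, each thread touching every STRIDE-th element ("yota" is this header's spelling of iota). Kernel name and sizes are illustrative, not part of this patch.

// Illustrative sketch -- launch with at least 32 threads per block.
#include "warp.hpp"

using cv::gpu::device::Warp;

__global__ void warp_demo(int* out, int n)   // n <= 128
{
    __shared__ int buf[128];

    if (threadIdx.x < 32)                    // first warp only
    {
        Warp::fill(buf, buf + 128, 0);       // buf[i] = 0
        Warp::yota(buf, buf + 128, 1);       // buf[i] = 1 + i
    }
    __syncthreads();

    if ((int)threadIdx.x < n)
        out[threadIdx.x] = buf[threadIdx.x];
}
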
diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp
index 25bdce42d..626f5aa0a 100644
--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@@ -80,51 +80,9 @@ namespace cv { namespace gpu { namespace device
 
             template <typename T>
             void resize_gpu(DevMem2Db src, DevMem2Db srcWhole, int xoff, int yoff, float fx, float fy,
                             DevMem2Db dst, int interpolation, cudaStream_t stream);
-
-            template <typename T>
-            void resize_area_gpu(const DevMem2Db src, DevMem2Db dst, float fx, float fy,
-                                 int interpolation, DevMem2Df buffer, cudaStream_t stream);
         }
 }}}
 
-void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize, double fx, double fy,
-                     int interpolation, Stream& s)
-{
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(interpolation == INTER_AREA);
-    CV_Assert( (fx < 1.0) && (fy < 1.0));
-    CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
-    CV_Assert(src.cols >= 128 && src.rows >= 128);
-    CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);
-
-    if (dsize == Size())
-        dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
-    else
-    {
-        fx = static_cast<double>(dsize.width) / src.cols;
-        fy = static_cast<double>(dsize.height) / src.rows;
-    }
-
-    fx = static_cast<float>(1.0 / fx);
-    fy = static_cast<float>(1.0 / fy);
-
-    dst.create(dsize, src.type());
-    buffer.create(cv::Size(dsize.width, src.rows), CV_32FC1);
-
-    if (dsize == src.size())
-    {
-        if (s)
-            s.enqueueCopy(src, dst);
-        else
-            src.copyTo(dst);
-        return;
-    }
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    cv::gpu::device::imgproc::resize_area_gpu<uchar>(src, dst, fx, fy, interpolation, buffer, stream);
-}
-
 void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
 {
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
diff --git a/modules/gpu/test/test_resize.cpp b/modules/gpu/test/test_resize.cpp
index 81de33a1b..4873dcc4d 100644
--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
@@ -182,45 +182,6 @@ PARAM_TEST_CASE(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, double, Interpolation, UseRoi)
     }
 };
 
-TEST_P(ResizeArea, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(cv::Size(cv::saturate_cast<int>(src.cols * coeff), cv::saturate_cast<int>(src.rows * coeff)), type, useRoi);
-    cv::gpu::GpuMat buffer = createMat(cv::Size(dst.cols, src.rows), CV_32FC1);
-
-    cv::gpu::resize(loadMat(src, useRoi), dst, buffer, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat dst_cpu;
-
-    cv::resize(src, dst_cpu, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat gpu_buff;
-    buffer.download(gpu_buff);
-
-    cv::Mat gpu;
-    dst.download(gpu);
-
-//    std::cout // << src
-//              // << std::endl << std::endl
-//              // << gpu_buff
-//              // << std::endl << std::endl
-//              << gpu
-//              << std::endl << std::endl
-//              << dst_cpu << std::endl;
-
-
-    EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(cv::Size(640, 480)), //DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
-    testing::Values(0.05, 0.1),
-    testing::Values(Interpolation(cv::INTER_AREA)),
-    WHOLE_SUBMAT));
-
 ///////////////////////////////////////////////////////////////////
 // Test NPP