diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index 84b95a430..9ece3a059 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -1,8 +1,8 @@
 if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
     message(STATUS "WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.")
-    return()
+    return()
 endif()
-
+
 find_package(CUDA 4.1)
 
 if(CUDA_FOUND)
@@ -23,7 +23,7 @@ if(CUDA_FOUND)
     else()
         set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
     endif()
-
+
     set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
 
     string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
@@ -89,8 +89,8 @@ if(CUDA_FOUND)
       set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
     endif()
 
-    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
-    set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
+    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
+    set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
     string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
     CUDA_COMPILE(${VAR} ${ARGN})
     set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_})
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index b261ad086..ab7fb4240 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -90,6 +90,40 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
                         Interpolation(cv::INTER_CUBIC), Interpolation(cv::INTER_AREA)),
                     testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
 
+GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
+{
+    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    cv::Size size = GET_PARAM(1);
+    int type = GET_PARAM(2);
+    int interpolation = cv::INTER_AREA;
+    double f = GET_PARAM(3);
+
+    cv::Mat src_host(size, type);
+    fill(src_host, 0, 255);
+
+    cv::gpu::GpuMat src(src_host);
+    cv::gpu::GpuMat dst;
+
+    cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
+
+    declare.time(1.0);
+
+    TEST_CYCLE()
+    {
+        cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
+                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(Scale(0.2), Scale(0.1), Scale(0.05))));
+
 //////////////////////////////////////////////////////////////////////
 // WarpAffine
 
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index 633b7ee5c..a0be65c3c 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
 
     struct Mask8U
     {
-        explicit Mask8U(PtrStepb mask): mask(mask) {}
+        explicit Mask8U(PtrStepb mask_): mask(mask_) {}
 
         __device__ __forceinline__ bool operator()(int y, int x) const
         {
diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu
index b8edce79a..7bcb5a3a6 100644
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -46,7 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include
"opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/filters.hpp" -# include +#include +#include namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/src/cuda/split_merge.cu b/modules/gpu/src/cuda/split_merge.cu index a9c08448e..6aa98bdff 100644 --- a/modules/gpu/src/cuda/split_merge.cu +++ b/modules/gpu/src/cuda/split_merge.cu @@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device template static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); - mergeC2_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + mergeC2_<<>>( src[0].data, src[0].step, src[1].data, src[1].step, dst.rows, dst.cols, dst.data, dst.step); @@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device template static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); - mergeC3_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + mergeC3_<<>>( src[0].data, src[0].step, src[1].data, src[1].step, src[2].data, src[2].step, @@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device template static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); - mergeC4_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + mergeC4_<<>>( src[0].data, src[0].step, src[1].data, src[1].step, src[2].data, src[2].step, @@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device template static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); - splitC2_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + splitC2_<<>>( src.data, src.step, src.rows, src.cols, dst[0].data, dst[0].step, dst[1].data, dst[1].step); @@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device template static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); - splitC3_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + splitC3_<<>>( src.data, src.step, src.rows, src.cols, dst[0].data, dst[0].step, dst[1].data, dst[1].step, @@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device template static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) { - dim3 blockDim(32, 8); - dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); - splitC4_<<>>( + dim3 block(32, 8); + dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + splitC4_<<>>( src.data, src.step, src.rows, src.cols, dst[0].data, dst[0].step, dst[1].data, dst[1].step, diff --git a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu index 574de533d..6ade899c3 100644 --- a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu +++ b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu @@ -1,7 +1,7 @@ /*M/////////////////////////////////////////////////////////////////////////////////////// // -// IMPORTANT: 
READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. @@ -129,9 +129,9 @@ texture tex_diffusivity_y; __global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len) { const int pos = blockIdx.x*blockDim.x + threadIdx.x; - + if(pos >= len) return; - + d_res[pos] = d_op1[pos] + d_op2[pos]; } @@ -265,7 +265,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo /////////////////////////////////////////////////////////////////////////////// template __forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p) -{ +{ //position within shared memory array const int ijs = js * PSOR_PITCH + is; //mirror reflection across borders @@ -299,7 +299,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js, ///\param h number of rows in global memory array ///\param p global memory array pitch in floats /////////////////////////////////////////////////////////////////////////////// -template +template __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p) { const int i = threadIdx.x + 2; @@ -381,7 +381,7 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i /// \param gamma (in) gamma in Brox model (edge importance) /////////////////////////////////////////////////////////////////////////////// -__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, +__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v, @@ -532,16 +532,16 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin // Red-Black SOR ///////////////////////////////////////////////////////////////////////////////////////// -template __global__ void sor_pass(float *new_du, - float *new_dv, - const float *g_inv_denominator_u, +template __global__ void sor_pass(float *new_du, + float *new_dv, + const float *g_inv_denominator_u, const float *g_inv_denominator_v, - const float *g_numerator_u, - const float *g_numerator_v, - const float *g_numerator_dudv, - float omega, - int width, - int height, + const float *g_numerator_u, + const float *g_numerator_v, + const float *g_numerator_dudv, + float omega, + int width, + int height, int stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -604,7 +604,7 @@ template __global__ void sor_pass(float *new_du, if((i+j)%2 == isBlack) { // update du - float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) - + float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) - u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv); du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u; @@ -644,7 +644,7 @@ void InitTextures() initTexture2D(tex_I1); initTexture2D(tex_fine); // for downsampling initTexture2D(tex_coarse); // for prolongation - + initTexture2D(tex_Ix); initTexture2D(tex_Ixx); initTexture2D(tex_Ix0); @@ -725,7 +725,7 @@ NCVStatus NCVBroxOpticalFlow(const 
NCVBroxOpticalFlowDescriptor desc, const Ncv32u kSourceHeight = frame0.height(); ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT); - ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && + ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT); ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED); @@ -780,7 +780,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned); // temporary storage - SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, + SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat)); // image derivatives @@ -800,7 +800,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, { const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f}; - ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, + ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, cudaMemcpyHostToDevice), NCV_CUDA_ERROR); InitTextures(); @@ -827,10 +827,10 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, size_t src_width_in_bytes = kSourceWidth * sizeof(float); size_t src_pitch_in_bytes = frame0.pitch(); - ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), + ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); - ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), + ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); } @@ -876,11 +876,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, NcvRect32u dstROI (0, 0, level_width, level_height); // frame 0 - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI, level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) ); // frame 1 - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI, level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) ); } @@ -956,14 +956,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, dim3 dThreads(32, 6); const int kPitchTex = kLevelStride * sizeof(float); - + NcvSize32u srcSize(kLevelWidth, kLevelHeight); Ncv32u nSrcStep = kLevelStride * sizeof(float); NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight); // Ix0 ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI, - nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); + nppStBorderMirror, derivativeFilter.ptr(), 
kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); // Iy0 ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI, @@ -987,8 +987,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, // Ixy ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI, - nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); - + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); @@ -1017,21 +1017,21 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, { //compute coefficients prepare_sor_stage_1_tex<<>> - (diffusivity_x.ptr(), - diffusivity_y.ptr(), - denom_u.ptr(), - denom_v.ptr(), - num_dudv.ptr(), - num_u.ptr(), - num_v.ptr(), - kLevelWidth, - kLevelHeight, - kLevelStride, - alpha, + (diffusivity_x.ptr(), + diffusivity_y.ptr(), + denom_u.ptr(), + denom_v.ptr(), + num_dudv.ptr(), + num_u.ptr(), + num_v.ptr(), + kLevelWidth, + kLevelHeight, + kLevelStride, + alpha, gamma); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - + ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); @@ -1043,7 +1043,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, prepare_sor_stage_2<<>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - + // linear system coefficients ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); @@ -1055,26 +1055,26 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); - + //solve linear system for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration) { float omega = 1.99f; - + ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); sor_pass<0><<>> - (du_new.ptr(), - dv_new.ptr(), - denom_u.ptr(), + (du_new.ptr(), + dv_new.ptr(), + denom_u.ptr(), denom_v.ptr(), - num_u.ptr(), - num_v.ptr(), - num_dudv.ptr(), - omega, - kLevelWidth, - kLevelHeight, + num_u.ptr(), + num_v.ptr(), + num_dudv.ptr(), + omega, + kLevelWidth, + kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); @@ -1083,16 +1083,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), 
NCV_CUDA_ERROR); sor_pass<1><<>> - (du.ptr(), - dv.ptr(), - denom_u.ptr(), + (du.ptr(), + dv.ptr(), + denom_u.ptr(), denom_v.ptr(), - num_u.ptr(), - num_v.ptr(), - num_dudv.ptr(), - omega, - kLevelWidth, - kLevelHeight, + num_u.ptr(), + num_v.ptr(), + num_dudv.ptr(), + omega, + kLevelWidth, + kLevelHeight, kLevelStride); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); @@ -1120,19 +1120,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8)); dim3 p_threads(32, 8); - - NcvSize32u srcSize (kLevelWidth, kLevelHeight); + + NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight); NcvSize32u dstSize (nw, nh); NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight); NcvRect32u dstROI (0, 0, nw, nh); - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI, ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) ); ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); - ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, + ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI, ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) ); ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream); @@ -1148,11 +1148,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc, ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn( cudaMemcpy2DAsync - (uOut.ptr(), uOut.pitch(), ptrU->ptr(), + (uOut.ptr(), uOut.pitch(), ptrU->ptr(), kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); ncvAssertCUDAReturn( cudaMemcpy2DAsync - (vOut.ptr(), vOut.pitch(), ptrV->ptr(), + (vOut.ptr(), vOut.pitch(), ptrV->ptr(), kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); diff --git a/modules/gpu/src/nvidia/core/NCV.cu b/modules/gpu/src/nvidia/core/NCV.cu index 7b271aeec..7a2d73e6b 100644 --- a/modules/gpu/src/nvidia/core/NCV.cu +++ b/modules/gpu/src/nvidia/core/NCV.cu @@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType, //=================================================================== -NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment) +NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_) : currentSize(0), _maxSize(0), @@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment) begin(NULL), end(NULL), _memType(NCVMemoryTypeNone), - _alignment(alignment), + _alignment(alignment_), bReusesMemory(false) { - NcvBool bProperAlignment = (alignment & (alignment-1)) == 0; + NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0; ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2"); } -NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr) +NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr) : currentSize(0), _maxSize(0), 
allocBegin(NULL), _memType(memT), - _alignment(alignment) + _alignment(alignment_) { - NcvBool bProperAlignment = (alignment & (alignment-1)) == 0; + NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0; ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2"); ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type"); @@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const //=================================================================== -NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment) +NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_) : currentSize(0), _maxSize(0), _memType(memT), - _alignment(alignment) + _alignment(alignment_) { ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", ); } diff --git a/modules/gpu/src/opencv2/gpu/device/common.hpp b/modules/gpu/src/opencv2/gpu/device/common.hpp index 24a447b81..56bcc1af8 100644 --- a/modules/gpu/src/opencv2/gpu/device/common.hpp +++ b/modules/gpu/src/opencv2/gpu/device/common.hpp @@ -64,7 +64,7 @@ #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) #endif -namespace cv { namespace gpu +namespace cv { namespace gpu { void error(const char *error_string, const char *file, const int line, const char *func); @@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int #ifdef __CUDACC__ -namespace cv { namespace gpu -{ - __host__ __device__ __forceinline__ int divUp(int total, int grain) - { - return (total + grain - 1) / grain; +namespace cv { namespace gpu +{ + __host__ __device__ __forceinline__ int divUp(int total, int grain) + { + return (total + grain - 1) / grain; } - namespace device + namespace device { typedef unsigned char uchar; typedef unsigned short ushort; diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp index bd5c49fbe..a0961f1ac 100644 --- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp +++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp @@ -45,7 +45,7 @@ #include "common.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200 @@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device { __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; } }; - - #else // __CUDA_ARCH__ >= 200 - #if defined(_WIN64) || defined(__LP64__) + #else // __CUDA_ARCH__ >= 200 + + #if defined(_WIN64) || defined(__LP64__) // 64-bit register modifier for inlined asm #define OPENCV_GPU_ASM_PTR "l" - #else + #else // 32-bit register modifier for inlined asm #define OPENCV_GPU_ASM_PTR "r" #endif @@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \ } \ }; - + OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8) OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8) OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8) OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h) OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h) OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r) - OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r) - OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f) - OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d) + OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r) + 
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f) + OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d) #undef OPENCV_GPU_DEFINE_FORCE_GLOB #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B #undef OPENCV_GPU_ASM_PTR - + #endif // __CUDA_ARCH__ >= 200 }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp index 7ce6994fd..4d073c5f8 100644 --- a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp +++ b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp @@ -44,7 +44,7 @@ #define __OPENCV_GPU_DYNAMIC_SMEM_HPP__ namespace cv { namespace gpu { namespace device -{ +{ template struct DynamicSharedMem { __device__ __forceinline__ operator T*() diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp index 1fd3d9f06..9b4de6c1a 100644 --- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp @@ -45,21 +45,21 @@ #include "warp_reduce.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { struct Emulation { - static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer) - { + static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer) + { #if __CUDA_ARCH__ >= 200 - (void)cta_buffer; - return __ballot(predicate); + (void)cta_buffer; + return __ballot(predicate); #else - int tid = threadIdx.x; - cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; - return warp_reduce(cta_buffer); + int tid = threadIdx.x; + cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; + return warp_reduce(cta_buffer); #endif - } + } }; }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp b/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp index 4be6dd337..984b567f7 100644 --- a/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp +++ b/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp @@ -46,14 +46,14 @@ #include -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - template + template void printFuncAttrib(Func& func) { cudaFuncAttributes attrs; - cudaFuncGetAttributes(&attrs, func); + cudaFuncGetAttributes(&attrs, func); printf("=== Function stats ===\n"); printf("Name: \n"); @@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device printf("ptxVersion = %d\n", attrs.ptxVersion); printf("binaryVersion = %d\n", attrs.binaryVersion); printf("\n"); - fflush(stdout); + fflush(stdout); } }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp index 435fe65f3..32e6d0e37 100644 --- a/modules/gpu/src/opencv2/gpu/device/functional.hpp +++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp @@ -48,7 +48,7 @@ #include "vec_traits.hpp" #include "type_traits.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { // Function Objects @@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device template struct bit_not : unary_function { - __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType v) const + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType v) const { return ~v; } @@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device // Generalized Identity Operations template struct identity : 
unary_function { - __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType x) const + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType x) const { return x; } @@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace device template struct project1st : binary_function { - __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return lhs; } @@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device template struct project2nd : binary_function { - __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ typename TypeTraits::ParameterType operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return rhs; } @@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device template struct maximum : binary_function { - __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return lhs < rhs ? rhs : lhs; } @@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device template struct minimum : binary_function { - __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const + __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { return lhs < rhs ? 
lhs : rhs; } @@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR - template struct hypot_sqr_func : binary_function + template struct hypot_sqr_func : binary_function { __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType src1, typename TypeTraits::ParameterType src2) const { return src1 * src1 + src2 * src2; } + __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function(){} + __device__ __forceinline__ hypot_sqr_func() : binary_function(){} }; // Saturate Cast Functor @@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device { return (src > thresh) * maxVal; } + __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other) : unary_function(), thresh(other.thresh), maxVal(other.maxVal){} @@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device { return (src <= thresh) * maxVal; } + __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other) : unary_function(), thresh(other.thresh), maxVal(other.maxVal){} @@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {} __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x) const - { - return !pred(x); + { + return !pred(x); } + __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function(){} + __device__ __forceinline__ unary_negate() : unary_function(){} + const Predicate pred; }; + template __host__ __device__ __forceinline__ unary_negate not1(const Predicate& pred) { return unary_negate(pred); @@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device { explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {} - __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x, typename TypeTraits::ParameterType y) const - { - return !pred(x,y); + __device__ __forceinline__ bool operator()(typename TypeTraits::ParameterType x, + typename TypeTraits::ParameterType y) const + { + return !pred(x,y); } + __device__ __forceinline__ binary_negate(const binary_negate& other) + : binary_function(){} + + __device__ __forceinline__ binary_negate() : + binary_function(){} const Predicate pred; }; + template __host__ __device__ __forceinline__ binary_negate not2(const BinaryPredicate& pred) { return binary_negate(pred); } - template struct binder1st : unary_function + template struct binder1st : unary_function { __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {} @@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device return op(arg1, a); } + __device__ __forceinline__ binder1st(const binder1st& other) : + unary_function(){} + const Op op; const typename Op::first_argument_type arg1; }; + template __host__ __device__ __forceinline__ binder1st bind1st(const Op& op, const T& x) { return binder1st(op, typename Op::first_argument_type(x)); } - template struct binder2nd : unary_function + template struct binder2nd : unary_function { __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {} @@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device return op(a, arg2); } + __device__ __forceinline__ binder2nd(const binder2nd& other) : + unary_function(), 
op(other.op), arg2(other.arg2){} + const Op op; const typename Op::second_argument_type arg2; }; + template __host__ __device__ __forceinline__ binder2nd bind2nd(const Op& op, const T& x) { return binder2nd(op, typename Op::second_argument_type(x)); } // Functor Traits - template struct IsUnaryFunction { typedef char Yes; @@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device { enum { shift = UnOpShift::shift }; }; - + template struct BinOpShift { enum { shift = 1 }; }; template struct BinOpShift { enum { shift = 4 }; }; template struct BinOpShift { enum { shift = 2 }; }; diff --git a/modules/gpu/src/opencv2/gpu/device/limits.hpp b/modules/gpu/src/opencv2/gpu/device/limits.hpp index 396e9a310..f5dd39bdc 100644 --- a/modules/gpu/src/opencv2/gpu/device/limits.hpp +++ b/modules/gpu/src/opencv2/gpu/device/limits.hpp @@ -46,7 +46,7 @@ #include #include "common.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template struct numeric_limits { diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp index d9fa5ce0c..37204cc41 100644 --- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp @@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } template<> __device__ __forceinline__ uchar saturate_cast(schar v) - { - return (uchar) ::max((int)v, 0); + { + return (uchar) ::max((int)v, 0); } template<> __device__ __forceinline__ uchar saturate_cast(ushort v) - { - return (uchar) ::min((uint)v, (uint)UCHAR_MAX); + { + return (uchar) ::min((uint)v, (uint)UCHAR_MAX); } template<> __device__ __forceinline__ uchar saturate_cast(int v) - { - return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); + { + return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } template<> __device__ __forceinline__ uchar saturate_cast(uint v) - { - return (uchar) ::min(v, (uint)UCHAR_MAX); + { + return (uchar) ::min(v, (uint)UCHAR_MAX); } template<> __device__ __forceinline__ uchar saturate_cast(short v) - { - return saturate_cast((uint)v); + { + return saturate_cast((uint)v); } template<> __device__ __forceinline__ uchar saturate_cast(float v) - { - int iv = __float2int_rn(v); - return saturate_cast(iv); + { + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ uchar saturate_cast(double v) { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); @@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ schar saturate_cast(uchar v) - { - return (schar) ::min((int)v, SCHAR_MAX); + { + return (schar) ::min((int)v, SCHAR_MAX); } template<> __device__ __forceinline__ schar saturate_cast(ushort v) - { - return (schar) ::min((uint)v, (uint)SCHAR_MAX); + { + return (schar) ::min((uint)v, (uint)SCHAR_MAX); } template<> __device__ __forceinline__ schar saturate_cast(int v) { return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? 
SCHAR_MAX : SCHAR_MIN); } template<> __device__ __forceinline__ schar saturate_cast(short v) - { - return saturate_cast((int)v); + { + return saturate_cast((int)v); } template<> __device__ __forceinline__ schar saturate_cast(uint v) - { - return (schar) ::min(v, (uint)SCHAR_MAX); + { + return (schar) ::min(v, (uint)SCHAR_MAX); } template<> __device__ __forceinline__ schar saturate_cast(float v) - { - int iv = __float2int_rn(v); - return saturate_cast(iv); + { + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ schar saturate_cast(double v) - { + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); @@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ ushort saturate_cast(schar v) - { - return (ushort) ::max((int)v, 0); + { + return (ushort) ::max((int)v, 0); } template<> __device__ __forceinline__ ushort saturate_cast(short v) - { - return (ushort) ::max((int)v, 0); + { + return (ushort) ::max((int)v, 0); } template<> __device__ __forceinline__ ushort saturate_cast(int v) - { - return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); + { + return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); } template<> __device__ __forceinline__ ushort saturate_cast(uint v) - { - return (ushort) ::min(v, (uint)USHRT_MAX); + { + return (ushort) ::min(v, (uint)USHRT_MAX); } template<> __device__ __forceinline__ ushort saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ ushort saturate_cast(double v) - { + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); @@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ short saturate_cast(ushort v) - { - return (short) ::min((int)v, SHRT_MAX); + { + return (short) ::min((int)v, SHRT_MAX); } template<> __device__ __forceinline__ short saturate_cast(int v) { return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? 
SHRT_MAX : SHRT_MIN); } template<> __device__ __forceinline__ short saturate_cast(uint v) - { - return (short) ::min(v, (uint)SHRT_MAX); + { + return (short) ::min(v, (uint)SHRT_MAX); } template<> __device__ __forceinline__ short saturate_cast(float v) - { - int iv = __float2int_rn(v); - return saturate_cast(iv); + { + int iv = __float2int_rn(v); + return saturate_cast(iv); } template<> __device__ __forceinline__ short saturate_cast(double v) - { + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); + int iv = __double2int_rn(v); return saturate_cast(iv); #else return saturate_cast((float)v); #endif } - template<> __device__ __forceinline__ int saturate_cast(float v) - { - return __float2int_rn(v); + template<> __device__ __forceinline__ int saturate_cast(float v) + { + return __float2int_rn(v); } - template<> __device__ __forceinline__ int saturate_cast(double v) + template<> __device__ __forceinline__ int saturate_cast(double v) { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 return __double2int_rn(v); @@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device } template<> __device__ __forceinline__ uint saturate_cast(float v) - { - return __float2uint_rn(v); + { + return __float2uint_rn(v); } - template<> __device__ __forceinline__ uint saturate_cast(double v) - { + template<> __device__ __forceinline__ uint saturate_cast(double v) + { #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 return __double2uint_rn(v); #else diff --git a/modules/gpu/src/opencv2/gpu/device/scan.hpp b/modules/gpu/src/opencv2/gpu/device/scan.hpp new file mode 100644 index 000000000..b55ff4181 --- /dev/null +++ b/modules/gpu/src/opencv2/gpu/device/scan.hpp @@ -0,0 +1,166 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_SCAN_HPP__ +#define __OPENCV_GPU_SCAN_HPP__ + + enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 }; + + template struct WarpScan + { + __device__ __forceinline__ WarpScan() {} + __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; } + + __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx) + { + const unsigned int lane = idx & 31; + F op; + + if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]); + if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]); + if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]); + if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]); + if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]); + + if( Kind == INCLUSIVE ) + return ptr [idx]; + else + return (lane > 0) ? ptr [idx - 1] : 0; + } + + __device__ __forceinline__ unsigned int index(const unsigned int tid) + { + return tid; + } + + __device__ __forceinline__ void init(volatile T *ptr){} + + static const int warp_offset = 0; + + typedef WarpScan merge; + }; + + template struct WarpScanNoComp + { + __device__ __forceinline__ WarpScanNoComp() {} + __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; } + + __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx) + { + const unsigned int lane = threadIdx.x & 31; + F op; + + ptr [idx ] = op(ptr [idx - 1], ptr [idx]); + ptr [idx ] = op(ptr [idx - 2], ptr [idx]); + ptr [idx ] = op(ptr [idx - 4], ptr [idx]); + ptr [idx ] = op(ptr [idx - 8], ptr [idx]); + ptr [idx ] = op(ptr [idx - 16], ptr [idx]); + + if( Kind == INCLUSIVE ) + return ptr [idx]; + else + return (lane > 0) ? ptr [idx - 1] : 0; + } + + __device__ __forceinline__ unsigned int index(const unsigned int tid) + { + return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask); + } + + __device__ __forceinline__ void init(volatile T *ptr) + { + ptr[threadIdx.x] = 0; + } + + static const int warp_smem_stride = 32 + 16 + 1; + static const int warp_offset = 16; + static const int warp_log = 5; + static const int warp_mask = 31; + + typedef WarpScanNoComp merge; + }; + + template struct BlockScan + { + __device__ __forceinline__ BlockScan() {} + __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; } + + __device__ __forceinline__ T operator()(volatile T *ptr) + { + const unsigned int tid = threadIdx.x; + const unsigned int lane = tid & warp_mask; + const unsigned int warp = tid >> warp_log; + + Sc scan; + typename Sc::merge merge_scan; + const unsigned int idx = scan.index(tid); + + T val = scan(ptr, idx); + __syncthreads (); + + if( warp == 0) + scan.init(ptr); + __syncthreads (); + + if( lane == 31 ) + ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? 
val : ptr [idx]; + __syncthreads (); + + if( warp == 0 ) + merge_scan(ptr, idx); + __syncthreads(); + + if ( warp > 0) + val = ptr [scan.warp_offset + warp - 1] + val; + __syncthreads (); + + ptr[idx] = val; + __syncthreads (); + + return val ; + } + + static const int warp_log = 5; + static const int warp_mask = 31; + }; + +#endif \ No newline at end of file diff --git a/modules/gpu/src/opencv2/gpu/device/static_check.hpp b/modules/gpu/src/opencv2/gpu/device/static_check.hpp index 7f6aafe4c..178c0f707 100644 --- a/modules/gpu/src/opencv2/gpu/device/static_check.hpp +++ b/modules/gpu/src/opencv2/gpu/device/static_check.hpp @@ -43,27 +43,27 @@ #ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ #define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ -#if defined(__CUDACC__) - #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ +#if defined(__CUDACC__) + #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ #else #define __OPENCV_GPU_HOST_DEVICE__ -#endif +#endif -namespace cv { namespace gpu -{ +namespace cv { namespace gpu +{ namespace device { template struct Static {}; - - template<> struct Static - { - __OPENCV_GPU_HOST_DEVICE__ static void check() {}; + + template<> struct Static + { + __OPENCV_GPU_HOST_DEVICE__ static void check() {}; }; - } + } using ::cv::gpu::device::Static; }} #undef __OPENCV_GPU_HOST_DEVICE__ -#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */ \ No newline at end of file +#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */ \ No newline at end of file diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index a0e79dfa1..d6b3cf9d5 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -47,7 +47,7 @@ #include "utility.hpp" #include "detail/transform_detail.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template static inline void transform(DevMem2D_ src, DevMem2D_ dst, UnOp op, const Mask& mask, cudaStream_t stream) diff --git a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp index 93c7f1b84..e3684df86 100644 --- a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp +++ b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp @@ -45,11 +45,11 @@ #include "detail/type_traits_detail.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template struct IsSimpleParameter { - enum {value = type_traits_detail::IsIntegral::value || type_traits_detail::IsFloat::value || + enum {value = type_traits_detail::IsIntegral::value || type_traits_detail::IsFloat::value || type_traits_detail::PointerTraits::type>::value}; }; @@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device enum { isVolatile = type_traits_detail::UnVolatile::value }; enum { isReference = type_traits_detail::ReferenceTraits::value }; - enum { isPointer = type_traits_detail::PointerTraits::type>::value }; + enum { isPointer = type_traits_detail::PointerTraits::type>::value }; - enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral::value }; - enum { isSignedInt = type_traits_detail::IsSignedIntergral::value }; - enum { isIntegral = type_traits_detail::IsIntegral::value }; - enum { isFloat = type_traits_detail::IsFloat::value }; - enum { isArith = isIntegral || isFloat }; - enum { isVec = type_traits_detail::IsVec::value }; - - typedef typename 
type_traits_detail::Select::value, + enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral::value }; + enum { isSignedInt = type_traits_detail::IsSignedIntergral::value }; + enum { isIntegral = type_traits_detail::IsIntegral::value }; + enum { isFloat = type_traits_detail::IsFloat::value }; + enum { isArith = isIntegral || isFloat }; + enum { isVec = type_traits_detail::IsVec::value }; + + typedef typename type_traits_detail::Select::value, T, typename type_traits_detail::AddParameterType::type>::type ParameterType; }; }}} diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp index cb4db80a0..78d82e33a 100644 --- a/modules/gpu/src/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp @@ -47,17 +47,17 @@ #include "datamov_utils.hpp" #include "detail/utility_detail.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { - #define OPENCV_GPU_LOG_WARP_SIZE (5) - #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE) + #define OPENCV_GPU_LOG_WARP_SIZE (5) + #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE) #define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla #define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS) /////////////////////////////////////////////////////////////////////////////// // swap - template void __device__ __host__ __forceinline__ swap(T& a, T& b) + template void __device__ __host__ __forceinline__ swap(T& a, T& b) { const T temp = a; a = b; @@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device { explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {} __host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){} - + __device__ __forceinline__ bool operator()(int y, int x) const - { + { return mask.ptr(y)[x] != 0; } @@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device struct SingleMaskChannels { - __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_) + __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_) : mask(mask_), channels(channels_) {} __host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_) :mask(mask_.mask), channels(mask_.channels){} - + __device__ __forceinline__ bool operator()(int y, int x) const - { + { return mask.ptr(y)[x / channels] != 0; } @@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device { curMask = maskCollection[z]; } - + __device__ __forceinline__ bool operator()(int y, int x) const { uchar val; @@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device utility_detail::ReductionDispatcher::reduce(data, partial_reduction, tid, op); } - template + template __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred) { StaticAssert= 8 && n <= 512>::check(); utility_detail::PredValReductionDispatcher::reduce(myData, myVal, sdata, sval, tid, pred); } - template + template __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred) { StaticAssert= 8 && n <= 512>::check(); utility_detail::PredVal2ReductionDispatcher::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); } - + /////////////////////////////////////////////////////////////////////////////// // Solve linear system @@ 
-212,17 +212,17 @@ namespace cv { namespace gpu { namespace device { double invdet = 1.0 / det; - x[0] = saturate_cast(invdet * + x[0] = saturate_cast(invdet * (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ))); - x[1] = saturate_cast(invdet * + x[1] = saturate_cast(invdet * (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]))); - x[2] = saturate_cast(invdet * + x[2] = saturate_cast(invdet * (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]))); diff --git a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp index a1ead9f52..113f3dd51 100644 --- a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp @@ -47,7 +47,7 @@ #include "functional.hpp" #include "detail/vec_distance_detail.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template struct L1Dist { @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device }; // calc distance between two vectors in global memory - template + template __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) { for (int i = tid; i < len; i += THREAD_DIM) @@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory template __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid) - { + { vec_distance_detail::VecDiffCachedCalculator::calc(vecCached, vecGlob, len, dist, tid); - + dist.reduceAll(smem, tid); } diff --git a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp index 833abcbc3..67e064a38 100644 --- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp @@ -47,7 +47,7 @@ #include "vec_traits.hpp" #include "functional.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { namespace vec_math_detail { @@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device } namespace vec_math_detail - { + { template struct BinOpTraits { typedef int argument_type; @@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_VEC_OP #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP }}} // namespace cv { namespace gpu { namespace device - + #endif // __OPENCV_GPU_VECMATH_HPP__ \ No newline at end of file diff --git a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp index 955fe0d4b..9e309fbe0 100644 --- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp +++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp @@ -45,7 +45,7 @@ #include "common.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { template struct TypeVec; @@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS - template<> struct VecTraits - { + template<> struct VecTraits + { typedef char elem_type; - enum {cn=1}; + enum {cn=1}; static 
__device__ __host__ __forceinline__ char all(char v) {return v;} static __device__ __host__ __forceinline__ char make(char x) {return x;} static __device__ __host__ __forceinline__ char make(const char* x) {return *x;} }; - template<> struct VecTraits - { + template<> struct VecTraits + { typedef schar elem_type; - enum {cn=1}; + enum {cn=1}; static __device__ __host__ __forceinline__ schar all(schar v) {return v;} static __device__ __host__ __forceinline__ schar make(schar x) {return x;} static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;} diff --git a/modules/gpu/src/opencv2/gpu/device/warp.hpp b/modules/gpu/src/opencv2/gpu/device/warp.hpp index 0ac67f47a..59b8568c7 100644 --- a/modules/gpu/src/opencv2/gpu/device/warp.hpp +++ b/modules/gpu/src/opencv2/gpu/device/warp.hpp @@ -43,7 +43,7 @@ #ifndef __OPENCV_GPU_DEVICE_WARP_HPP__ #define __OPENCV_GPU_DEVICE_WARP_HPP__ -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { struct Warp { @@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device template static __device__ __forceinline__ void fill(It beg, It end, const T& value) - { + { for(It t = beg + laneId(); t < end; t += STRIDE) *t = value; - } + } template static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out) - { + { for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE) *out = *t; return out; - } + } template static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op) @@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device { unsigned int lane = laneId(); - InIt1 t1 = beg1 + lane; + InIt1 t1 = beg1 + lane; InIt2 t2 = beg2 + lane; for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE) *out = op(*t1, *t2); @@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device template static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value) { - unsigned int lane = laneId(); + unsigned int lane = laneId(); value += lane; for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE) diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp index 672dbc288..626f5aa0a 100644 --- a/modules/gpu/src/resize.cpp +++ b/modules/gpu/src/resize.cpp @@ -44,7 +44,32 @@ #ifndef HAVE_CUDA -void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); } +void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s) +{ + (void)src; + (void)dst; + (void)dsize; + (void)fx; + (void)fy; + (void)interpolation; + (void)s; + + throw_nogpu(); +} +void cv::gpu::resize(const GpuMat& src, GpuMat& dst,GpuMat& buffer, Size dsize, + double fx, double fy, int interpolation, Stream& s) +{ + (void)src; + (void)dst; + (void)dsize; + (void)fx; + (void)fy; + (void)interpolation; + (void)buffer; + (void)s; + + throw_nogpu(); +} #else // HAVE_CUDA
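
This patch adds a second cv::gpu::resize overload that takes an explicit GpuMat buffer (used by the INTER_AREA path), alongside the new ResizeArea performance test. Below is a minimal host-side usage sketch, not part of the patch: the argument order follows the no-CUDA fallback stub above, while the default stream (cv::gpu::Stream::Null()), the synthetic input, and the 10x downscale factor are illustrative assumptions.

    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        // Synthetic 1080p BGR input; the perf test covers 8U/16U/32F, 1/3/4 channels.
        cv::Mat host(1080, 1920, CV_8UC3, cv::Scalar::all(128));

        cv::gpu::GpuMat src(host), dst, buffer;   // 'buffer' can be reused across calls

        // Area downscale by 10x, passing the intermediate buffer explicitly so the
        // implementation does not have to allocate it on every call (assumption:
        // the CUDA-enabled path keeps the same signature as the no-CUDA stub).
        cv::gpu::resize(src, dst, buffer, cv::Size(), 0.1, 0.1, cv::INTER_AREA,
                        cv::gpu::Stream::Null());

        cv::Mat result;
        dst.download(result);                     // copy the result back to host memory
        return 0;
    }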