diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu index dddbd6f7e..5fbb73ab7 100644 --- a/modules/gpu/src/cuda/mathfunc.cu +++ b/modules/gpu/src/cuda/mathfunc.cu @@ -59,7 +59,7 @@ using namespace cv::gpu::device; namespace cv { namespace gpu { namespace mathfunc { template - __device__ void sum_in_smem(volatile T* data, const unsigned int tid) + __device__ void sum_in_smem(volatile T* data, const uint tid) { T sum = data[tid]; @@ -270,102 +270,183 @@ namespace cv { namespace gpu { namespace mathfunc struct Mask8U { explicit Mask8U(PtrStep mask): mask(mask) {} - __device__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; } + + __device__ bool operator()(int y, int x) const + { + return mask.ptr(y)[x]; + } + PtrStep mask; }; - struct MaskTrue { __device__ bool operator()(int y, int x) const { return true; } }; + + struct MaskTrue + { + __device__ bool operator()(int y, int x) const + { + return true; + } + }; + + //------------------------------------------------------------------------ // Unary operations - enum { UN_OP_NOT }; + enum + { + UN_OP_NOT + }; + template struct UnOp; + template struct UnOp { - static __device__ T call(T x) - { - return ~x; - } + typedef typename TypeVec::vec_t Vec2; + typedef typename TypeVec::vec_t Vec3; + typedef typename TypeVec::vec_t Vec4; + static __device__ T call(T v) { return ~v; } + static __device__ Vec2 call(Vec2 v) { return VecTraits::make(~v.x, ~v.y); } + static __device__ Vec3 call(Vec3 v) { return VecTraits::make(~v.x, ~v.y, ~v.z); } + static __device__ Vec4 call(Vec4 v) { return VecTraits::make(~v.x, ~v.y, ~v.z, ~v.w); } }; - template - __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, Mask mask) + + template + __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst) { - const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4; const int y = blockDim.y * blockIdx.y + threadIdx.y; - if (x < cols && y < rows && mask(y, x)) + if (y < rows) { - T* dsty = (T*)dst.ptr(y); - const T* srcy = (const T*)src.ptr(y); - - #pragma unroll - for (int i = 0; i < cn; ++i) - dsty[cn * x + i] = UnOp::call(srcy[cn * x + i]); + uchar* dst_ptr = dst.ptr(y) + x; + const uchar* src_ptr = src.ptr(y) + x; + if (x + sizeof(uint) - 1 < cols) + { + *(uint*)dst_ptr = UnOp::call(*(uint*)src_ptr); + } + else + { + const uchar* src_end = src.ptr(y) + cols; + while (src_ptr < src_end) + { + *dst_ptr++ = UnOp::call(*src_ptr++); + } + } } } - template - void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, Mask mask, cudaStream_t stream) + + template + __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask) + { + typedef typename TypeVec::vec_t Type; + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < cols && y < rows && mask.ptr(y)[x]) + { + Type* dst_row = (Type*)dst.ptr(y); + const Type* src_row = (const Type*)src.ptr(y); + dst_row[x] = UnOp::call(src_row[x]); + } + } + + + template + __global__ void bitwise_un_op_two_loads(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask) + { + typedef typename TypeVec::vec_t Type; + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < cols && y < rows && mask.ptr(y)[x]) + { + Type* dst_row = (Type*)dst.ptr(y); + const Type* src_row = (const Type*)src.ptr(y); 
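// A minimal sketch of the NOT functor above with its stripped template arguments restored,
// assuming the TypeVec/VecTraits helpers named in the hunk; the exact parameter lists are
// an assumption, not the verbatim patch.
template <typename T, int op> struct UnOp;

template <typename T> struct UnOp<T, UN_OP_NOT>
{
    typedef typename TypeVec<T, 2>::vec_t Vec2;
    typedef typename TypeVec<T, 3>::vec_t Vec3;
    typedef typename TypeVec<T, 4>::vec_t Vec4;

    // Scalar and per-channel bitwise NOT
    static __device__ T    call(T v)    { return ~v; }
    static __device__ Vec2 call(Vec2 v) { return VecTraits<Vec2>::make(~v.x, ~v.y); }
    static __device__ Vec3 call(Vec3 v) { return VecTraits<Vec3>::make(~v.x, ~v.y, ~v.z); }
    static __device__ Vec4 call(Vec4 v) { return VecTraits<Vec4>::make(~v.x, ~v.y, ~v.z, ~v.w); }
};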
+ dst_row[2 * x] = UnOp::call(src_row[2 * x]); + dst_row[2 * x + 1] = UnOp::call(src_row[2 * x + 1]); + } + } + + + template + void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(cols * elem_size, threads.x * sizeof(uint)), + divUp(rows, threads.y)); + bitwise_un_op<<>>(rows, cols * elem_size, src, dst); + if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); + } + + + template + void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, const PtrStep mask, cudaStream_t stream) { dim3 threads(16, 16); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); switch (elem_size) { case 1: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 2: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 3: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 4: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 6: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 8: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 12: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 16: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op<<>>(rows, cols, src, dst, mask); break; case 24: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op_two_loads<<>>(rows, cols, src, dst, mask); break; case 32: - bitwise_un_op ><<>>(rows, cols, src, dst, mask); + bitwise_un_op_two_loads<<>>(rows, cols, src, dst, mask); break; } - if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); + if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); } - void bitwise_not_caller(int rows, int cols,const PtrStep src, int elem_size, PtrStep dst, cudaStream_t stream) + + void bitwise_not_caller(int rows, int cols, const PtrStep src, int elem_size, PtrStep dst, cudaStream_t stream) { - bitwise_un_op(rows, cols, src, dst, elem_size, MaskTrue(), stream); + bitwise_un_op(rows, cols, src, dst, elem_size, stream); } + void bitwise_not_caller(int rows, int cols,const PtrStep src, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream) { - bitwise_un_op(rows, cols, src, dst, elem_size, Mask8U(mask), stream); + bitwise_un_op(rows, cols, src, dst, elem_size, mask, stream); } + //------------------------------------------------------------------------ // Binary operations enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR }; + template struct BinOp; + template struct BinOp { @@ -375,6 +456,7 @@ namespace cv { namespace gpu { namespace mathfunc } }; + template struct BinOp { @@ -393,6 +475,7 @@ namespace cv { namespace gpu { namespace mathfunc } }; + template __global__ void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst, Mask mask) { @@ -411,6 +494,7 @@ namespace cv { namespace gpu { namespace mathfunc } } + template void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst, int elem_size, Mask mask, cudaStream_t stream) { @@ -419,64 +503,70 @@ namespace cv { namespace gpu { namespace mathfunc switch (elem_size) { case 1: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op 
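// Sketch of the unmasked launcher just above: the matrix is viewed as rows of
// cols * elem_size raw bytes and the grid is sized so each thread covers sizeof(uint)
// bytes. The template argument is the operation id as elsewhere in this file; whether
// the stream appears in the launch configuration is an assumption.
template <int opid>
void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size,
                   cudaStream_t stream)
{
    dim3 threads(16, 16);
    dim3 grid(divUp(cols * elem_size, threads.x * sizeof(uint)), divUp(rows, threads.y));

    bitwise_un_op<opid><<<grid, threads, 0, stream>>>(rows, cols * elem_size, src, dst);

    if (stream == 0)
        cudaSafeCall(cudaThreadSynchronize());
}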
><<>>(rows, cols, src1, src2, dst, mask); break; case 2: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 3: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 4: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 6: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 8: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 12: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 16: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 24: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; case 32: - bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); + bitwise_bin_op ><<>>(rows, cols, src1, src2, dst, mask); break; } if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); } + void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream) { bitwise_bin_op(rows, cols, src1, src2, dst, elem_size, MaskTrue(), stream); } + void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream) { bitwise_bin_op(rows, cols, src1, src2, dst, elem_size, Mask8U(mask), stream); } + void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream) { bitwise_bin_op(rows, cols, src1, src2, dst, elem_size, MaskTrue(), stream); } + void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream) { bitwise_bin_op(rows, cols, src1, src2, dst, elem_size, Mask8U(mask), stream); } + void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream) { bitwise_bin_op(rows, cols, src1, src2, dst, elem_size, MaskTrue(), stream); } + void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream) { bitwise_bin_op(rows, cols, src1, src2, dst, elem_size, Mask8U(mask), stream); @@ -490,9 +580,9 @@ namespace cv { namespace gpu { namespace mathfunc // To avoid shared bank conflicts we convert each value into value of // appropriate type (32 bits minimum) template struct MinMaxTypeTraits {}; - template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; template <> struct MinMaxTypeTraits { typedef int best_type; }; - template <> struct MinMaxTypeTraits { typedef int best_type; }; + template <> struct MinMaxTypeTraits { typedef int best_type; }; template <> struct MinMaxTypeTraits { typedef int best_type; }; template <> struct MinMaxTypeTraits { typedef int best_type; }; template <> struct MinMaxTypeTraits { typedef float best_type; }; @@ -506,7 +596,7 @@ namespace cv { namespace gpu { namespace mathfunc __constant__ int ctheight; // Global counter of blocks finished its work - __device__ 
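// The bitwise_bin_op dispatch earlier in this hunk selects one of three traits by
// operation id. Their bodies fall outside the shown context, so this is a sketch inferred
// from the operation names only:
template <typename T, int op> struct BinOp;

template <typename T> struct BinOp<T, BIN_OP_OR>
{
    static __device__ T call(T a, T b) { return a | b; }
};

template <typename T> struct BinOp<T, BIN_OP_AND>
{
    static __device__ T call(T a, T b) { return a & b; }
};

template <typename T> struct BinOp<T, BIN_OP_XOR>
{
    static __device__ T call(T a, T b) { return a ^ b; }
};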
unsigned int blocks_finished = 0; + __device__ uint blocks_finished = 0; // Estimates good thread configuration @@ -542,7 +632,7 @@ namespace cv { namespace gpu { namespace mathfunc // Does min and max in shared memory template - __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval) + __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) { minval[tid] = min(minval[tid], minval[tid + offset]); maxval[tid] = max(maxval[tid], maxval[tid + offset]); @@ -550,7 +640,7 @@ namespace cv { namespace gpu { namespace mathfunc template - __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const unsigned int tid) + __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid) { if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); } if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); } @@ -575,18 +665,18 @@ namespace cv { namespace gpu { namespace mathfunc __shared__ best_type sminval[nthreads]; __shared__ best_type smaxval[nthreads]; - unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; T mymin = numeric_limits_gpu::max(); T mymax = numeric_limits_gpu::is_signed ? -numeric_limits_gpu::max() : numeric_limits_gpu::min(); - unsigned int y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); - unsigned int x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); - for (unsigned int y = y0; y < y_end; y += blockDim.y) + uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); + uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); + for (uint y = y0; y < y_end; y += blockDim.y) { const T* src_row = (const T*)src.ptr(y); - for (unsigned int x = x0; x < x_end; x += blockDim.x) + for (uint x = x0; x < x_end; x += blockDim.x) { T val = src_row[x]; if (mask(y, x)) @@ -618,7 +708,7 @@ namespace cv { namespace gpu { namespace mathfunc maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; __threadfence(); - unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = ticket == gridDim.x * gridDim.y - 1; } @@ -626,7 +716,7 @@ namespace cv { namespace gpu { namespace mathfunc if (is_last) { - unsigned int idx = min(tid, gridDim.x * gridDim.y - 1); + uint idx = min(tid, gridDim.x * gridDim.y - 1); sminval[tid] = minval[idx]; smaxval[tid] = maxval[idx]; @@ -671,9 +761,9 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, 
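// The single-pass kernels in this file finish their reduction without a second launch by
// taking a "ticket" from the global blocks_finished counter: every block publishes its
// partial result, fences, increments the counter, and the block holding the last ticket
// folds all partials into the final value. A self-contained sketch of that pattern (the
// names reduce_single_pass, partial_sums and final_sum are hypothetical; the grid is
// assumed to have at most nthreads blocks, and global atomics need compute capability 1.1+
// just like the guarded code above):
__device__ uint tickets = 0;

template <int nthreads>
__global__ void reduce_single_pass(const int* data, int n, int* partial_sums, int* final_sum)
{
    __shared__ int smem[nthreads];
    __shared__ bool is_last;

    const uint tid = threadIdx.x;

    int sum = 0;
    for (int i = blockIdx.x * nthreads + tid; i < n; i += gridDim.x * nthreads)
        sum += data[i];

    smem[tid] = sum;
    __syncthreads();
    sum_in_smem<nthreads, int>(smem, tid);  // block-wide tree reduction from the top of the
                                            // file (template arguments assumed <size, type>)
    if (tid == 0)
    {
        partial_sums[blockIdx.x] = smem[0];
        __threadfence();                              // make the partial visible device-wide
        uint ticket = atomicInc(&tickets, gridDim.x);
        is_last = (ticket == gridDim.x - 1);          // true only in the last block to finish
    }
    __syncthreads();

    if (is_last)
    {
        smem[tid] = tid < gridDim.x ? partial_sums[tid] : 0;
        __syncthreads();
        sum_in_smem<nthreads, int>(smem, tid);

        if (tid == 0)
        {
            *final_sum = smem[0];
            tickets = 0;                              // reset for the next call
        }
    }
}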
PtrStep); template void min_max_mask_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); @@ -700,9 +790,9 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_caller(const DevMem2D, double*,double*, PtrStep); @@ -716,8 +806,8 @@ namespace cv { namespace gpu { namespace mathfunc __shared__ best_type sminval[nthreads]; __shared__ best_type smaxval[nthreads]; - unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; - unsigned int idx = min(tid, gridDim.x * gridDim.y - 1); + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + uint idx = min(tid, gridDim.x * gridDim.y - 1); sminval[tid] = minval[idx]; smaxval[tid] = maxval[idx]; @@ -754,9 +844,9 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); - template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); + template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); template void min_max_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, PtrStep); @@ -783,9 +873,9 @@ namespace cv { namespace gpu { namespace mathfunc *maxval = maxval_; } - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); - template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); + template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); template void min_max_multipass_caller(const DevMem2D, double*, double*, PtrStep); @@ -801,7 +891,7 @@ namespace cv { namespace gpu { namespace mathfunc __constant__ int ctheight; // Global counter of blocks finished its work - __device__ unsigned int blocks_finished = 0; + __device__ uint blocks_finished = 0; // Estimates good thread configuration @@ -839,8 +929,8 @@ namespace cv { namespace gpu { namespace mathfunc template - __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval, - volatile unsigned int* minloc, volatile unsigned int* maxloc) + __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, + volatile uint* minloc, volatile uint* maxloc) { T 
val = minval[tid + offset]; if (val < minval[tid]) @@ -858,8 +948,8 @@ namespace cv { namespace gpu { namespace mathfunc template - __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile unsigned int* minloc, - volatile unsigned int* maxloc, const unsigned int tid) + __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc, + volatile uint* maxloc, const uint tid) { if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); } if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); } @@ -879,29 +969,29 @@ namespace cv { namespace gpu { namespace mathfunc template __global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval, - unsigned int* minloc, unsigned int* maxloc) + uint* minloc, uint* maxloc) { typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; __shared__ best_type smaxval[nthreads]; - __shared__ unsigned int sminloc[nthreads]; - __shared__ unsigned int smaxloc[nthreads]; + __shared__ uint sminloc[nthreads]; + __shared__ uint smaxloc[nthreads]; - unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; T mymin = numeric_limits_gpu::max(); T mymax = numeric_limits_gpu::is_signed ? -numeric_limits_gpu::max() : numeric_limits_gpu::min(); - unsigned int myminloc = 0; - unsigned int mymaxloc = 0; - unsigned int y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); - unsigned int x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); + uint myminloc = 0; + uint mymaxloc = 0; + uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); + uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); - for (unsigned int y = y0; y < y_end; y += blockDim.y) + for (uint y = y0; y < y_end; y += blockDim.y) { const T* ptr = (const T*)src.ptr(y); - for (unsigned int x = x0; x < x_end; x += blockDim.x) + for (uint x = x0; x < x_end; x += blockDim.x) { if (mask(y, x)) { @@ -931,7 +1021,7 @@ namespace cv { namespace gpu { namespace mathfunc maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; __threadfence(); - unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = ticket == gridDim.x * gridDim.y - 1; } @@ -939,7 +1029,7 @@ namespace cv { namespace gpu { namespace mathfunc if (is_last) { - unsigned int idx = min(tid, gridDim.x * gridDim.y - 1); + uint idx = min(tid, gridDim.x * gridDim.y - 1); sminval[tid] = minval[idx]; smaxval[tid] = maxval[idx]; @@ -980,8 +1070,8 @@ namespace cv { namespace gpu { namespace mathfunc T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); - unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0); - unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); cudaSafeCall(cudaThreadSynchronize()); @@ -992,16 +1082,16 @@ namespace cv { namespace gpu { namespace mathfunc *minval = minval_; *maxval = 
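// The location-aware merge above carries the flattened index of the winning element along
// with its value; only the first comparison is visible before the context cut, so the rest
// is sketched here from the symmetric max case:
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
                      volatile uint* minloc, volatile uint* maxloc)
{
    T val = minval[tid + offset];
    if (val < minval[tid])
    {
        minval[tid] = val;
        minloc[tid] = minloc[tid + offset];
    }

    val = maxval[tid + offset];
    if (val > maxval[tid])
    {
        maxval[tid] = val;
        maxloc[tid] = maxloc[tid + offset];
    }
}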
maxval_; - unsigned int minloc_, maxloc_; + uint minloc_, maxloc_; cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); @@ -1018,8 +1108,8 @@ namespace cv { namespace gpu { namespace mathfunc T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); - unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0); - unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); cudaSafeCall(cudaThreadSynchronize()); @@ -1030,16 +1120,16 @@ namespace cv { namespace gpu { namespace mathfunc *minval = minval_; *maxval = maxval_; - unsigned int minloc_, maxloc_; + uint minloc_, maxloc_; cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); @@ -1048,16 +1138,16 @@ namespace cv { namespace gpu { namespace mathfunc // This kernel will be used only when compute capability is 1.0 template - __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int size) + __global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size) 
{ typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; __shared__ best_type smaxval[nthreads]; - __shared__ unsigned int sminloc[nthreads]; - __shared__ unsigned int smaxloc[nthreads]; + __shared__ uint sminloc[nthreads]; + __shared__ uint smaxloc[nthreads]; - unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; - unsigned int idx = min(tid, gridDim.x * gridDim.y - 1); + uint tid = threadIdx.y * blockDim.x + threadIdx.x; + uint idx = min(tid, gridDim.x * gridDim.y - 1); sminval[tid] = minval[idx]; smaxval[tid] = maxval[idx]; @@ -1087,8 +1177,8 @@ namespace cv { namespace gpu { namespace mathfunc T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); - unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0); - unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); min_max_loc_kernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); @@ -1100,16 +1190,16 @@ namespace cv { namespace gpu { namespace mathfunc *minval = minval_; *maxval = maxval_; - unsigned int minloc_, maxloc_; + uint minloc_, maxloc_; cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_mask_multipass_caller(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep); @@ -1125,8 +1215,8 @@ namespace cv { namespace gpu { namespace mathfunc T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); - unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0); - unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1); + uint* minloc_buf = (uint*)locbuf.ptr(0); + uint* maxloc_buf = (uint*)locbuf.ptr(1); min_max_loc_kernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); @@ -1138,16 +1228,16 @@ namespace cv { namespace gpu { namespace mathfunc *minval = minval_; *maxval = maxval_; - unsigned int minloc_, maxloc_; + uint minloc_, maxloc_; cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxloc_, 
maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); - template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); + template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); template void min_max_loc_multipass_caller(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep); @@ -1163,7 +1253,7 @@ namespace cv { namespace gpu { namespace mathfunc __constant__ int ctwidth; __constant__ int ctheight; - __device__ unsigned int blocks_finished = 0; + __device__ uint blocks_finished = 0; void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid) { @@ -1193,26 +1283,26 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count) + __global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count) { - __shared__ unsigned int scount[nthreads]; + __shared__ uint scount[nthreads]; - unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; + uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; - unsigned int cnt = 0; - for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) + uint cnt = 0; + for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) { const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); - for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) + for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) cnt += ptr[x0 + x * blockDim.x] != 0; } scount[tid] = cnt; __syncthreads(); - sum_in_smem(scount, tid); + sum_in_smem(scount, tid); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 __shared__ bool is_last; @@ -1222,7 +1312,7 @@ namespace cv { namespace gpu { namespace mathfunc count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; __threadfence(); - unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = ticket == gridDim.x * gridDim.y - 1; } @@ -1233,7 +1323,7 @@ namespace cv { namespace gpu { namespace mathfunc scount[tid] = tid < gridDim.x * gridDim.y ? 
count[tid] : 0; __syncthreads(); - sum_in_smem(scount, tid); + sum_in_smem(scount, tid); if (tid == 0) { @@ -1254,20 +1344,20 @@ namespace cv { namespace gpu { namespace mathfunc estimate_thread_cfg(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid); - unsigned int* count_buf = (unsigned int*)buf.ptr(0); + uint* count_buf = (uint*)buf.ptr(0); count_non_zero_kernel<256, T><<>>(src, count_buf); cudaSafeCall(cudaThreadSynchronize()); - unsigned int count; + uint count; cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); return count; } - template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); template int count_non_zero_caller(const DevMem2D, PtrStep); - template int count_non_zero_caller(const DevMem2D, PtrStep); + template int count_non_zero_caller(const DevMem2D, PtrStep); template int count_non_zero_caller(const DevMem2D, PtrStep); template int count_non_zero_caller(const DevMem2D, PtrStep); template int count_non_zero_caller(const DevMem2D, PtrStep); @@ -1275,15 +1365,15 @@ namespace cv { namespace gpu { namespace mathfunc template - __global__ void count_non_zero_pass2_kernel(unsigned int* count, int size) + __global__ void count_non_zero_pass2_kernel(uint* count, int size) { - __shared__ unsigned int scount[nthreads]; - unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + __shared__ uint scount[nthreads]; + uint tid = threadIdx.y * blockDim.x + threadIdx.x; scount[tid] = tid < size ? count[tid] : 0; __syncthreads(); - sum_in_smem(scount, tid); + sum_in_smem(scount, tid); if (tid == 0) count[0] = scount[0]; @@ -1297,21 +1387,21 @@ namespace cv { namespace gpu { namespace mathfunc estimate_thread_cfg(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid); - unsigned int* count_buf = (unsigned int*)buf.ptr(0); + uint* count_buf = (uint*)buf.ptr(0); count_non_zero_kernel<256, T><<>>(src, count_buf); count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); cudaSafeCall(cudaThreadSynchronize()); - unsigned int count; + uint count; cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); return count; } - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); - template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); + template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); template int count_non_zero_multipass_caller(const DevMem2D, PtrStep); @@ -1485,9 +1575,9 @@ namespace cv { namespace gpu { namespace mathfunc { template struct SumType {}; - template <> struct SumType { typedef unsigned int R; }; + template <> struct SumType { typedef uint R; }; template <> struct SumType { typedef int R; }; - template <> struct SumType { typedef unsigned int R; }; + template <> struct SumType { typedef uint R; }; template <> struct SumType { typedef int R; }; template <> struct SumType { typedef int R; }; template <> struct SumType { typedef float R; }; @@ -1501,7 +1591,7 @@ namespace cv { namespace gpu { namespace mathfunc __constant__ int ctwidth; __constant__ int ctheight; - __device__ unsigned int blocks_finished = 0; + __device__ uint blocks_finished = 0; 
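// sum_in_smem, declared at the top of the file and used by the count and sum kernels, is a
// standard power-of-two tree reduction in shared memory. With its template arguments
// restored the calls above would read roughly sum_in_smem<nthreads, uint>(scount, tid);
// the body below is a sketch assuming that <size, type> shape:
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const uint tid)
{
    T sum = data[tid];

    // Halve the active range each step; __syncthreads() guards the strides wider than a
    // warp, while the last 32 elements reduce warp-synchronously through volatile stores.
    if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
    if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
    if (size >= 128) { if (tid <  64) { data[tid] = sum = sum + data[tid +  64]; } __syncthreads(); }

    if (tid < 32)
    {
        if (size >= 64) data[tid] = sum = sum + data[tid + 32];
        if (size >= 32) data[tid] = sum = sum + data[tid + 16];
        if (size >= 16) data[tid] = sum = sum + data[tid +  8];
        if (size >=  8) data[tid] = sum = sum + data[tid +  4];
        if (size >=  4) data[tid] = sum = sum + data[tid +  2];
        if (size >=  2) data[tid] = sum = sum + data[tid +  1];
    }
}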
const int threads_x = 32; const int threads_y = 8; @@ -1564,7 +1654,7 @@ namespace cv { namespace gpu { namespace mathfunc result[bid] = smem[0]; __threadfence(); - unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } @@ -1648,7 +1738,7 @@ namespace cv { namespace gpu { namespace mathfunc result[bid] = res; __threadfence(); - unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } @@ -1756,7 +1846,7 @@ namespace cv { namespace gpu { namespace mathfunc result[bid] = res; __threadfence(); - unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } @@ -1874,7 +1964,7 @@ namespace cv { namespace gpu { namespace mathfunc result[bid] = res; __threadfence(); - unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); + uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } @@ -1996,9 +2086,9 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sum_multipass_caller(const DevMem2D, PtrStep, double*, int); @@ -2044,9 +2134,9 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sum_caller(const DevMem2D, PtrStep, double*, int); + template void sum_caller(const DevMem2D, PtrStep, double*, int); template void sum_caller(const DevMem2D, PtrStep, double*, int); - template void sum_caller(const DevMem2D, PtrStep, double*, int); + template void sum_caller(const DevMem2D, PtrStep, double*, int); template void sum_caller(const DevMem2D, PtrStep, double*, int); template void sum_caller(const DevMem2D, PtrStep, double*, int); template void sum_caller(const DevMem2D, PtrStep, double*, int); @@ -2100,9 +2190,9 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); template void sqsum_multipass_caller(const DevMem2D, PtrStep, double*, int); @@ -2148,9 +2238,9 @@ namespace cv { namespace gpu { namespace mathfunc sum[3] = result[3]; } - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); template void 
sqsum_caller(const DevMem2D, PtrStep, double*, int); - template void sqsum_caller(const DevMem2D, PtrStep, double*, int); + template void sqsum_caller(const DevMem2D, PtrStep, double*, int); template void sqsum_caller(const DevMem2D, PtrStep, double*, int); template void sqsum_caller(const DevMem2D, PtrStep, double*, int); template void sqsum_caller(const DevMem2D, PtrStep, double*, int); diff --git a/tests/gpu/src/bitwise_oper.cpp b/tests/gpu/src/bitwise_oper.cpp index 7ad90ca9b..9e52e7601 100644 --- a/tests/gpu/src/bitwise_oper.cpp +++ b/tests/gpu/src/bitwise_oper.cpp @@ -44,7 +44,7 @@ #include "gputest.hpp" #define CHECK(pred, err) if (!(pred)) { \ - ts->printf(CvTS::LOG, "Fail: \"%s\" at line: %d\n", #pred, __LINE__); \ + ts->printf(CvTS::CONSOLE, "Fail: \"%s\" at line: %d\n", #pred, __LINE__); \ ts->set_failed_test_info(err); \ return; } @@ -88,12 +88,22 @@ struct CV_GpuBitwiseTest: public CvTest mask.setTo(Scalar(1)); gpu::GpuMat dst; - gpu::bitwise_not(gpu::GpuMat(src), dst, mask); + gpu::bitwise_not(gpu::GpuMat(src), dst); CHECK(dst_gold.size() == dst.size(), CvTS::FAIL_INVALID_OUTPUT); CHECK(dst_gold.type() == dst.type(), CvTS::FAIL_INVALID_OUTPUT); Mat dsth(dst); + for (int i = 0; i < dst_gold.rows; ++i) + CHECK(memcmp(dst_gold.ptr(i), dsth.ptr(i), dst_gold.cols * dst_gold.elemSize()) == 0, CvTS::FAIL_INVALID_OUTPUT); + + dst.setTo(Scalar::all(0)); + gpu::bitwise_not(gpu::GpuMat(src), dst, mask); + + CHECK(dst_gold.size() == dst.size(), CvTS::FAIL_INVALID_OUTPUT); + CHECK(dst_gold.type() == dst.type(), CvTS::FAIL_INVALID_OUTPUT); + + dsth = dst; for (int i = 0; i < dst_gold.rows; ++i) CHECK(memcmp(dst_gold.ptr(i), dsth.ptr(i), dst_gold.cols * dst_gold.elemSize()) == 0, CvTS::FAIL_INVALID_OUTPUT) }
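// Host-side usage corresponding to the extended test: bitwise_not is exercised both through
// the new unmasked overload, which takes the 4-bytes-per-thread path, and through the
// original masked overload. A sketch only; the matrix size and fill values are arbitrary.
void bitwise_not_example()
{
    cv::Mat src(256, 256, CV_8UC4, cv::Scalar::all(0x5A));

    cv::gpu::GpuMat d_src(src), d_dst;
    cv::gpu::GpuMat d_mask(src.size(), CV_8UC1, cv::Scalar(1));

    cv::gpu::bitwise_not(d_src, d_dst);          // unmasked: whole rows, vectorized uint loads
    cv::gpu::bitwise_not(d_src, d_dst, d_mask);  // masked: one element per thread where mask != 0
}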