/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace matrix_reductions
    {
        // Performs reduction in shared memory
        template <int size, typename T>
        __device__ void sumInSmem(volatile T* data, const uint tid)
        {
            T sum = data[tid];

            if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
            if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
            if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }

            if (tid < 32)
            {
                // Warp-synchronous part: data is volatile, so no __syncthreads() is needed here
                if (size >= 64) data[tid] = sum = sum + data[tid + 32];
                if (size >= 32) data[tid] = sum = sum + data[tid + 16];
                if (size >= 16) data[tid] = sum = sum + data[tid + 8];
                if (size >= 8) data[tid] = sum = sum + data[tid + 4];
                if (size >= 4) data[tid] = sum = sum + data[tid + 2];
                if (size >= 2) data[tid] = sum = sum + data[tid + 1];
            }
        }

        struct Mask8U
        {
            explicit Mask8U(PtrStepb mask): mask(mask) {}

            __device__ __forceinline__ bool operator()(int y, int x) const
            {
                return mask.ptr(y)[x];
            }

            PtrStepb mask;
        };

        struct MaskTrue
        {
            __device__ __forceinline__ bool operator()(int y, int x) const
            {
                return true;
            }
        };

        //////////////////////////////////////////////////////////////////////////////
        // Min max

        // To avoid shared bank conflicts we convert each value into a value of
        // appropriate type (32 bits minimum)
        template <typename T> struct MinMaxTypeTraits {};
        template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
        template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
        template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
        template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
        template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
        template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
        template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

        namespace minmax
        {
            __constant__ int ctwidth;
            __constant__ int ctheight;

            // Global counter of blocks that have finished their work
            __device__ uint blocks_finished = 0;

            // Estimates a good thread configuration
            // - the threads variable satisfies threads.x * threads.y == 256
            void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
            {
                threads = dim3(32, 8);
                grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
                grid.x = std::min(grid.x, threads.x);
                grid.y = std::min(grid.y, threads.y);
            }

            // Returns required buffer sizes
            void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
            {
                dim3 threads, grid;
                estimateThreadCfg(cols, rows, threads, grid);
                bufcols = grid.x * grid.y * elem_size;
                bufrows = 2;
            }

            // Computes and uploads the device constants used by the kernels
            // for the specified thread configuration
            void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
            {
                int twidth = divUp(divUp(cols, grid.x), threads.x);
                int theight = divUp(divUp(rows, grid.y), threads.y);
                cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
                cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
            }

            // Does min and max in shared memory
            template <typename T>
            __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
            {
                minval[tid] = ::min(minval[tid], minval[tid + offset]);
                maxval[tid] = ::max(maxval[tid], maxval[tid + offset]);
            }

            template <int size, typename T>
            __device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid)
            {
                if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
                if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
                if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }

                if (tid < 32)
                {
                    if (size >= 64) merge(tid, 32, minval, maxval);
                    if (size >= 32) merge(tid, 16, minval, maxval);
                    if (size >= 16) merge(tid, 8, minval, maxval);
                    if (size >= 8) merge(tid, 4, minval, maxval);
                    if (size >= 4) merge(tid, 2, minval, maxval);
                    if (size >= 2) merge(tid, 1, minval, maxval);
                }
            }

            template <int nthreads, typename T, typename Mask>
            __global__ void minMaxKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval)
            {
                typedef typename MinMaxTypeTraits<T>::best_type best_type;
                __shared__ best_type sminval[nthreads];
                __shared__ best_type smaxval[nthreads];

                uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
                uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
                uint tid = threadIdx.y * blockDim.x + threadIdx.x;

                T mymin = numeric_limits<T>::max();
                T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
                uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
                uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);

                for (uint y = y0; y < y_end; y += blockDim.y)
                {
                    const T* src_row = (const T*)src.ptr(y);
                    for (uint x = x0; x < x_end; x += blockDim.x)
                    {
                        T val = src_row[x];
                        if (mask(y, x))
                        {
                            mymin = ::min(mymin, val);
                            mymax = ::max(mymax, val);
                        }
                    }
                }

                sminval[tid] = mymin;
                smaxval[tid] = mymax;
                __syncthreads();

                findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);

            #if __CUDA_ARCH__ >= 110
                __shared__ bool is_last;

                if (tid == 0)
                {
                    minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
                    maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
                    __threadfence();

                    uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
                    is_last = ticket == gridDim.x * gridDim.y - 1;
                }

                __syncthreads();

                if (is_last)
                {
                    // The last block to finish folds all per-block results into the final answer
                    uint idx = ::min(tid, gridDim.x * gridDim.y - 1);

                    sminval[tid] = minval[idx];
                    smaxval[tid] = maxval[idx];
                    __syncthreads();

                    findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);

                    if (tid == 0)
                    {
                        minval[0] = (T)sminval[0];
                        maxval[0] = (T)smaxval[0];
                        blocks_finished = 0;
                    }
                }
            #else
                if (tid == 0)
                {
                    minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
                    maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
                }
            #endif
            }

            template <typename T>
            void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
                setKernelConsts(src.cols, src.rows, threads, grid);

                T* minval_buf = (T*)buf.ptr(0);
                T* maxval_buf = (T*)buf.ptr(1);

                minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );

                T minval_, maxval_;
                cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
                cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
                *minval = minval_;
                *maxval = maxval_;
            }

            template void minMaxMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskCaller<double>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);

            template <typename T>
            void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
                setKernelConsts(src.cols, src.rows, threads, grid);

                T* minval_buf = (T*)buf.ptr(0);
                T* maxval_buf = (T*)buf.ptr(1);

                minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall( cudaDeviceSynchronize() );

                T minval_, maxval_;
                cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
                cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
                *minval = minval_;
                *maxval = maxval_;
            }

            template void minMaxCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);
            template void minMaxCaller<char>(const DevMem2Db, double*, double*, PtrStepb);
            template void minMaxCaller<ushort>(const DevMem2Db, double*, double*, PtrStepb);
            template void minMaxCaller<short>(const DevMem2Db, double*, double*, PtrStepb);
            template void minMaxCaller<int>(const DevMem2Db, double*, double*, PtrStepb);
            template void minMaxCaller<float>(const DevMem2Db, double*, double*, PtrStepb);
            template void minMaxCaller<double>(const DevMem2Db, double*, double*, PtrStepb);

            template <int nthreads, typename T>
            __global__ void minMaxPass2Kernel(T* minval, T* maxval, int size)
            {
                typedef typename MinMaxTypeTraits<T>::best_type best_type;
                __shared__ best_type sminval[nthreads];
                __shared__ best_type smaxval[nthreads];

                uint tid = threadIdx.y * blockDim.x + threadIdx.x;
                uint idx = ::min(tid, size - 1);

                sminval[tid] = minval[idx];
                smaxval[tid] = maxval[idx];
                __syncthreads();

                findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);

                if (tid == 0)
                {
                    minval[0] = (T)sminval[0];
                    maxval[0] = (T)smaxval[0];
                }
            }

            template <typename T>
            void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
                setKernelConsts(src.cols, src.rows, threads, grid);

                T* minval_buf = (T*)buf.ptr(0);
                T* maxval_buf = (T*)buf.ptr(1);

                minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
                cudaSafeCall( cudaGetLastError() );
                minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
                cudaSafeCall( cudaGetLastError() );

                cudaSafeCall(cudaDeviceSynchronize());

                T minval_, maxval_;
                cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
                cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
                *minval = minval_;
                *maxval = maxval_;
            }

            template void minMaxMaskMultipassCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskMultipassCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskMultipassCaller<ushort>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskMultipassCaller<short>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskMultipassCaller<int>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
            template void minMaxMaskMultipassCaller<float>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);

            template <typename T>
            void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf)
            {
                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
                setKernelConsts(src.cols, src.rows, threads, grid);

                T*
minval_buf = (T*)buf.ptr(0); T* maxval_buf = (T*)buf.ptr(1); minMaxKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); cudaSafeCall( cudaGetLastError() ); minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); *minval = minval_; *maxval = maxval_; } template void minMaxMultipassCaller(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller(const DevMem2Db, double*, double*, PtrStepb); } // namespace minmax /////////////////////////////////////////////////////////////////////////////// // minMaxLoc namespace minmaxloc { __constant__ int ctwidth; __constant__ int ctheight; // Global counter of blocks finished its work __device__ uint blocks_finished = 0; // Estimates good thread configuration // - threads variable satisfies to threads.x * threads.y == 256 void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) { threads = dim3(32, 8); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); grid.x = std::min(grid.x, threads.x); grid.y = std::min(grid.y, threads.y); } // Returns required buffer sizes void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows) { dim3 threads, grid; estimateThreadCfg(cols, rows, threads, grid); b1cols = grid.x * grid.y * elem_size; // For values b1rows = 2; b2cols = grid.x * grid.y * sizeof(int); // For locations b2rows = 2; } // Estimates device constants which are used in the kernels using specified thread configuration void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) { int twidth = divUp(divUp(cols, grid.x), threads.x); int theight = divUp(divUp(rows, grid.y), threads.y); cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); } template __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, volatile uint* minloc, volatile uint* maxloc) { T val = minval[tid + offset]; if (val < minval[tid]) { minval[tid] = val; minloc[tid] = minloc[tid + offset]; } val = maxval[tid + offset]; if (val > maxval[tid]) { maxval[tid] = val; maxloc[tid] = maxloc[tid + offset]; } } template __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc, volatile uint* maxloc, const uint tid) { if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); } if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); } if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); } if (tid < 32) { if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc); if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc); if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc); if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc); if (size 
>= 4) merge(tid, 2, minval, maxval, minloc, maxloc); if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc); } } template __global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval, uint* minloc, uint* maxloc) { typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; __shared__ best_type smaxval[nthreads]; __shared__ uint sminloc[nthreads]; __shared__ uint smaxloc[nthreads]; uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; uint tid = threadIdx.y * blockDim.x + threadIdx.x; T mymin = numeric_limits::max(); T mymax = numeric_limits::is_signed ? -numeric_limits::max() : numeric_limits::min(); uint myminloc = 0; uint mymaxloc = 0; uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); for (uint y = y0; y < y_end; y += blockDim.y) { const T* ptr = (const T*)src.ptr(y); for (uint x = x0; x < x_end; x += blockDim.x) { if (mask(y, x)) { T val = ptr[x]; if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; } if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; } } } } sminval[tid] = mymin; smaxval[tid] = mymax; sminloc[tid] = myminloc; smaxloc[tid] = mymaxloc; __syncthreads(); findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); #if __CUDA_ARCH__ >= 110 __shared__ bool is_last; if (tid == 0) { minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; __threadfence(); uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = ticket == gridDim.x * gridDim.y - 1; } __syncthreads(); if (is_last) { uint idx = ::min(tid, gridDim.x * gridDim.y - 1); sminval[tid] = minval[idx]; smaxval[tid] = maxval[idx]; sminloc[tid] = minloc[idx]; smaxloc[tid] = maxloc[idx]; __syncthreads(); findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); if (tid == 0) { minval[0] = (T)sminval[0]; maxval[0] = (T)smaxval[0]; minloc[0] = sminloc[0]; maxloc[0] = smaxloc[0]; blocks_finished = 0; } } #else if (tid == 0) { minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; } #endif } template void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) { dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); minMaxLocKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); *minval = minval_; *maxval = maxval_; uint minloc_, maxloc_; cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), 
cudaMemcpyDeviceToHost) ); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } template void minMaxLocMaskCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) { dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); minMaxLocKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); *minval = minval_; *maxval = maxval_; uint minloc_, maxloc_; cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } template void minMaxLocCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); // This kernel will be used only when compute capability is 1.0 template __global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size) { typedef typename MinMaxTypeTraits::best_type best_type; __shared__ best_type sminval[nthreads]; __shared__ best_type smaxval[nthreads]; __shared__ uint sminloc[nthreads]; __shared__ uint smaxloc[nthreads]; uint tid = threadIdx.y * blockDim.x + threadIdx.x; uint idx = ::min(tid, size - 1); sminval[tid] = minval[idx]; smaxval[tid] = maxval[idx]; sminloc[tid] = minloc[idx]; smaxloc[tid] = maxloc[idx]; __syncthreads(); findMinMaxLocInSmem(sminval, 
smaxval, sminloc, smaxloc, tid); if (tid == 0) { minval[0] = (T)sminval[0]; maxval[0] = (T)smaxval[0]; minloc[0] = sminloc[0]; maxloc[0] = smaxloc[0]; } } template void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) { dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); minMaxLocKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf); cudaSafeCall( cudaGetLastError() ); minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); *minval = minval_; *maxval = maxval_; uint minloc_, maxloc_; cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } template void minMaxLocMaskMultipassCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskMultipassCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskMultipassCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskMultipassCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskMultipassCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMaskMultipassCaller(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval, int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) { dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); T* minval_buf = (T*)valbuf.ptr(0); T* maxval_buf = (T*)valbuf.ptr(1); uint* minloc_buf = (uint*)locbuf.ptr(0); uint* maxloc_buf = (uint*)locbuf.ptr(1); minMaxLocKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf); cudaSafeCall( cudaGetLastError() ); minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); T minval_, maxval_; cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); *minval = minval_; *maxval = maxval_; uint minloc_, maxloc_; cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; 
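                // maxloc_ is likewise a flattened index (y * src.cols + x); the row goes to maxloc[1], the column to maxloc[0]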
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; } template void minMaxLocMultipassCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); } // namespace minmaxloc ////////////////////////////////////////////////////////////////////////////////////////////////////////// // countNonZero namespace countnonzero { __constant__ int ctwidth; __constant__ int ctheight; __device__ uint blocks_finished = 0; void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) { threads = dim3(32, 8); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); grid.x = std::min(grid.x, threads.x); grid.y = std::min(grid.y, threads.y); } void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows) { dim3 threads, grid; estimateThreadCfg(cols, rows, threads, grid); bufcols = grid.x * grid.y * sizeof(int); bufrows = 1; } void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) { int twidth = divUp(divUp(cols, grid.x), threads.x); int theight = divUp(divUp(rows, grid.y), threads.y); cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); } template __global__ void countNonZeroKernel(const DevMem2Db src, volatile uint* count) { __shared__ uint scount[nthreads]; uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; uint tid = threadIdx.y * blockDim.x + threadIdx.x; uint cnt = 0; for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) { const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) cnt += ptr[x0 + x * blockDim.x] != 0; } scount[tid] = cnt; __syncthreads(); sumInSmem(scount, tid); #if __CUDA_ARCH__ >= 110 __shared__ bool is_last; if (tid == 0) { count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; __threadfence(); uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = ticket == gridDim.x * gridDim.y - 1; } __syncthreads(); if (is_last) { scount[tid] = tid < gridDim.x * gridDim.y ? 
count[tid] : 0; __syncthreads(); sumInSmem(scount, tid); if (tid == 0) { count[0] = scount[0]; blocks_finished = 0; } } #else if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; #endif } template int countNonZeroCaller(const DevMem2Db src, PtrStepb buf) { dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); uint* count_buf = (uint*)buf.ptr(0); countNonZeroKernel<256, T><<>>(src, count_buf); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); uint count; cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); return count; } template int countNonZeroCaller(const DevMem2Db, PtrStepb); template int countNonZeroCaller(const DevMem2Db, PtrStepb); template int countNonZeroCaller(const DevMem2Db, PtrStepb); template int countNonZeroCaller(const DevMem2Db, PtrStepb); template int countNonZeroCaller(const DevMem2Db, PtrStepb); template int countNonZeroCaller(const DevMem2Db, PtrStepb); template int countNonZeroCaller(const DevMem2Db, PtrStepb); template __global__ void countNonZeroPass2Kernel(uint* count, int size) { __shared__ uint scount[nthreads]; uint tid = threadIdx.y * blockDim.x + threadIdx.x; scount[tid] = tid < size ? count[tid] : 0; __syncthreads(); sumInSmem(scount, tid); if (tid == 0) count[0] = scount[0]; } template int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf) { dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); uint* count_buf = (uint*)buf.ptr(0); countNonZeroKernel<256, T><<>>(src, count_buf); cudaSafeCall( cudaGetLastError() ); countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); uint count; cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); return count; } template int countNonZeroMultipassCaller(const DevMem2Db, PtrStepb); template int countNonZeroMultipassCaller(const DevMem2Db, PtrStepb); template int countNonZeroMultipassCaller(const DevMem2Db, PtrStepb); template int countNonZeroMultipassCaller(const DevMem2Db, PtrStepb); template int countNonZeroMultipassCaller(const DevMem2Db, PtrStepb); template int countNonZeroMultipassCaller(const DevMem2Db, PtrStepb); } // namespace countnonzero ////////////////////////////////////////////////////////////////////////// // Sum namespace sum { template struct SumType {}; template <> struct SumType { typedef uint R; }; template <> struct SumType { typedef int R; }; template <> struct SumType { typedef uint R; }; template <> struct SumType { typedef int R; }; template <> struct SumType { typedef int R; }; template <> struct SumType { typedef float R; }; template <> struct SumType { typedef double R; }; template struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } }; template struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } }; template <> struct AbsOp { static __device__ __forceinline__ uint call(uint x) { return x; } }; template struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } }; __constant__ int ctwidth; __constant__ int ctheight; __device__ uint blocks_finished = 0; const int threads_x = 32; const int threads_y = 8; void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) { threads = dim3(threads_x, threads_y); grid = dim3(divUp(cols, threads.x * threads.y), divUp(rows, 
threads.y * threads.x)); grid.x = std::min(grid.x, threads.x); grid.y = std::min(grid.y, threads.y); } void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows) { dim3 threads, grid; estimateThreadCfg(cols, rows, threads, grid); bufcols = grid.x * grid.y * sizeof(double) * cn; bufrows = 1; } void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) { int twidth = divUp(divUp(cols, grid.x), threads.x); int theight = divUp(divUp(rows, grid.y), threads.y); cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); } template __global__ void sumKernel(const DevMem2Db src, R* result) { __shared__ R smem[nthreads]; const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int bid = blockIdx.y * gridDim.x + blockIdx.x; R sum = 0; for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) { const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) sum += Op::call(ptr[x0 + x * blockDim.x]); } smem[tid] = sum; __syncthreads(); sumInSmem(smem, tid); #if __CUDA_ARCH__ >= 110 __shared__ bool is_last; if (tid == 0) { result[bid] = smem[0]; __threadfence(); uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } __syncthreads(); if (is_last) { smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0; __syncthreads(); sumInSmem(smem, tid); if (tid == 0) { result[0] = smem[0]; blocks_finished = 0; } } #else if (tid == 0) result[bid] = smem[0]; #endif } template __global__ void sumPass2Kernel(R* result, int size) { __shared__ R smem[nthreads]; int tid = threadIdx.y * blockDim.x + threadIdx.x; smem[tid] = tid < size ? result[tid] : 0; __syncthreads(); sumInSmem(smem, tid); if (tid == 0) result[0] = smem[0]; } template __global__ void sumKernel_C2(const DevMem2Db src, typename TypeVec::vec_type* result) { typedef typename TypeVec::vec_type SrcType; typedef typename TypeVec::vec_type DstType; __shared__ R smem[nthreads * 2]; const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int bid = blockIdx.y * gridDim.x + blockIdx.x; SrcType val; DstType sum = VecTraits::all(0); for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) { const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) { val = ptr[x0 + x * blockDim.x]; sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y)); } } smem[tid] = sum.x; smem[tid + nthreads] = sum.y; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); #if __CUDA_ARCH__ >= 110 __shared__ bool is_last; if (tid == 0) { DstType res; res.x = smem[0]; res.y = smem[nthreads]; result[bid] = res; __threadfence(); uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } __syncthreads(); if (is_last) { DstType res = tid < gridDim.x * gridDim.y ? 
result[tid] : VecTraits::all(0); smem[tid] = res.x; smem[tid + nthreads] = res.y; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); if (tid == 0) { res.x = smem[0]; res.y = smem[nthreads]; result[0] = res; blocks_finished = 0; } } #else if (tid == 0) { DstType res; res.x = smem[0]; res.y = smem[nthreads]; result[bid] = res; } #endif } template __global__ void sumPass2Kernel_C2(typename TypeVec::vec_type* result, int size) { typedef typename TypeVec::vec_type DstType; __shared__ R smem[nthreads * 2]; const int tid = threadIdx.y * blockDim.x + threadIdx.x; DstType res = tid < size ? result[tid] : VecTraits::all(0); smem[tid] = res.x; smem[tid + nthreads] = res.y; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); if (tid == 0) { res.x = smem[0]; res.y = smem[nthreads]; result[0] = res; } } template __global__ void sumKernel_C3(const DevMem2Db src, typename TypeVec::vec_type* result) { typedef typename TypeVec::vec_type SrcType; typedef typename TypeVec::vec_type DstType; __shared__ R smem[nthreads * 3]; const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int bid = blockIdx.y * gridDim.x + blockIdx.x; SrcType val; DstType sum = VecTraits::all(0); for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) { const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) { val = ptr[x0 + x * blockDim.x]; sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), Op::call(val.z)); } } smem[tid] = sum.x; smem[tid + nthreads] = sum.y; smem[tid + 2 * nthreads] = sum.z; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); sumInSmem(smem + 2 * nthreads, tid); #if __CUDA_ARCH__ >= 110 __shared__ bool is_last; if (tid == 0) { DstType res; res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; result[bid] = res; __threadfence(); uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } __syncthreads(); if (is_last) { DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); smem[tid] = res.x; smem[tid + nthreads] = res.y; smem[tid + 2 * nthreads] = res.z; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); sumInSmem(smem + 2 * nthreads, tid); if (tid == 0) { res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; result[0] = res; blocks_finished = 0; } } #else if (tid == 0) { DstType res; res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; result[bid] = res; } #endif } template __global__ void sumPass2Kernel_C3(typename TypeVec::vec_type* result, int size) { typedef typename TypeVec::vec_type DstType; __shared__ R smem[nthreads * 3]; const int tid = threadIdx.y * blockDim.x + threadIdx.x; DstType res = tid < size ? 
result[tid] : VecTraits::all(0); smem[tid] = res.x; smem[tid + nthreads] = res.y; smem[tid + 2 * nthreads] = res.z; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); sumInSmem(smem + 2 * nthreads, tid); if (tid == 0) { res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; result[0] = res; } } template __global__ void sumKernel_C4(const DevMem2Db src, typename TypeVec::vec_type* result) { typedef typename TypeVec::vec_type SrcType; typedef typename TypeVec::vec_type DstType; __shared__ R smem[nthreads * 4]; const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int bid = blockIdx.y * gridDim.x + blockIdx.x; SrcType val; DstType sum = VecTraits::all(0); for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) { const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) { val = ptr[x0 + x * blockDim.x]; sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), Op::call(val.z), Op::call(val.w)); } } smem[tid] = sum.x; smem[tid + nthreads] = sum.y; smem[tid + 2 * nthreads] = sum.z; smem[tid + 3 * nthreads] = sum.w; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); sumInSmem(smem + 2 * nthreads, tid); sumInSmem(smem + 3 * nthreads, tid); #if __CUDA_ARCH__ >= 110 __shared__ bool is_last; if (tid == 0) { DstType res; res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; res.w = smem[3 * nthreads]; result[bid] = res; __threadfence(); uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); is_last = (ticket == gridDim.x * gridDim.y - 1); } __syncthreads(); if (is_last) { DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); smem[tid] = res.x; smem[tid + nthreads] = res.y; smem[tid + 2 * nthreads] = res.z; smem[tid + 3 * nthreads] = res.w; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); sumInSmem(smem + 2 * nthreads, tid); sumInSmem(smem + 3 * nthreads, tid); if (tid == 0) { res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; res.w = smem[3 * nthreads]; result[0] = res; blocks_finished = 0; } } #else if (tid == 0) { DstType res; res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; res.w = smem[3 * nthreads]; result[bid] = res; } #endif } template __global__ void sumPass2Kernel_C4(typename TypeVec::vec_type* result, int size) { typedef typename TypeVec::vec_type DstType; __shared__ R smem[nthreads * 4]; const int tid = threadIdx.y * blockDim.x + threadIdx.x; DstType res = tid < size ? 
result[tid] : VecTraits::all(0); smem[tid] = res.x; smem[tid + nthreads] = res.y; smem[tid + 2 * nthreads] = res.z; smem[tid + 3 * nthreads] = res.w; __syncthreads(); sumInSmem(smem, tid); sumInSmem(smem + nthreads, tid); sumInSmem(smem + 2 * nthreads, tid); sumInSmem(smem + 3 * nthreads, tid); if (tid == 0) { res.x = smem[0]; res.y = smem[nthreads]; res.z = smem[2 * nthreads]; res.w = smem[3 * nthreads]; result[0] = res; } } template void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) { typedef typename SumType::R R; dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 2: sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 3: sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 4: sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; } cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); sum[0] = result[0]; sum[1] = result[1]; sum[2] = result[2]; sum[3] = result[3]; } template void sumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) { typedef typename SumType::R R; dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 2: sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 3: sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 4: sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; } cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); sum[0] = result[0]; sum[1] = result[1]; sum[2] = result[2]; sum[3] = result[3]; } template void sumCaller(const DevMem2Db, PtrStepb, double*, int); template void sumCaller(const 
DevMem2Db, PtrStepb, double*, int); template void sumCaller(const DevMem2Db, PtrStepb, double*, int); template void sumCaller(const DevMem2Db, PtrStepb, double*, int); template void sumCaller(const DevMem2Db, PtrStepb, double*, int); template void sumCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) { typedef typename SumType::R R; dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 2: sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 3: sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 4: sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; } cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); sum[0] = result[0]; sum[1] = result[1]; sum[2] = result[2]; sum[3] = result[3]; } template void absSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) { typedef typename SumType::R R; dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 2: sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 3: sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 4: sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; } cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); sum[0] = result[0]; sum[1] = result[1]; sum[2] = result[2]; sum[3] = result[3]; } template void absSumCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumCaller(const DevMem2Db, PtrStepb, double*, int); 
template void absSumCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumCaller(const DevMem2Db, PtrStepb, double*, int); template void absSumCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) { typedef typename SumType::R R; dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 2: sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 3: sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; case 4: sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); cudaSafeCall( cudaGetLastError() ); sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); cudaSafeCall( cudaGetLastError() ); break; } cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); sum[0] = result[0]; sum[1] = result[1]; sum[2] = result[2]; sum[3] = result[3]; } template void sqrSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumMultipassCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) { typedef typename SumType::R R; dim3 threads, grid; estimateThreadCfg(src.cols, src.rows, threads, grid); setKernelConsts(src.cols, src.rows, threads, grid); switch (cn) { case 1: sumKernel, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 2: sumKernel_C2, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 3: sumKernel_C3, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; case 4: sumKernel_C4, threads_x * threads_y><<>>( src, (typename TypeVec::vec_type*)buf.ptr(0)); break; } cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); R result[4] = {0, 0, 0, 0}; cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); sum[0] = result[0]; sum[1] = result[1]; sum[2] = result[2]; sum[3] = result[3]; } template void sqrSumCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller(const DevMem2Db, PtrStepb, double*, int); template void 
sqrSumCaller(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller(const DevMem2Db, PtrStepb, double*, int); } // namespace sum ////////////////////////////////////////////////////////////////////////////// // reduce template struct SumReductor { __device__ __forceinline__ S startValue() const { return 0; } __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const { return a + b; } __device__ __forceinline__ S result(S r, double) const { return r; } }; template struct AvgReductor { __device__ __forceinline__ S startValue() const { return 0; } __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const { return a + b; } __device__ __forceinline__ double result(S r, double sz) const { return r / sz; } }; template struct MinReductor { __device__ __forceinline__ S startValue() const { return numeric_limits::max(); } template __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const { return saturate_cast(::min(a, b)); } __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const { return ::fmin(a, b); } __device__ __forceinline__ S result(S r, double) const { return r; } }; template struct MaxReductor { __device__ __forceinline__ S startValue() const { return numeric_limits::min(); } template __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const { return ::max(a, b); } __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const { return ::fmax(a, b); } __device__ __forceinline__ S result(S r, double) const { return r; } }; template __global__ void reduceRows(const DevMem2D_ src, D* dst, const Op op) { __shared__ S smem[16 * 16]; const int x = blockIdx.x * 16 + threadIdx.x; S myVal = op.startValue(); if (x < src.cols) { for (int y = threadIdx.y; y < src.rows; y += 16) myVal = op(myVal, src.ptr(y)[x]); } smem[threadIdx.x * 16 + threadIdx.y] = myVal; __syncthreads(); if (threadIdx.x < 8) { volatile S* srow = smem + threadIdx.y * 16; srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]); srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]); srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]); srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]); } __syncthreads(); if (threadIdx.y == 0 && x < src.cols) dst[x] = saturate_cast(op.result(smem[threadIdx.x * 16], src.rows)); } template
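// ---------------------------------------------------------------------------
// Illustrative sketch (assumed host-side usage, not taken from this file): the
// *Caller functions above are driven by host wrappers such as cv::gpu::minMax().
// Assuming a single-channel 8-bit GpuMat `src`, the intended call sequence
// would look roughly like this:
//
//     using namespace cv::gpu::device::matrix_reductions;
//
//     int bufcols, bufrows;
//     minmax::getBufSizeRequired(src.cols, src.rows, src.elemSize(), bufcols, bufrows);
//     cv::gpu::GpuMat buf(bufrows, bufcols, CV_8U);   // row 0: per-block minima, row 1: per-block maxima
//
//     double minVal = 0, maxVal = 0;
//     minmax::minMaxCaller<uchar>(src, &minVal, &maxVal, buf);
//
// The single-pass callers rely on global atomics (guarded by __CUDA_ARCH__ >= 110);
// on compute capability 1.0 devices the *MultipassCaller variants, which launch an
// explicit second-pass reduction kernel, are used instead.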