diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index 558f9085d..5bd35bdc7 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -383,6 +383,89 @@ namespace cv { namespace gpu { namespace device if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } + + __global__ void shfl_integral_vertical(PtrStepSz buffer, PtrStepSz integral) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + __shared__ unsigned int sums[32][9]; + + const int tidx = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = tidx % 8; + + if (tidx >= integral.cols) + return; + + sums[threadIdx.x][threadIdx.y] = 0; + __syncthreads(); + + unsigned int stepSum = 0; + + for (int y = threadIdx.y; y < integral.rows; y += blockDim.y) + { + unsigned int* p = buffer.ptr(y) + tidx; + unsigned int* dst = integral.ptr(y + 1) + tidx + 1; + + unsigned int sum = *p; + + sums[threadIdx.x][threadIdx.y] = sum; + __syncthreads(); + + // place into SMEM + // shfl scan reduce the SMEM, reformating so the column + // sums are computed in a warp + // then read out properly + const int j = threadIdx.x % 8; + const int k = threadIdx.x / 8 + threadIdx.y * 4; + + int partial_sum = sums[k][j]; + + for (int i = 1; i <= 8; i *= 2) + { + int n = __shfl_up(partial_sum, i, 32); + + if (lane_id >= i) + partial_sum += n; + } + + sums[k][j] = partial_sum; + __syncthreads(); + + if (threadIdx.y > 0) + sum += sums[threadIdx.x][threadIdx.y - 1]; + + sum += stepSum; + stepSum += sums[threadIdx.x][blockDim.y - 1]; + + __syncthreads(); + + *dst = sum; + } + #endif + } + + // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed + // ToDo: partial dy + void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz buffer, PtrStepSz integral, + int blockStep, cudaStream_t stream) + { + { + const int block = blockStep; + const int grid = img.rows; + + cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); + + shfl_integral_horizontal<<>>((PtrStepSz) img, buffer); + cudaSafeCall( cudaGetLastError() ); + } + + { + const dim3 block(32, 8); + const dim3 grid(divUp(integral.cols, block.x), 1); + + shfl_integral_vertical<<>>((PtrStepSz)buffer, integral); + cudaSafeCall( cudaGetLastError() ); + } + } } }}} diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 7aef41abc..3391bb1a0 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -198,14 +198,14 @@ namespace icf { Node node = nodes[nId]; float threshold = rescale(level, node); - int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + int sum = get(x, y + (node.threshold >> 28) * 120, node.rect); int next = 1 + (int)(sum >= threshold); dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold); node = nodes[nId + next]; threshold = rescale(level, node); - sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + sum = get(x, y + (node.threshold >> 28) * 120, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[(st + threadIdx.x) * 4 + lShift]; diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index d9519e873..2d43a5440 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -76,14 +76,20 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale } namespace cv { namespace gpu { namespace device { + namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins); } -namespace imgproc -{ - void shfl_integral_gpu(PtrStepSzb img, PtrStepSz integral, cudaStream_t stream); + +namespace imgproc { + void shfl_integral_gpu_buffered(PtrStepSzb, PtrStepSz, PtrStepSz, int, cudaStream_t); + + template + void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, + PtrStepSzb dst, int interpolation, cudaStream_t stream); } + }}} struct cv::gpu::SoftCascade::Filds @@ -319,9 +325,13 @@ struct cv::gpu::SoftCascade::Filds plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1); luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); + shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1); - integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1); - hogluv.create((FRAME_HEIGHT / shr + 1) * HOG_LUV_BINS, FRAME_WIDTH / shr + 64, CV_32SC1); + integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1); + + hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1); + hogluv.setTo(cv::Scalar::all(0)); + detCounter.create(1,1, CV_32SC1); octaves.upload(hoctaves); @@ -432,16 +442,7 @@ private: GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS)); cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); - - fw /= shrinkage; - fh /= shrinkage; - - for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) - { - GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh )); - GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1)); - cv::gpu::integralBuffered(channel, sum, integralBuffer); - } + device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0); } public: diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index fb936be88..1146b062b 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -271,6 +271,7 @@ GPU_TEST_P(SoftCascadeTestAll, detect, ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U); } +//ToDo: fix me GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral, ALL_DEVICES )