From beb377b38cbd3ef346b596d9609e6a7a27c6128c Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Thu, 25 Oct 2012 13:27:14 +0400
Subject: [PATCH 001/155] gpu implementation of Dual TV-L1 Optical Flow

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  89 ++++++
 modules/gpu/perf/perf_video.cpp         |  50 ++++
 modules/gpu/src/cuda/tvl1flow.cu        | 344 ++++++++++++++++++++++++
 modules/gpu/src/tvl1flow.cpp            | 256 ++++++++++++++++++
 modules/gpu/test/test_video.cpp         |  39 +++
 samples/gpu/tvl1_optical_flow.cpp       | 172 ++++++++++++
 6 files changed, 950 insertions(+)
 create mode 100644 modules/gpu/src/cuda/tvl1flow.cu
 create mode 100644 modules/gpu/src/tvl1flow.cpp
 create mode 100644 samples/gpu/tvl1_optical_flow.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index ddb131788..2cbd45085 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1948,6 +1948,95 @@ private:
 };

+// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
+//
+// see reference:
+//   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+//   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+class CV_EXPORTS OpticalFlowDual_TVL1_GPU
+{
+public:
+    OpticalFlowDual_TVL1_GPU();
+
+    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
+
+    void collectGarbage();
+
+    /**
+     * Time step of the numerical scheme.
+     */
+    double tau;
+
+    /**
+     * Weight parameter for the data term, attachment parameter.
+     * This is the most relevant parameter, which determines the smoothness of the output.
+     * The smaller this parameter is, the smoother the solutions we obtain.
+     * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
+     */
+    double lambda;
+
+    /**
+     * Weight parameter for (u - v)^2, tightness parameter.
+     * It serves as a link between the attachment and the regularization terms.
+     * In theory, it should have a small value in order to maintain both parts in correspondence.
+     * The method is stable for a large range of values of this parameter.
+     */
+    double theta;
+
+    /**
+     * Number of scales used to create the pyramid of images.
+     */
+    int nscales;
+
+    /**
+     * Number of warpings per scale.
+     * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
+     * This is a parameter that assures the stability of the method.
+     * It also affects the running time, so it is a compromise between speed and accuracy.
+     */
+    int warps;
+
+    /**
+     * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
+     * A small value will yield more accurate solutions at the expense of a slower convergence.
+     */
+    double epsilon;
+
+    /**
+     * Stopping criterion iterations number used in the numerical scheme.
+     */
+    int iterations;
+
+    bool useInitialFlow;
+
+private:
+    void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2);
+
+    std::vector<GpuMat> I0s;
+    std::vector<GpuMat> I1s;
+    std::vector<GpuMat> u1s;
+    std::vector<GpuMat> u2s;
+
+    GpuMat I1x_buf;
+    GpuMat I1y_buf;
+
+    GpuMat I1w_buf;
+    GpuMat I1wx_buf;
+    GpuMat I1wy_buf;
+
+    GpuMat grad_buf;
+    GpuMat rho_c_buf;
+
+    GpuMat p11_buf;
+    GpuMat p12_buf;
+    GpuMat p21_buf;
+    GpuMat p22_buf;
+
+    GpuMat diff_buf;
+    GpuMat norm_buf;
+};
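For orientation, a minimal host-side usage sketch of the class above (hypothetical file names and parameter choices; error handling omitted):

    #include "opencv2/core/core.hpp"
    #include "opencv2/highgui/highgui.hpp"
    #include "opencv2/gpu/gpu.hpp"

    // Compute dense flow between two grayscale frames on the GPU.
    cv::Mat f0 = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE); // placeholder path
    cv::Mat f1 = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE); // placeholder path

    cv::gpu::GpuMat d_f0(f0), d_f1(f1), d_u, d_v;

    cv::gpu::OpticalFlowDual_TVL1_GPU tvl1;
    tvl1.lambda = 0.1;          // smaller lambda yields a smoother field than the 0.15 default
    tvl1(d_f0, d_f1, d_u, d_v); // d_u, d_v become CV_32FC1 per-pixel displacements

    cv::Mat u(d_u), v(d_v);     // download for CPU-side processing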
 //! Interpolate frames (images) using provided optical flow (displacement field).
 //! frame0   - frame 0 (32-bit floating point images, single channel)
 //! frame1   - frame 1 (the same type and size)

diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp
index b18cb17df..b228580fd 100644
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
@@ -394,6 +394,56 @@ PERF_TEST_P(ImagePair, Video_FarnebackOpticalFlow,
     }
 }

+//////////////////////////////////////////////////////
+// OpticalFlowDual_TVL1
+
+PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1,
+    Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
+{
+    declare.time(20);
+
+    cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_frame0(frame0);
+        cv::gpu::GpuMat d_frame1(frame1);
+        cv::gpu::GpuMat d_flowx;
+        cv::gpu::GpuMat d_flowy;
+
+        cv::gpu::OpticalFlowDual_TVL1_GPU d_alg;
+
+        d_alg(d_frame0, d_frame1, d_flowx, d_flowy);
+
+        TEST_CYCLE()
+        {
+            d_alg(d_frame0, d_frame1, d_flowx, d_flowy);
+        }
+
+        GPU_SANITY_CHECK(d_flowx);
+        GPU_SANITY_CHECK(d_flowy);
+    }
+    else
+    {
+        cv::Mat flow;
+
+        cv::OpticalFlowDual_TVL1 alg;
+
+        alg(frame0, frame1, flow);
+
+        TEST_CYCLE()
+        {
+            alg(frame0, frame1, flow);
+        }
+
+        CPU_SANITY_CHECK(flow);
+    }
+}
+
 //////////////////////////////////////////////////////
 // FGDStatModel
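Once built, this benchmark can be run in isolation through the usual Google Test filter (a sketch; the binary name depends on the build layout):

    ./bin/opencv_perf_gpu --gtest_filter=*OpticalFlowDual_TVL1*

Note that both branches invoke the algorithm once before TEST_CYCLE, so a warm-up run (including CUDA module loading) is excluded from the measured time.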
diff --git a/modules/gpu/src/cuda/tvl1flow.cu b/modules/gpu/src/cuda/tvl1flow.cu
new file mode 100644
index 000000000..dc07d2f2e
--- /dev/null
+++ b/modules/gpu/src/cuda/tvl1flow.cu
@@ -0,0 +1,344 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
+#include "opencv2/gpu/device/limits.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+////////////////////////////////////////////////////////////
+// centeredGradient
+
+namespace
+{
+    __global__ void centeredGradient(const PtrStepSzf src, PtrStepf dx, PtrStepf dy)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= src.cols || y >= src.rows)
+            return;
+
+        dx(y, x) = 0.5f * (src(y, ::min(x + 1, src.cols - 1)) - src(y, ::max(x - 1, 0)));
+        dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
+    }
+}
+
+namespace tvl1flow
+{
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+
+        ::centeredGradient<<<grid, block>>>(src, dx, dy);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+////////////////////////////////////////////////////////////
+// warpBackward
+
+namespace
+{
+    static __device__ __forceinline__ float bicubicCoeff(float x_)
+    {
+        float x = fabsf(x_);
+        if (x <= 1.0f)
+        {
+            return x * x * (1.5f * x - 2.5f) + 1.0f;
+        }
+        else if (x < 2.0f)
+        {
+            return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
+        }
+        else
+        {
+            return 0.0f;
+        }
+    }
+
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+    __global__ void warpBackward(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= I0.cols || y >= I0.rows)
+            return;
+
+        const float u1Val = u1(y, x);
+        const float u2Val = u2(y, x);
+
+        const float wx = x + u1Val;
+        const float wy = y + u2Val;
+
+        const int xmin = ::ceilf(wx - 2.0f);
+        const int xmax = ::floorf(wx + 2.0f);
+
+        const int ymin = ::ceilf(wy - 2.0f);
+        const int ymax = ::floorf(wy + 2.0f);
+
+        float sum  = 0.0f;
+        float sumx = 0.0f;
+        float sumy = 0.0f;
+        float wsum = 0.0f;
+
+        for (int cy = ymin; cy <= ymax; ++cy)
+        {
+            for (int cx = xmin; cx <= xmax; ++cx)
+            {
+                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
+
+                sum  += w * tex2D(tex_I1 , cx, cy);
+                sumx += w * tex2D(tex_I1x, cx, cy);
+                sumy += w * tex2D(tex_I1y, cx, cy);
+
+                wsum += w;
+            }
+        }
+
+        const float coeff = 1.0f / wsum;
+
+        const float I1wVal  = sum  * coeff;
+        const float I1wxVal = sumx * coeff;
+        const float I1wyVal = sumy * coeff;
+
+        I1w (y, x) = I1wVal;
+        I1wx(y, x) = I1wxVal;
+        I1wy(y, x) = I1wyVal;
+
+        const float Ix2 = I1wxVal * I1wxVal;
+        const float Iy2 = I1wyVal * I1wyVal;
+
+        // store the |Grad(I1)|^2
+        grad(y, x) = Ix2 + Iy2;
+
+        // compute the constant part of the rho function
+        const float I0Val = I0(y, x);
+        rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
+    }
+}
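For reference, rho caches the constant part of the linearized data term from [2]. With $u^0 = (u_1^0, u_2^0)$ the flow at the current warp,

$$\rho(u) = I_1(x + u^0) + \nabla I_1(x + u^0) \cdot (u - u^0) - I_0(x),$$

so the kernel stores $I_1^w - I_{1x}^w u_1^0 - I_{1y}^w u_2^0 - I_0$, and the estimateU kernel below adds back the $\nabla I_1(x + u^0) \cdot u$ term for the current iterate.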
+namespace tvl1flow
+{
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
+
+        bindTexture(&tex_I1 , I1);
+        bindTexture(&tex_I1x, I1x);
+        bindTexture(&tex_I1y, I1y);
+
+        ::warpBackward<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+////////////////////////////////////////////////////////////
+// estimateU
+
+namespace
+{
+    __device__ float divergence(const PtrStepf& v1, const PtrStepf& v2, int y, int x)
+    {
+        if (x > 0 && y > 0)
+        {
+            const float v1x = v1(y, x) - v1(y, x - 1);
+            const float v2y = v2(y, x) - v2(y - 1, x);
+            return v1x + v2y;
+        }
+        else
+        {
+            if (y > 0)
+                return v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
+            else
+            {
+                if (x > 0)
+                    return v1(0, x) - v1(0, x - 1) + v2(0, x);
+                else
+                    return v1(0, 0) + v2(0, 0);
+            }
+        }
+    }
+
+    __global__ void estimateU(const PtrStepSzf I1wx, const PtrStepf I1wy,
+                              const PtrStepf grad, const PtrStepf rho_c,
+                              const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
+                              PtrStepf u1, PtrStepf u2, PtrStepf error,
+                              const float l_t, const float theta)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= I1wx.cols || y >= I1wx.rows)
+            return;
+
+        const float I1wxVal = I1wx(y, x);
+        const float I1wyVal = I1wy(y, x);
+        const float gradVal = grad(y, x);
+        const float u1OldVal = u1(y, x);
+        const float u2OldVal = u2(y, x);
+
+        const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
+
+        // estimate the values of the variable (v1, v2) (thresholding operator TH)
+
+        float d1 = 0.0f;
+        float d2 = 0.0f;
+
+        if (rho < -l_t * gradVal)
+        {
+            d1 = l_t * I1wxVal;
+            d2 = l_t * I1wyVal;
+        }
+        else if (rho > l_t * gradVal)
+        {
+            d1 = -l_t * I1wxVal;
+            d2 = -l_t * I1wyVal;
+        }
+        else if (gradVal > numeric_limits<float>::epsilon())
+        {
+            const float fi = -rho / gradVal;
+            d1 = fi * I1wxVal;
+            d2 = fi * I1wyVal;
+        }
+
+        const float v1 = u1OldVal + d1;
+        const float v2 = u2OldVal + d2;
+
+        // compute the divergence of the dual variable (p1, p2)
+
+        const float div_p1 = divergence(p11, p12, y, x);
+        const float div_p2 = divergence(p21, p22, y, x);
+
+        // estimate the values of the optical flow (u1, u2)
+
+        const float u1NewVal = v1 + theta * div_p1;
+        const float u2NewVal = v2 + theta * div_p2;
+
+        u1(y, x) = u1NewVal;
+        u2(y, x) = u2NewVal;
+
+        const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
+        const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
+        error(y, x) = n1 + n2;
+    }
+}
+
+namespace tvl1flow
+{
+    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
+                   PtrStepSzf grad, PtrStepSzf rho_c,
+                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
+                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
+                   float l_t, float theta)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
+
+        ::estimateU<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
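The branch on rho above is the pointwise thresholding operator TH from [1]: with $l_t = \lambda\theta$ and $|\nabla I_1|^2$ cached in grad,

$$v = u + \begin{cases} \lambda\theta\,\nabla I_1 & \text{if } \rho(u) < -\lambda\theta\,|\nabla I_1|^2,\\ -\lambda\theta\,\nabla I_1 & \text{if } \rho(u) > \lambda\theta\,|\nabla I_1|^2,\\ -\rho(u)\,\nabla I_1 / |\nabla I_1|^2 & \text{otherwise,} \end{cases}$$

followed by the flow update $u = v + \theta\,\operatorname{div} p$. The per-pixel squared update $\|u^{\mathrm{new}} - u^{\mathrm{old}}\|^2$ written to error is what the host later sums against the epsilon-based stopping criterion.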
+
+////////////////////////////////////////////////////////////
+// estimateDualVariables
+
+namespace
+{
+    __global__ void estimateDualVariables(const PtrStepSzf u1, const PtrStepf u2, PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, const float taut)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (x >= u1.cols || y >= u1.rows)
+            return;
+
+        const float u1x = u1(y, ::min(x + 1, u1.cols - 1)) - u1(y, x);
+        const float u1y = u1(::min(y + 1, u1.rows - 1), x) - u1(y, x);
+
+        const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
+        const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);
+
+        const float g1 = ::hypotf(u1x, u1y);
+        const float g2 = ::hypotf(u2x, u2y);
+
+        const float ng1 = 1.0f + taut * g1;
+        const float ng2 = 1.0f + taut * g2;
+
+        p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
+        p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
+        p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
+        p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
+    }
+}
+
+namespace tvl1flow
+{
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
+
+        ::estimateDualVariables<<<grid, block>>>(u1, u2, p11, p12, p21, p22, taut);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#endif // !defined CUDA_DISABLER

diff --git a/modules/gpu/src/tvl1flow.cpp b/modules/gpu/src/tvl1flow.cpp
new file mode 100644
index 000000000..a598a9ecf
--- /dev/null
+++ b/modules/gpu/src/tvl1flow.cpp
@@ -0,0 +1,256 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#if !defined HAVE_CUDA || defined(CUDA_DISABLER) + +cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU() { throw_nogpu(); } +void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage() {} +void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } + +#else + +using namespace std; +using namespace cv; +using namespace cv::gpu; + +cv::gpu::OpticalFlowDual_TVL1_GPU::OpticalFlowDual_TVL1_GPU() +{ + tau = 0.25; + lambda = 0.15; + theta = 0.3; + nscales = 5; + warps = 5; + epsilon = 0.01; + iterations = 300; + useInitialFlow = false; +} + +void cv::gpu::OpticalFlowDual_TVL1_GPU::operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy) +{ + CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 ); + CV_Assert( I0.size() == I1.size() ); + CV_Assert( I0.type() == I1.type() ); + CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) ); + CV_Assert( nscales > 0 ); + + // allocate memory for the pyramid structure + I0s.resize(nscales); + I1s.resize(nscales); + u1s.resize(nscales); + u2s.resize(nscales); + + I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0); + I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 
1.0 : 255.0); + + if (!useInitialFlow) + { + flowx.create(I0.size(), CV_32FC1); + flowy.create(I0.size(), CV_32FC1); + } + + u1s[0] = flowx; + u2s[0] = flowy; + + I1x_buf.create(I0.size(), CV_32FC1); + I1y_buf.create(I0.size(), CV_32FC1); + + I1w_buf.create(I0.size(), CV_32FC1); + I1wx_buf.create(I0.size(), CV_32FC1); + I1wy_buf.create(I0.size(), CV_32FC1); + + grad_buf.create(I0.size(), CV_32FC1); + rho_c_buf.create(I0.size(), CV_32FC1); + + p11_buf.create(I0.size(), CV_32FC1); + p12_buf.create(I0.size(), CV_32FC1); + p21_buf.create(I0.size(), CV_32FC1); + p22_buf.create(I0.size(), CV_32FC1); + + diff_buf.create(I0.size(), CV_32FC1); + + // create the scales + for (int s = 1; s < nscales; ++s) + { + gpu::pyrDown(I0s[s - 1], I0s[s]); + gpu::pyrDown(I1s[s - 1], I1s[s]); + + if (I0s[s].cols < 16 || I0s[s].rows < 16) + { + nscales = s; + break; + } + + if (useInitialFlow) + { + gpu::pyrDown(u1s[s - 1], u1s[s]); + gpu::pyrDown(u2s[s - 1], u2s[s]); + + gpu::multiply(u1s[s], Scalar::all(0.5), u1s[s]); + gpu::multiply(u2s[s], Scalar::all(0.5), u2s[s]); + } + } + + // pyramidal structure for computing the optical flow + for (int s = nscales - 1; s >= 0; --s) + { + // compute the optical flow at the current scale + procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]); + + // if this was the last scale, finish now + if (s == 0) + break; + + // otherwise, upsample the optical flow + + // zoom the optical flow for the next finer scale + gpu::resize(u1s[s], u1s[s - 1], I0s[s - 1].size()); + gpu::resize(u2s[s], u2s[s - 1], I0s[s - 1].size()); + + // scale the optical flow with the appropriate zoom factor + gpu::multiply(u1s[s - 1], Scalar::all(2), u1s[s - 1]); + gpu::multiply(u2s[s - 1], Scalar::all(2), u2s[s - 1]); + } +} + +namespace tvl1flow +{ + void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy); + void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho); + void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy, + PtrStepSzf grad, PtrStepSzf rho_c, + PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, + PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error, + float l_t, float theta); + void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut); +} + +void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2) +{ + using namespace tvl1flow; + + const double scaledEpsilon = epsilon * epsilon * I0.size().area(); + + CV_DbgAssert( I1.size() == I0.size() ); + CV_DbgAssert( I1.type() == I0.type() ); + CV_DbgAssert( u1.empty() || u1.size() == I0.size() ); + CV_DbgAssert( u2.size() == u1.size() ); + + if (u1.empty()) + { + u1.create(I0.size(), CV_32FC1); + u1.setTo(Scalar::all(0)); + + u2.create(I0.size(), CV_32FC1); + u2.setTo(Scalar::all(0)); + } + + GpuMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows)); + GpuMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows)); + centeredGradient(I1, I1x, I1y); + + GpuMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows)); + GpuMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows)); + GpuMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows)); + + GpuMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows)); + GpuMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows)); + + GpuMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows)); + GpuMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows)); + GpuMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows)); + GpuMat p22 
= p22_buf(Rect(0, 0, I0.cols, I0.rows));
+    p11.setTo(Scalar::all(0));
+    p12.setTo(Scalar::all(0));
+    p21.setTo(Scalar::all(0));
+    p22.setTo(Scalar::all(0));
+
+    GpuMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
+
+    const float l_t = static_cast<float>(lambda * theta);
+    const float taut = static_cast<float>(tau / theta);
+
+    for (int warpings = 0; warpings < warps; ++warpings)
+    {
+        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
+
+        double error = numeric_limits<double>::max();
+        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
+        {
+            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta));
+
+            error = gpu::sum(diff, norm_buf)[0];
+
+            estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
+        }
+    }
+}
+
+void cv::gpu::OpticalFlowDual_TVL1_GPU::collectGarbage()
+{
+    I0s.clear();
+    I1s.clear();
+    u1s.clear();
+    u2s.clear();
+
+    I1x_buf.release();
+    I1y_buf.release();
+
+    I1w_buf.release();
+    I1wx_buf.release();
+    I1wy_buf.release();
+
+    grad_buf.release();
+    rho_c_buf.release();
+
+    p11_buf.release();
+    p12_buf.release();
+    p21_buf.release();
+    p22_buf.release();
+
+    diff_buf.release();
+    norm_buf.release();
+}
+
+#endif // !defined HAVE_CUDA || defined(CUDA_DISABLER)

diff --git a/modules/gpu/test/test_video.cpp b/modules/gpu/test/test_video.cpp
index ca9442d69..2d44af1a7 100644
--- a/modules/gpu/test/test_video.cpp
+++ b/modules/gpu/test/test_video.cpp
@@ -405,6 +405,45 @@ TEST_P(OpticalFlowNan, Regression)

 INSTANTIATE_TEST_CASE_P(GPU_Video, OpticalFlowNan, ALL_DEVICES);

+//////////////////////////////////////////////////////
+// OpticalFlowDual_TVL1
+
+PARAM_TEST_CASE(OpticalFlowDual_TVL1, cv::gpu::DeviceInfo, UseRoi)
+{
+};
+
+TEST_P(OpticalFlowDual_TVL1, Accuracy)
+{
+    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    const bool useRoi = GET_PARAM(1);
+
+    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::gpu::OpticalFlowDual_TVL1_GPU d_alg;
+    cv::gpu::GpuMat d_flowx = createMat(frame0.size(), CV_32FC1, useRoi);
+    cv::gpu::GpuMat d_flowy = createMat(frame0.size(), CV_32FC1, useRoi);
+    d_alg(loadMat(frame0, useRoi), loadMat(frame1, useRoi), d_flowx, d_flowy);
+
+    cv::OpticalFlowDual_TVL1 alg;
+    cv::Mat flow;
+    alg(frame0, frame1, flow);
+    cv::Mat gold[2];
+    cv::split(flow, gold);
+
+    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 3e-3);
+    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 3e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Video, OpticalFlowDual_TVL1, testing::Combine(
+    ALL_DEVICES,
+    WHOLE_SUBMAT));
+
 //////////////////////////////////////////////////////
 // FGDStatModel

diff --git a/samples/gpu/tvl1_optical_flow.cpp b/samples/gpu/tvl1_optical_flow.cpp
new file mode 100644
index 000000000..c13afc1c7
--- /dev/null
+++ b/samples/gpu/tvl1_optical_flow.cpp
@@ -0,0 +1,172 @@
+#include <iostream>
+#include <fstream>
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/gpu/gpu.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+
+inline bool isFlowCorrect(Point2f u)
+{
+    return !cvIsNaN(u.x) && !cvIsNaN(u.y) && fabs(u.x) < 1e9 && fabs(u.y) < 1e9;
+}
+
+static Vec3b computeColor(float fx, float fy)
+{
+    static bool first = true;
+
+    // relative lengths of color transitions:
+    // these are chosen based on perceptual similarity
+    // (e.g.
one can distinguish more shades between red and yellow
+    //  than between yellow and green)
+    const int RY = 15;
+    const int YG = 6;
+    const int GC = 4;
+    const int CB = 11;
+    const int BM = 13;
+    const int MR = 6;
+    const int NCOLS = RY + YG + GC + CB + BM + MR;
+    static Vec3i colorWheel[NCOLS];
+
+    if (first)
+    {
+        int k = 0;
+
+        for (int i = 0; i < RY; ++i, ++k)
+            colorWheel[k] = Vec3i(255, 255 * i / RY, 0);
+
+        for (int i = 0; i < YG; ++i, ++k)
+            colorWheel[k] = Vec3i(255 - 255 * i / YG, 255, 0);
+
+        for (int i = 0; i < GC; ++i, ++k)
+            colorWheel[k] = Vec3i(0, 255, 255 * i / GC);
+
+        for (int i = 0; i < CB; ++i, ++k)
+            colorWheel[k] = Vec3i(0, 255 - 255 * i / CB, 255);
+
+        for (int i = 0; i < BM; ++i, ++k)
+            colorWheel[k] = Vec3i(255 * i / BM, 0, 255);
+
+        for (int i = 0; i < MR; ++i, ++k)
+            colorWheel[k] = Vec3i(255, 0, 255 - 255 * i / MR);
+
+        first = false;
+    }
+
+    const float rad = sqrt(fx * fx + fy * fy);
+    const float a = atan2(-fy, -fx) / CV_PI;
+
+    const float fk = (a + 1.0f) / 2.0f * (NCOLS - 1);
+    const int k0 = static_cast<int>(fk);
+    const int k1 = (k0 + 1) % NCOLS;
+    const float f = fk - k0;
+
+    Vec3b pix;
+
+    for (int b = 0; b < 3; b++)
+    {
+        const float col0 = colorWheel[k0][b] / 255.0;
+        const float col1 = colorWheel[k1][b] / 255.0;
+
+        float col = (1 - f) * col0 + f * col1;
+
+        if (rad <= 1)
+            col = 1 - rad * (1 - col); // increase saturation with radius
+        else
+            col *= .75; // out of range
+
+        pix[2 - b] = static_cast<uchar>(255.0 * col);
+    }
+
+    return pix;
+}
+
+static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy, Mat& dst, float maxmotion = -1)
+{
+    dst.create(flowx.size(), CV_8UC3);
+    dst.setTo(Scalar::all(0));
+
+    // determine motion range:
+    float maxrad = maxmotion;
+
+    if (maxmotion <= 0)
+    {
+        maxrad = 1;
+        for (int y = 0; y < flowx.rows; ++y)
+        {
+            for (int x = 0; x < flowx.cols; ++x)
+            {
+                Point2f u(flowx(y, x), flowy(y, x));
+
+                if (!isFlowCorrect(u))
+                    continue;
+
+                maxrad = max(maxrad, sqrt(u.x * u.x + u.y * u.y));
+            }
+        }
+    }
+
+    for (int y = 0; y < flowx.rows; ++y)
+    {
+        for (int x = 0; x < flowx.cols; ++x)
+        {
+            Point2f u(flowx(y, x), flowy(y, x));
+
+            if (isFlowCorrect(u))
+                dst.at<Vec3b>(y, x) = computeColor(u.x / maxrad, u.y / maxrad);
+        }
+    }
+}
+
+int main(int argc, const char* argv[])
+{
+    if (argc < 3)
+    {
+        cerr << "Usage : " << argv[0] << " <frame0> <frame1>" << endl;
+        return -1;
+    }
+
+    Mat frame0 = imread(argv[1], IMREAD_GRAYSCALE);
+    Mat frame1 = imread(argv[2], IMREAD_GRAYSCALE);
+
+    if (frame0.empty())
+    {
+        cerr << "Can't open image [" << argv[1] << "]" << endl;
+        return -1;
+    }
+    if (frame1.empty())
+    {
+        cerr << "Can't open image [" << argv[2] << "]" << endl;
+        return -1;
+    }
+
+    if (frame1.size() != frame0.size())
+    {
+        cerr << "Images should be of equal sizes" << endl;
+        return -1;
+    }
+
+    GpuMat d_frame0(frame0);
+    GpuMat d_frame1(frame1);
+
+    GpuMat d_flowx, d_flowy;
+    OpticalFlowDual_TVL1_GPU tvl1;
+
+    const double start = getTickCount();
+    tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
+    const double timeSec = (getTickCount() - start) / getTickFrequency();
+    cout << "Time : " << timeSec << " sec" << endl;
+
+    Mat flowx(d_flowx);
+    Mat flowy(d_flowy);
+    Mat out;
+    drawOpticalFlow(flowx, flowy, out);
+
+    imshow("Flow", out);
+    waitKey();
+
+    return 0;
+}
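The computeColor/drawOpticalFlow pair above follows the Middlebury color wheel convention. A more compact alternative visualization (a sketch, not part of the patch; it additionally requires opencv2/imgproc) maps flow angle to hue and magnitude to value via an HSV image:

    #include "opencv2/imgproc/imgproc.hpp"

    static void drawOpticalFlowHSV(const cv::Mat& flowx, const cv::Mat& flowy, cv::Mat& dst)
    {
        cv::Mat mag, ang;
        cv::cartToPolar(flowx, flowy, mag, ang, true); // angle in degrees, [0, 360)

        double maxMag;
        cv::minMaxLoc(mag, 0, &maxMag);
        if (maxMag < 1e-9) maxMag = 1.0;               // avoid division by zero on still frames

        // For floating point images cvtColor expects H in [0, 360] and S, V in [0, 1].
        cv::Mat ones = cv::Mat::ones(flowx.size(), CV_32FC1);
        cv::Mat val  = mag / maxMag;
        cv::Mat hsvPlanes[] = { ang, ones, val };

        cv::Mat hsv, bgr;
        cv::merge(hsvPlanes, 3, hsv);
        cv::cvtColor(hsv, bgr, CV_HSV2BGR);
        bgr.convertTo(dst, CV_8UC3, 255.0);
    }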
From bff818afbd3feca29b8b4ef10b5be68aa5ada7f1 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Thu, 8 Nov 2012 15:49:56 +0400
Subject: [PATCH 002/155] fixed static build of gpu module with ffmpeg:
 multiple definition of cap_ffmpeg_impl.hpp functions in gpu and highgui

---
 modules/gpu/src/ffmpeg_video_source.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/gpu/src/ffmpeg_video_source.cpp b/modules/gpu/src/ffmpeg_video_source.cpp
index a0f420507..7538de601 100644
--- a/modules/gpu/src/ffmpeg_video_source.cpp
+++ b/modules/gpu/src/ffmpeg_video_source.cpp
@@ -44,7 +44,7 @@

 #if defined(HAVE_CUDA) && !defined(__APPLE__)

-#ifdef HAVE_FFMPEG
+#if defined(HAVE_FFMPEG) && defined(BUILD_SHARED_LIBS)
     #include "cap_ffmpeg_impl.hpp"
 #else
     #include "cap_ffmpeg_api.hpp"

From d65b3e06170e906a68d5f370501facf5ffaacd30 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Wed, 5 Sep 2012 10:48:07 +0400
Subject: [PATCH 003/155] fix warning in CUDA samples

---
 modules/gpu/src/cuda/bf_knnmatch.cu          |  2 +-
 modules/gpu/src/nvidia/core/NCV.hpp          |  2 +-
 samples/gpu/cascadeclassifier_nvidia_api.cpp |  8 ++++----
 samples/gpu/opticalflow_nvidia_api.cpp       | 16 ++++++++--------
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu
index 6a778735b..b31f25ca8 100644
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -1034,7 +1034,7 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );
         }

-        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
+        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int /*cc*/, cudaStream_t stream)
         {
             findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
         }

diff --git a/modules/gpu/src/nvidia/core/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp
index ddac47c92..26b1d4ef1 100644
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -288,7 +288,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
     do \
     { \
         cudaError_t res = cudacall; \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
     } while (0)

diff --git a/samples/gpu/cascadeclassifier_nvidia_api.cpp b/samples/gpu/cascadeclassifier_nvidia_api.cpp
index da98643af..99c95ab97 100644
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -30,7 +30,7 @@ const Size2i preferredVideoFrameSize(640, 480);
 const string wndTitle = "NVIDIA Computer Vision :: Haar Classifiers Cascade";

-void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
+static void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 {
     int fontFace = FONT_HERSHEY_DUPLEX;
     double fontScale = 0.8;
@@ -45,7 +45,7 @@ void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 }

-void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
+static void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
 {
     Scalar fontColorRed = CV_RGB(255,0,0);
     Scalar fontColorNV  = CV_RGB(118,185,0);
@@ -74,7 +74,7 @@ void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bF
 }

-NCVStatus process(Mat *srcdst,
+static NCVStatus process(Mat *srcdst,
                   Ncv32u width, Ncv32u height,
                   NcvBool bFilterRects, NcvBool bLargestFace,
                   HaarClassifierCascadeDescriptor &haar,
@@ -281,7 +281,7 @@ int main(int argc, const char** argv)
 //==============================================================================

     namedWindow(wndTitle, 1);
-    Mat gray, frameDisp;
+    Mat frameDisp;

     do
     {

diff --git a/samples/gpu/opticalflow_nvidia_api.cpp b/samples/gpu/opticalflow_nvidia_api.cpp
index 8a149d740..05a37ef69 100644
--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
@@ -59,7 +59,7 @@ public:
 class RgbToR
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char /*g*/, unsigned char r)
     {
         return static_cast<float>(r)/255.0f;
     }
@@ -69,7 +69,7 @@ public:
 class RgbToG
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char g, unsigned char /*r*/)
     {
         return static_cast<float>(g)/255.0f;
     }
@@ -78,7 +78,7 @@ public:
 class RgbToB
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char b, unsigned char /*g*/, unsigned char /*r*/)
     {
         return static_cast<float>(b)/255.0f;
     }
@@ -135,7 +135,7 @@ NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc<Ncv32f> &dst)
     return NCV_SUCCESS;
 }

-NCVStatus LoadImages (const char *frame0Name,
+static NCVStatus LoadImages (const char *frame0Name,
                       const char *frame1Name,
                       int &width,
                       int &height,
@@ -186,7 +186,7 @@ inline T MapValue (T x, T a, T b, T c, T d)
     return c + (d - c) * (x - a) / (b - a);
 }

-NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
+static NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
 {
     IplImage *flowField;
@@ -246,7 +246,7 @@ NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const
     return NCV_SUCCESS;
 }

-IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
+static IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
 {
     CvSize imageSize = cvSize (h_r.width (), h_r.height ());
     IplImage *image = cvCreateImage (imageSize, IPL_DEPTH_8U, 4);
@@ -270,7 +270,7 @@ IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g,
     return image;
 }

-void PrintHelp ()
+static void PrintHelp ()
 {
     std::cout << "Usage help:\n";
     std::cout << std::setiosflags(std::ios::left);
@@ -286,7 +286,7 @@ void PrintHelp ()
     std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n";
 }

-int ProcessCommandLine(int argc, char **argv,
+static int ProcessCommandLine(int argc, char **argv,
                        Ncv32f &timeStep,
                        char *&frame0Name,
                        char *&frame1Name,

From dd9c53497bb8c32d411d8b5e784b249ad9d13364 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Thu, 20 Sep 2012 03:28:45 +0400
Subject: [PATCH 004/155] GPU interface for soft cascade

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  43 ++++++++++
 modules/gpu/src/softcascade.cpp         | 100 ++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 modules/gpu/src/softcascade.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index ddb131788..4a2d88aa0 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1532,6 +1532,49 @@ public:
     int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 };

+// ======================== GPU version for soft cascade ===================== //
+
+class CV_EXPORTS SoftCascade
+{
+public:
+    //! An empty cascade will be created.
+    SoftCascade();
+
+    //! Cascade will be created from file for scales from minScale to maxScale.
+    //! Param filename is a path to xml-serialized cascade.
+    //! Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
+    //! Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
+    SoftCascade( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+
+    //! Cascade will be loaded from the file "filename". The previous cascade will be destroyed.
+    //! Param filename is a path to xml-serialized cascade.
+    //! Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
+    //! Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
+    bool load( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+
+    virtual ~SoftCascade();
+
+    //! return vector of bounding boxes. Each box contains one detected object
+    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
+                                  int rejectfactor = 1, Stream stream = Stream::Null()); // ToDo store objects in GPU mem
+
+protected:
+    enum { BOOST = 0 };
+    enum
+    {
+        FRAME_WIDTH        = 640,
+        FRAME_HEIGHT       = 480,
+        TOTAL_SCALES       = 55,
+        CLASSIFIERS        = 5,
+        ORIG_OBJECT_WIDTH  = 64,
+        ORIG_OBJECT_HEIGHT = 128
+    };
+
+private:
+    struct Filds;
+    Filds* filds;
+};
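A usage sketch of this interface (hypothetical file names; the cascade XML must match the serialization format checked in the loader below):

    cv::gpu::SoftCascade cascade;
    if (!cascade.load("soft_cascade.xml"))            // placeholder path
        return;

    cv::Mat frame = cv::imread("pedestrians.png");    // placeholder 640x480 BGR frame
    cv::gpu::GpuMat d_frame(frame), rois, objects;

    cascade.detectMultiScale(d_frame, rois, objects); // detections collected into 'objects'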
 ////////////////////////////////// SURF //////////////////////////////////////////

 class CV_EXPORTS SURF_GPU

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
new file mode 100644
index 000000000..509e3f501
--- /dev/null
+++ b/modules/gpu/src/softcascade.cpp
@@ -0,0 +1,100 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <precomp.hpp>
+
+#if !defined (HAVE_CUDA)
+
+cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
+
+cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
+
+cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
+
+bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); }
+
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu(); }
+
+#else
+
+struct cv::gpu::SoftCascade::Filds
+{
+    bool fill(const FileNode &root, const float mins, const float maxs){return true;}
+    void calcLevels(int frameW, int frameH, int scales) {}
+};
+
+cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
+
+cv::gpu::SoftCascade::SoftCascade( const string& filename, const float minScale, const float maxScale) : filds(0)
+{
+    load(filename, minScale, maxScale);
+}
+
+cv::gpu::SoftCascade::~SoftCascade()
+{
+    delete filds;
+}
+
+bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, const float maxScale)
+{
+    if (filds)
+        delete filds;
+    filds = 0;
+
+    cv::FileStorage fs(filename, FileStorage::READ);
+    if (!fs.isOpened()) return false;
+
+    filds = new Filds;
+    Filds& flds = *filds;
+    if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
+    flds.calcLevels(FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
+
+    return true;
+}
+
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& /*image*/, const GpuMat& /*rois*/,
+                                            GpuMat& /*objects*/, const int /*rejectfactor*/, Stream /*stream*/)
+{
+    // empty
+}
+
+#endif

From 267d140bfeb43b1c47734e554c37c0b1db81787f Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Thu, 20 Sep 2012 16:22:10 +0400
Subject: [PATCH 005/155] soft cascade: gpu representation

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |   9 +-
 modules/gpu/src/cuda/isf-sc.cu          |  43 +++++
 modules/gpu/src/icf.hpp                 | 118 ++++++++++
 modules/gpu/src/softcascade.cpp         | 236 +++++++++++++++++++++-
 modules/gpu/test/test_softcascade.cpp   |  73 ++++++++
 5 files changed, 473 insertions(+), 6 deletions(-)
 create mode 100644 modules/gpu/src/cuda/isf-sc.cu
 create mode 100644 modules/gpu/src/icf.hpp
 create mode 100644 modules/gpu/test/test_softcascade.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 4a2d88aa0..61f6006c5 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1554,9 +1554,14 @@ public:

     virtual ~SoftCascade();

-    //! return vector of bounding boxes. Each box contains one detected object
+    //! detect specific objects in the input frame for all scales computed from the minScale and maxScale values
+    //! Param image is input frame for detector. Cascade will be applied to it.
+    //! Param rois is a mask
+    //! Param objects is a 4-channel matrix that contains detected rectangles
+    //! Param rejectfactor is used for final object box computing
+    //!
 Param stream
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-                                  int rejectfactor = 1, Stream stream = Stream::Null()); // ToDo store objects in GPU mem
+                                  int rejectfactor = 1, Stream stream = Stream::Null());

 protected:
     enum { BOOST = 0 };

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
new file mode 100644
index 000000000..f36f86f96
--- /dev/null
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <opencv2/gpu/device/common.hpp>
\ No newline at end of file

diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
new file mode 100644
index 000000000..110890232
--- /dev/null
+++ b/modules/gpu/src/icf.hpp
@@ -0,0 +1,118 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_ICF_HPP__
+#define __OPENCV_ICF_HPP__
+
+#if defined __CUDACC__
+# define __hd__ __host__ __device__ __forceinline__
+#else
+# define __hd__
+#endif
+
+
+namespace icf {
+
+    struct Cascade
+    {
+
+    };
+
+    struct ChannelStorage
+    {
+
+    };
+
+    struct __align__(16) Octave
+    {
+        ushort index;
+        ushort stages;
+        ushort shrinkage;
+        ushort2 size;
+        float scale;
+
+        Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
+        : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
+    };
+
+    struct __align__(8) Node
+    {
+        int feature;
+        float threshold;
+
+        Node(const int f, const float t) : feature(f), threshold(t) {}
+    };
+
+    struct __align__(8) Feature
+    {
+        int channel;
+        uchar4 rect;
+
+        Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
+    };
+
+    struct __align__(8) Level //is actually 24 bytes
+    {
+        int octave;
+
+        // float origScale; //not actually used
+        float relScale;
+        float shrScale;   // used for marking detection
+        float scaling[2]; // calculated according to Dollár's paper
+
+        // for 640x480 we can not get overflow
+        uchar2 workRect;
+        uchar2 objSize;
+
+        Level(int idx, const Octave& oct, const float scale, const int w, const int h)
+        :  octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
+        {
+            workRect.x = round(w / (float)oct.shrinkage);
+            workRect.y = round(h / (float)oct.shrinkage);
+
+            objSize.x = round(oct.size.x * relScale);
+            objSize.y = round(oct.size.y * relScale);
+        }
+    };
+}
+
+#endif
\ No newline at end of file
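Since these PODs are shared between host and device translation units, both sides must agree on their layout. A compile-time sanity check (a sketch, not part of the patch; it assumes the CUDA vector types ushort2/uchar2/uchar4 are visible) could pin down the sizes the comments assume:

    // C++03-style static assertion: instantiating StaticAssert<false> fails to compile.
    template <bool> struct StaticAssert;
    template <>     struct StaticAssert<true> {};

    namespace
    {
        StaticAssert<sizeof(icf::Octave)  == 16> checkOctave;  // 14 payload bytes padded by __align__(16)
        StaticAssert<sizeof(icf::Node)    ==  8> checkNode;
        StaticAssert<sizeof(icf::Feature) ==  8> checkFeature;
        StaticAssert<sizeof(icf::Level)   == 24> checkLevel;   // "is actually 24 bytes", as noted above
    }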
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 509e3f501..04b68539c 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -56,12 +56,242 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat

 #else

+#include <icf.hpp>
+
 struct cv::gpu::SoftCascade::Filds
 {
+    // scales range
+    float minScale;
+    float maxScale;
+
+    int origObjWidth;
+    int origObjHeight;
+
+    GpuMat octaves;
+    GpuMat stages;
+    GpuMat nodes;
+    GpuMat leaves;
+    GpuMat features;
+
+    std::vector<float> scales;
+
+    icf::Cascade cascade;
+
+    bool fill(const FileNode &root, const float mins, const float maxs);
+
+private:
+    void calcLevels(const std::vector<icf::Octave>& octs,
+                    int frameW, int frameH, int nscales);
+
+    typedef std::vector<icf::Octave>::const_iterator octIt_t;
+    int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor)
+    {
+        float minAbsLog = FLT_MAX;
+        int res = 0;
+        for (int oct = 0; oct < (int)octs.size(); ++oct)
+        {
+            const icf::Octave& octave = octs[oct];
+            float logOctave = ::log(octave.scale);
+            float logAbsScale = ::fabs(logFactor - logOctave);
+
+            if(logAbsScale < minAbsLog)
+            {
+                res = oct;
+                minAbsLog = logAbsScale;
+            }
+        }
+        return res;
+    }
 };

+inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
+{
+    minScale = mins;
+    maxScale = maxs;
+
+    // cascade properties
+    static const char *const SC_STAGE_TYPE      = "stageType";
+    static const char *const SC_BOOST           = "BOOST";
+
+    static const char *const SC_FEATURE_TYPE    = "featureType";
+    static const char *const SC_ICF             = "ICF";
+
+    static const char *const SC_ORIG_W          = "width";
+    static const char *const SC_ORIG_H          = "height";
+
+    static const char *const SC_OCTAVES         = "octaves";
+    static const char *const SC_STAGES          = "stages";
+    static const char *const SC_FEATURES        = "features";
+
+    static const char *const SC_WEEK            = "weakClassifiers";
+    static const char *const SC_INTERNAL        = "internalNodes";
+    static const char *const SC_LEAF            = "leafValues";
+
+    static const char *const SC_OCT_SCALE       = "scale";
+    static const char *const SC_OCT_STAGES      = "stageNum";
+    static const char *const SC_OCT_SHRINKAGE   = "shrinkingFactor";
+
+    static const char *const SC_STAGE_THRESHOLD = "stageThreshold";
+
+    static const char * const SC_F_CHANNEL      = "channel";
+    static const char * const SC_F_RECT         = "rect";
+
+    // only Ada Boost supported
+    std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
+    CV_Assert(stageTypeStr == SC_BOOST);
+
+    // only HOG-like integral channel features supported
+    string featureTypeStr = (string)root[SC_FEATURE_TYPE];
+    CV_Assert(featureTypeStr == SC_ICF);
+
+    origObjWidth = (int)root[SC_ORIG_W];
+    CV_Assert(origObjWidth == SoftCascade::ORIG_OBJECT_WIDTH);
+
+    origObjHeight = (int)root[SC_ORIG_H];
+    CV_Assert(origObjHeight == SoftCascade::ORIG_OBJECT_HEIGHT);
+
+    FileNode fn = root[SC_OCTAVES];
+    if (fn.empty()) return false;
+
+    std::vector<icf::Octave>  voctaves;
+    std::vector<float>        vstages;
+    std::vector<icf::Node>    vnodes;
+    std::vector<float>        vleaves;
+    std::vector<icf::Feature> vfeatures;
+    scales.clear();
+
+    // std::vector<Level> levels;
+
+    FileNodeIterator it = fn.begin(), it_end = fn.end();
+    int feature_offset = 0;
+    ushort octIndex = 0;
+
+    for (; it != it_end; ++it)
+    {
+        FileNode fns = *it;
+        float scale = (float)fns[SC_OCT_SCALE];
+        scales.push_back(scale);
+        ushort nstages = saturate_cast<ushort>((int)fn[SC_OCT_STAGES]);
+        ushort2 size;
+        size.x = cvRound(SoftCascade::ORIG_OBJECT_WIDTH * scale);
+        size.y = cvRound(SoftCascade::ORIG_OBJECT_HEIGHT * scale);
+        ushort shrinkage = saturate_cast<ushort>((int)fn[SC_OCT_SHRINKAGE]);
+
+        icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
+        CV_Assert(octave.stages > 0);
+        voctaves.push_back(octave);
+
+        FileNode ffs = fns[SC_FEATURES];
+        if (ffs.empty()) return false;
+
+        fns = fns[SC_STAGES];
+        if (fn.empty()) return false;
+
+        // for each stage (~ decision tree with H = 2)
+        FileNodeIterator st = fns.begin(), st_end = fns.end();
+        for (; st != st_end; ++st )
+        {
+            fns = *st;
+            vstages.push_back((float)fn[SC_STAGE_THRESHOLD]);
+
+            fns = fns[SC_WEEK];
+            FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
+            for (; ftr != ft_end; ++ftr)
+            {
+                fns = (*ftr)[SC_INTERNAL];
+                FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
+                for (; inIt != inIt_end;)
+                {
+                    int feature = (int)(*(inIt +=2)++) + feature_offset;
+                    vnodes.push_back(icf::Node(feature, (float)(*(inIt++))));
+                }
+
+                fns = (*ftr)[SC_LEAF];
+                inIt = fns.begin(), inIt_end = fns.end();
+                for (; inIt != inIt_end; ++inIt)
+                    vleaves.push_back((float)(*inIt));
+            }
+        }
+
+        st = ffs.begin(), st_end = ffs.end();
+        for (; st != st_end; ++st )
+        {
+            cv::FileNode rn = (*st)[SC_F_RECT];
+            cv::FileNodeIterator r_it = rn.begin();
+            uchar4 rect;
+            rect.x = saturate_cast<uchar>((int)*(r_it++));
+            rect.y = saturate_cast<uchar>((int)*(r_it++));
+            rect.z = saturate_cast<uchar>((int)*(r_it++));
+            rect.w = saturate_cast<uchar>((int)*(r_it++));
+            vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect));
+        }
+
+        feature_offset += octave.stages * 3;
+        ++octIndex;
+    }
+
+    // upload in gpu memory
+    octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
+    CV_Assert(!octaves.empty());
+
+    stages.upload(cv::Mat(vstages).reshape(1,1));
+    CV_Assert(!stages.empty());
+
+    nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
+    CV_Assert(!nodes.empty());
+
+    leaves.upload(cv::Mat(vleaves).reshape(1,1));
+    CV_Assert(!leaves.empty());
+
+    features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
+    CV_Assert(!features.empty());
+
+    // compute levels
+    calcLevels(voctaves, (int)SoftCascade::FRAME_WIDTH, (int)SoftCascade::FRAME_HEIGHT, (int)SoftCascade::TOTAL_SCALES);
+
+    return true;
+}
+
+inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
+                                                    int frameW, int frameH, int nscales)
+{
+    CV_Assert(nscales > 1);
+
+    std::vector<icf::Level> levels;
+    float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
+
+    float scale = minScale;
+    for (int sc = 0; sc < nscales; ++sc)
+    {
+        int width  = ::std::max(0.0f, frameW - (origObjWidth  * scale));
+        int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
+
+        float logScale = ::log(scale);
+        int fit = fitOctave(octs, logScale);
+
+        icf::Level level(fit, octs[fit], scale, width, height);
+
+        if (!width || !height)
+            break;
+        else
+            levels.push_back(level);
+
+        if (::fabs(scale - maxScale) < FLT_EPSILON) break;
+        scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
+
+        // std::cout << "level " << sc << " scale "
+        //           << levels[sc].origScale
+        //           << " octeve "
+        //           << levels[sc].octave->scale
+        //           << " "
+        //           << levels[sc].relScale
+        //           << " " << levels[sc].shrScale
+        //           << " [" << levels[sc].objSize.width
+        //           << " "  << levels[sc].objSize.height << "] ["
+        //           << levels[sc].workRect.width << " " << levels[sc].workRect.height << "]" << std::endl;
+    }
+}
+
 cv::gpu::SoftCascade::SoftCascade() : filds(0) {}

 cv::gpu::SoftCascade::SoftCascade( const string& filename, const float minScale, const float maxScale) : filds(0)
@@ -86,8 +316,6 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     filds = new Filds;
     Filds& flds = *filds;
     if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
-    flds.calcLevels(FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
-
     return true;
 }

diff --git a/modules/gpu/test/test_softcascade.cpp 
b/modules/gpu/test/test_softcascade.cpp new file mode 100644 index 000000000..821a2b140 --- /dev/null +++ b/modules/gpu/test/test_softcascade.cpp @@ -0,0 +1,73 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// [standard OpenCV BSD-style license header, identical to the one reproduced at the top of this patch series]
+// +//M*/ + +#include + +#ifdef HAVE_CUDA + +using cv::gpu::GpuMat; + +TEST(SoftCascade, readCascade) +{ + std::string xml = cvtest::TS::ptr()->get_data_path() + "cascadeandhog/icf-template.xml"; + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(xml)); + +} + +TEST(SoftCascade, detect) +{ + std::string xml = cvtest::TS::ptr()->get_data_path() + "cascadeandhog/sc_cvpr_2012_to_opencv.xml"; + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(xml)); + + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "cascadeandhog/bahnhof/image_00000000_0.png"); + ASSERT_FALSE(coloredCpu.empty()); + GpuMat colored(coloredCpu), objectBoxes, rois; + + // ASSERT_NO_THROW( + // { + cascade.detectMultiScale(colored, rois, objectBoxes); + // }); +} + +#endif \ No newline at end of file From 2b7ce8b16031124d4ebd261fd5b70e5fbc5f6d5a Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 20 Sep 2012 16:44:38 +0400 Subject: [PATCH 006/155] precompute feature response for scaling factor --- modules/gpu/src/softcascade.cpp | 36 +++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 04b68539c..18306e04d 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -252,6 +252,40 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float return true; } +namespace { + struct CascadeIntrinsics + { + static const float lambda = 1.099f, a = 0.89f; + + static float getFor(int channel, float scaling) + { + CV_Assert(channel < 10); + + if (fabs(scaling - 1.f) < FLT_EPSILON) + return 1.f; + + // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers + static const float A[2][2] = + { //channel <= 6, otherwise + { 0.89f, 1.f}, // down + { 1.00f, 1.f} // up + }; + + static const float B[2][2] = + { //channel <= 6, otherwise + { 1.099f / log(2), 2.f}, // down + { 0.f, 2.f} // up + }; + + float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; + float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; + + printf("!!! 
scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); + return a * pow(scaling, b); + } + }; +} + inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector& octs, int frameW, int frameH, int nscales) { @@ -270,6 +304,8 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector Date: Thu, 20 Sep 2012 18:35:48 +0400 Subject: [PATCH 007/155] GPU soft cascade: buffers preallocation --- modules/gpu/include/opencv2/gpu/gpu.hpp | 12 ----- modules/gpu/src/icf.hpp | 16 +++++- modules/gpu/src/softcascade.cpp | 68 +++++++++++++++++++++---- 3 files changed, 72 insertions(+), 24 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 61f6006c5..5008e1027 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1563,18 +1563,6 @@ public: virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects, int rejectfactor = 1, Stream stream = Stream::Null()); -protected: - enum { BOOST = 0 }; - enum - { - FRAME_WIDTH = 640, - FRAME_HEIGHT = 480, - TOTAL_SCALES = 55, - CLASSIFIERS = 5, - ORIG_OBJECT_WIDTH = 64, - ORIG_OBJECT_HEIGHT = 128 - }; - private: struct Filds; Filds* filds; diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 110890232..49919a79c 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -40,6 +40,8 @@ // //M*/ +#include + #ifndef __OPENCV_ICF_HPP__ #define __OPENCV_ICF_HPP__ @@ -54,12 +56,24 @@ namespace icf { struct Cascade { + Cascade() {} + Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds, + const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) + : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} + + cv::gpu::PtrStepSzb octaves; + cv::gpu::PtrStepSzf stages; + cv::gpu::PtrStepSzb nodes; + cv::gpu::PtrStepSzf leaves; + cv::gpu::PtrStepSzb features; + + cv::gpu::PtrStepSzb levels; }; struct ChannelStorage { - + ChannelStorage(const cv::gpu::PtrStepSzb& /*f*/, const int /*shrinkage*/) {} }; struct __align__(16) Octave diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 18306e04d..8ef1da457 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -72,19 +72,41 @@ struct cv::gpu::SoftCascade::Filds GpuMat nodes; GpuMat leaves; GpuMat features; + GpuMat levels; + + // preallocated buffer 640x480x10 + GpuMat dmem; + // 160x120x10 + GpuMat shrunk; + // 161x121x10 + GpuMat hogluv; std::vector scales; icf::Cascade cascade; bool fill(const FileNode &root, const float mins, const float maxs); + void detect(const icf::ChannelStorage& /*channels*/) const {} + + enum { BOOST = 0 }; + enum + { + FRAME_WIDTH = 640, + FRAME_HEIGHT = 480, + TOTAL_SCALES = 55, + CLASSIFIERS = 5, + ORIG_OBJECT_WIDTH = 64, + ORIG_OBJECT_HEIGHT = 128, + HOG_BINS = 6, + HOG_LUV_BINS = 10 + }; private: void calcLevels(const std::vector& octs, int frameW, int frameH, int nscales); typedef std::vector::const_iterator octIt_t; - int fitOctave(const std::vector& octs, const float& logFactor) + int fitOctave(const std::vector& octs, const float& logFactor) const { float minAbsLog = FLT_MAX; int res = 0; @@ -145,10 +167,10 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float CV_Assert(featureTypeStr == SC_ICF); origObjWidth = (int)root[SC_ORIG_W]; - CV_Assert(origObjWidth == SoftCascade::ORIG_OBJECT_WIDTH); + 
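
PATCH 006's CascadeIntrinsics implements the feature-response scaling law r(s) = a * s^b from the Benenson et al. and Dollár line of work: rather than recomputing channel features at every scale, a response computed at one scale is corrected analytically. A compact standalone restatement; the constants are copied from the patch, the sample values in main() are illustrative:

#include <cmath>
#include <cstdio>

// r(s) = a * s^b; rows select down/up-scaling, columns HOG-like (<= 6) vs LUV channels.
static float responseScale(int channel, float s)
{
    if (std::fabs(s - 1.f) < 1e-6f) return 1.f;
    static const float A[2][2] = { { 0.89f, 1.f }, { 1.00f, 1.f } };
    static const float B[2][2] = { { 1.099f / std::log(2.f), 2.f }, { 0.f, 2.f } };
    const float a = A[s >= 1.f][channel > 6];
    const float b = B[s >= 1.f][channel > 6];
    return a * std::pow(s, b);
}

int main()
{
    std::printf("HOG, downscale x0.5: %.3f\n", responseScale(0, 0.5f)); // ~0.297
    std::printf("LUV, upscale   x2.0: %.3f\n", responseScale(8, 2.0f)); // 4.000
    return 0;
}
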
CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH); origObjHeight = (int)root[SC_ORIG_H]; - CV_Assert(origObjHeight == SoftCascade::ORIG_OBJECT_HEIGHT); + CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT); FileNode fn = root[SC_OCTAVES]; if (fn.empty()) return false; @@ -165,6 +187,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float FileNodeIterator it = fn.begin(), it_end = fn.end(); int feature_offset = 0; ushort octIndex = 0; + ushort shrinkage = 1; for (; it != it_end; ++it) { @@ -173,9 +196,9 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float scales.push_back(scale); ushort nstages = saturate_cast((int)fn[SC_OCT_STAGES]); ushort2 size; - size.x = cvRound(SoftCascade::ORIG_OBJECT_WIDTH * scale); - size.y = cvRound(SoftCascade::ORIG_OBJECT_HEIGHT * scale); - ushort shrinkage = saturate_cast((int)fn[SC_OCT_SHRINKAGE]); + size.x = cvRound(ORIG_OBJECT_WIDTH * scale); + size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); + shrinkage = saturate_cast((int)fn[SC_OCT_SHRINKAGE]); icf::Octave octave(octIndex, nstages, shrinkage, size, scale); CV_Assert(octave.stages > 0); @@ -247,7 +270,16 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float CV_Assert(!features.empty()); // compute levels - calcLevels(voctaves, (int)SoftCascade::FRAME_WIDTH, (int)SoftCascade::FRAME_HEIGHT, (int)SoftCascade::TOTAL_SCALES); + calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); + CV_Assert(!levels.empty()); + + // init Cascade + cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels); + + // allocate buffers + dmem.create(FRAME_HEIGHT * HOG_LUV_BINS, FRAME_WIDTH, CV_8UC1); + shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); + hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1); return true; } @@ -291,7 +323,7 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector 1); - std::vector levels; + std::vector vlevels; float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1); float scale = minScale; @@ -310,11 +342,13 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector Date: Thu, 20 Sep 2012 18:51:41 +0400 Subject: [PATCH 008/155] integral channel storage are cached as a cascade's field --- modules/gpu/src/icf.hpp | 156 +++++++++++++++++--------------- modules/gpu/src/softcascade.cpp | 13 ++- 2 files changed, 91 insertions(+), 78 deletions(-) diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 49919a79c..8cc4395c3 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -54,79 +54,93 @@ namespace icf { - struct Cascade +using cv::gpu::PtrStepSzb; +using cv::gpu::PtrStepSzf; + +struct Cascade +{ + Cascade() {} + Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds, + const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) + : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} + + PtrStepSzb octaves; + PtrStepSzf stages; + PtrStepSzb nodes; + PtrStepSzf leaves; + PtrStepSzb features; + + PtrStepSzb levels; + +}; + +struct ChannelStorage +{ + ChannelStorage(){} + ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr, + const cv::gpu::PtrStepSzb& itg, const int s) + : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {} + + void frame(const cv::gpu::PtrStepSzb& image) {} + + PtrStepSzb dmem; + PtrStepSzb 
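
The preallocation above is easier to audit with the constants substituted. A sketch of the geometry the create() calls imply, assuming the 640x480 frame, 10 HOG+LUV channels and shrinkage 4 used throughout these patches:

// Channel planes are stacked vertically in one matrix, one plane per channel.
enum { W = 640, H = 480, BINS = 10, SHR = 4 };

static const int dmemRows   = H * BINS;              // 4800 rows x 640 cols, CV_8UC1
static const int shrunkRows = (H / SHR) * BINS;      // 1200 rows x 160 cols after 4x4 shrink
static const int sumRows    = (H / SHR) * BINS + 1;  // integral images carry a +1 border
static const int sumCols    = (W / SHR) + 1;         // 161 cols
// Note: PATCH 017 below switches hogluv to one +1 border per plane, i.e.
// (H / SHR + 1) * BINS rows, which is what per-plane integral images need.
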
shrunk; + PtrStepSzb hogluv; + + int shrinkage; +}; + +struct __align__(16) Octave +{ + ushort index; + ushort stages; + ushort shrinkage; + ushort2 size; + float scale; + + Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) + : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} +}; + +struct __align__(8) Node +{ + int feature; + float threshold; + + Node(const int f, const float t) : feature(f), threshold(t) {} +}; + +struct __align__(8) Feature +{ + int channel; + uchar4 rect; + + Feature(const int c, const uchar4 r) : channel(c), rect(r) {} +}; + +struct __align__(8) Level //is actually 24 bytes +{ + int octave; + + // float origScale; //not actually used + float relScale; + float shrScale; // used for marking detection + float scaling[2]; // calculated according to Dollal paper + + // for 640x480 we can not get overflow + uchar2 workRect; + uchar2 objSize; + + Level(int idx, const Octave& oct, const float scale, const int w, const int h) + : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) { - Cascade() {} - Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds, - const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) - : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} + workRect.x = round(w / (float)oct.shrinkage); + workRect.y = round(h / (float)oct.shrinkage); - cv::gpu::PtrStepSzb octaves; - cv::gpu::PtrStepSzf stages; - cv::gpu::PtrStepSzb nodes; - cv::gpu::PtrStepSzf leaves; - cv::gpu::PtrStepSzb features; - - cv::gpu::PtrStepSzb levels; - - }; - - struct ChannelStorage - { - ChannelStorage(const cv::gpu::PtrStepSzb& /*f*/, const int /*shrinkage*/) {} - }; - - struct __align__(16) Octave - { - ushort index; - ushort stages; - ushort shrinkage; - ushort2 size; - float scale; - - Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) - : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} - }; - - struct __align__(8) Node - { - int feature; - float threshold; - - Node(const int f, const float t) : feature(f), threshold(t) {} - }; - - struct __align__(8) Feature - { - int channel; - uchar4 rect; - - Feature(const int c, const uchar4 r) : channel(c), rect(r) {} - }; - - struct __align__(8) Level //is actually 24 bytes - { - int octave; - - // float origScale; //not actually used - float relScale; - float shrScale; // used for marking detection - float scaling[2]; // calculated according to Dollal paper - - // for 640x480 we can not get overflow - uchar2 workRect; - uchar2 objSize; - - Level(int idx, const Octave& oct, const float scale, const int w, const int h) - : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) - { - workRect.x = round(w / (float)oct.shrinkage); - workRect.y = round(h / (float)oct.shrinkage); - - objSize.x = round(oct.size.x * relScale); - objSize.y = round(oct.size.y * relScale); - } - }; + objSize.x = round(oct.size.x * relScale); + objSize.y = round(oct.size.y * relScale); + } +}; } #endif \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 8ef1da457..54f37cd17 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -84,9 +84,10 @@ struct cv::gpu::SoftCascade::Filds std::vector scales; icf::Cascade cascade; + icf::ChannelStorage storage; bool fill(const FileNode &root, const float mins, 
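
The Level constructor above packs three derived quantities: relScale (the level's scale relative to its octave's training scale), shrScale (the same expressed on the shrunken grid), and the rounded work/object rectangles. A standalone restatement with hypothetical numbers; the rounding mimics the patch's round() calls, and w/h are the residual sliding-window area computed in calcLevels (frame size minus the scaled object size):

#include <cmath>
#include <cstdio>

int main()
{
    const float octScale = 1.f;  const int octShrinkage = 4;
    const int   octW = 64, octH = 128;            // trained object size
    const float scale = 1.2f;                     // requested level scale
    const int   w = 640 - (int)(octW * scale);    // 563
    const int   h = 480 - (int)(octH * scale);    // 326

    const float relScale = scale / octScale;              // 1.2
    const float shrScale = relScale / (float)octShrinkage;

    std::printf("rel %.2f shr %.2f work %dx%d obj %dx%d\n", relScale, shrScale,
                (int)std::floor(w / (float)octShrinkage + .5f),  // 141
                (int)std::floor(h / (float)octShrinkage + .5f),  // 82
                (int)std::floor(octW * relScale + .5f),          // 77
                (int)std::floor(octH * relScale + .5f));         // 154
    return 0;
}
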
const float maxs); - void detect(const icf::ChannelStorage& /*channels*/) const {} + void detect() const {} enum { BOOST = 0 }; enum @@ -281,6 +282,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1); + storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage); return true; } @@ -398,13 +400,10 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& / // only this window size allowed CV_Assert(image.cols == 640 && image.rows == 480); + Filds& flds = *filds; - // ToDo: add shrincage in whole cascade. - const int shrincage = 4; - icf::ChannelStorage storage(image, shrincage); - - const Filds& flds = *filds; - flds.detect(storage); + flds.storage.frame(image); + flds.detect(); } #endif From a3af5ede8059b29617e400bab631c122422b6316 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 20 Sep 2012 19:35:04 +0400 Subject: [PATCH 009/155] CUDA callers --- modules/gpu/src/cuda/isf-sc.cu | 13 ++++++++++++- modules/gpu/src/icf.hpp | 4 +++- modules/gpu/src/softcascade.cpp | 17 ++++++++++------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index f36f86f96..a6418c1d3 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -40,4 +40,15 @@ // //M*/ -#include \ No newline at end of file +#include + +void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv) const +{ + // detection kernel +} + +void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& image) +{ + // color convertin kernel + // hog calculation kernel +} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 8cc4395c3..7183fc0ac 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -64,6 +64,8 @@ struct Cascade const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} + void detect(const cv::gpu::PtrStepSzb& hogluv) const; + PtrStepSzb octaves; PtrStepSzf stages; PtrStepSzb nodes; @@ -81,7 +83,7 @@ struct ChannelStorage const cv::gpu::PtrStepSzb& itg, const int s) : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {} - void frame(const cv::gpu::PtrStepSzb& image) {} + void frame(const cv::gpu::PtrStepSz& image); PtrStepSzb dmem; PtrStepSzb shrunk; diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 54f37cd17..80473da95 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -74,7 +74,7 @@ struct cv::gpu::SoftCascade::Filds GpuMat features; GpuMat levels; - // preallocated buffer 640x480x10 + // preallocated buffer 640x480x10 + 640x480 GpuMat dmem; // 160x120x10 GpuMat shrunk; @@ -86,9 +86,6 @@ struct cv::gpu::SoftCascade::Filds icf::Cascade cascade; icf::ChannelStorage storage; - bool fill(const FileNode &root, const float mins, const float maxs); - void detect() const {} - enum { BOOST = 0 }; enum { @@ -102,6 +99,12 @@ struct cv::gpu::SoftCascade::Filds HOG_LUV_BINS = 10 }; + bool fill(const FileNode &root, const float mins, const float maxs); + void detect() const + { + cascade.detect(hogluv); + } + private: void calcLevels(const std::vector& octs, int frameW, int frameH, int nscales); @@ -278,7 +281,7 @@ inline bool 
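
With PATCH 009 the host wiring is in place: detectMultiScale extracts the raw cudaStream_t via StreamAccessor, builds the channel storage (storage.frame) and launches detection (cascade.detect) on that stream. A hedged end-user sketch of the API as it stands at this point in the series; the file paths are placeholders, and the 640x480 3-channel restriction is the one asserted in the patch:

#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>

int main()
{
    cv::gpu::SoftCascade cascade;
    if (!cascade.load("sc_cvpr_2012_to_opencv.xml")) return 1; // illustrative path

    cv::Mat frame = cv::imread("frame_640x480.png");           // must be 640x480, CV_8UC3
    cv::gpu::GpuMat colored(frame), rois, objects(1, 1000, CV_8UC1);

    cv::gpu::Stream stream;                                     // work is queued on this stream
    cascade.detectMultiScale(colored, rois, objects, 1, stream);
    stream.waitForCompletion();                                 // sync before reading objects
    return 0;
}
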
cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels); // allocate buffers - dmem.create(FRAME_HEIGHT * HOG_LUV_BINS, FRAME_WIDTH, CV_8UC1); + dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1); @@ -395,7 +398,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& / GpuMat& /*objects*/, const int /*rejectfactor*/, Stream /*stream*/) { // only color images are supperted - CV_Assert(image.type() == CV_8UC3); + CV_Assert(image.type() == CV_8UC4); // only this window size allowed CV_Assert(image.cols == 640 && image.rows == 480); @@ -406,4 +409,4 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& / flds.detect(); } -#endif +#endif \ No newline at end of file From 14a0dd8c98ecff30ea17fe4ecd1a900df6f32f06 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 21 Sep 2012 11:44:26 +0400 Subject: [PATCH 010/155] fix typo in cascade loading --- modules/gpu/src/softcascade.cpp | 33 +++++++++++++-------------- modules/gpu/test/test_softcascade.cpp | 7 +++--- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 80473da95..fb36efddc 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -198,11 +198,11 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float FileNode fns = *it; float scale = (float)fns[SC_OCT_SCALE]; scales.push_back(scale); - ushort nstages = saturate_cast((int)fn[SC_OCT_STAGES]); + ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); ushort2 size; size.x = cvRound(ORIG_OBJECT_WIDTH * scale); size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); - shrinkage = saturate_cast((int)fn[SC_OCT_SHRINKAGE]); + shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); icf::Octave octave(octIndex, nstages, shrinkage, size, scale); CV_Assert(octave.stages > 0); @@ -219,7 +219,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float for (; st != st_end; ++st ) { fns = *st; - vstages.push_back((float)fn[SC_STAGE_THRESHOLD]); + vstages.push_back((float)fns[SC_STAGE_THRESHOLD]); fns = fns[SC_WEEK]; FileNodeIterator ftr = fns.begin(), ft_end = fns.end(); @@ -230,7 +230,8 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float for (; inIt != inIt_end;) { int feature = (int)(*(inIt +=2)++) + feature_offset; - vnodes.push_back(icf::Node(feature, (float)(*(inIt++)))); + float th = (float)(*(inIt++)); + vnodes.push_back(icf::Node(feature, th)); } fns = (*ftr)[SC_LEAF]; @@ -277,7 +278,7 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); CV_Assert(!levels.empty()); - // init Cascade + //init Cascade cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels); // allocate buffers @@ -317,7 +318,7 @@ namespace { float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; - printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); + // printf("!!! 
scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); return a * pow(scaling, b); } }; @@ -352,19 +353,17 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vectorscale - // << " " - // << levels[sc].relScale - // << " " << levels[sc].shrScale - // << " [" << levels[sc].objSize.width - // << " " << levels[sc].objSize.height << "] [" - // << levels[sc].workRect.width << " " << levels[sc].workRect.height << "]" << std::endl; + // << vlevels[sc].octave + // << " relScale " + // << vlevels[sc].relScale + // << " " << vlevels[sc].shrScale + // << " [" << (int)vlevels[sc].objSize.x + // << " " << (int)vlevels[sc].objSize.y << "] [" + // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; } + levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) )); } cv::gpu::SoftCascade::SoftCascade() : filds(0) {} diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 821a2b140..f94b0b726 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -48,7 +48,7 @@ using cv::gpu::GpuMat; TEST(SoftCascade, readCascade) { - std::string xml = cvtest::TS::ptr()->get_data_path() + "cascadeandhog/icf-template.xml"; + std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml"; cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(xml)); @@ -56,11 +56,12 @@ TEST(SoftCascade, readCascade) TEST(SoftCascade, detect) { - std::string xml = cvtest::TS::ptr()->get_data_path() + "cascadeandhog/sc_cvpr_2012_to_opencv.xml"; + std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(xml)); - cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "cascadeandhog/bahnhof/image_00000000_0.png"); + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + + "../cv/cascadeandhog/bahnhof/image_00000000_0.png"); ASSERT_FALSE(coloredCpu.empty()); GpuMat colored(coloredCpu), objectBoxes, rois; From 1a52a322b50b7756464e8021f7225fa75f5a8dc2 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 21 Sep 2012 16:10:40 +0400 Subject: [PATCH 011/155] add performance test for GPU soft cascade --- modules/gpu/perf/perf_objdetect.cpp | 39 +++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 6b864a3e5..cebcbdb63 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -89,6 +89,45 @@ PERF_TEST_P(HOG, CalTech, Values("gpu/caltech/image_00000009_0.png", "gp SANITY_CHECK(found_locations); } +typedef pair pair_string; +DEF_PARAM_TEST_1(SoftCascade, pair_string); + +PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml", + "cv/cascadeandhog/bahnhof/image_00000000_0.png"))) +{ + if (runOnGpu) + { + cv::Mat cpu = readImage(GetParam().second); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(GetParam().first)); + + cv::gpu::GpuMat rois, objectBoxes; + cascade.detectMultiScale(colored, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detectMultiScale(colored, rois, objectBoxes); + } + } else + { + cv::Mat colored = readImage(GetParam().second); + ASSERT_FALSE(colored.empty()); + + cv::SoftCascade cascade; + ASSERT_TRUE(cascade.load(GetParam().first)); + + 
std::vector rois, objectBoxes; + cascade.detectMultiScale(colored, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detectMultiScale(colored, rois, objectBoxes); + } + } +} /////////////////////////////////////////////////////////////// // HaarClassifier From 5d15e4ea58f8aa591d9be9d64ae4d22936fe0b88 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 21 Sep 2012 16:12:18 +0400 Subject: [PATCH 012/155] CUDA kernels interface --- modules/gpu/src/cuda/isf-sc.cu | 85 +++++++++++++++++++++++++++++++-- modules/gpu/src/icf.hpp | 27 ++++++++--- modules/gpu/src/softcascade.cpp | 14 +++--- 3 files changed, 110 insertions(+), 16 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index a6418c1d3..89a74eeac 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -42,13 +42,90 @@ #include -void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv) const +namespace cv { namespace gpu { + + + namespace device { + +__global__ void rgb2grayluv(const uchar3* __restrict__ rgb, uchar* __restrict__ hog, + const int rgbPitch, const int hogPitch) { - // detection kernel } -void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& image) +__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog, + const int pitch) +{ +} + +__global__ void decimate(const uchar* __restrict__ hogluv, uchar* __restrict__ shrank, + const int inPitch, const int outPitch ) +{ +} + +__global__ void intRow(const uchar* __restrict__ hogluv, ushort* __restrict__ sum, + const int inPitch, const int outPitch) +{ + +} + +__global__ void intCol(ushort* __restrict__ sum, const int pitch) +{ + +} + + +__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch) +{ + cascade.detectAt(); +} + +} + +void __device icf::Cascade::detectAt() const +{ + +} + +void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const +{ + // detection kernel + +} + +void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream) { // color convertin kernel + dim3 block(32, 8); + dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8); + + uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS); + device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, rgb.step, dmem.step); + cudaSafeCall( cudaGetLastError()); + // hog calculation kernel -} \ No newline at end of file + channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS); + device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step); + cudaSafeCall( cudaGetLastError() ); + + const int shrWidth = FRAME_WIDTH / shrinkage; + const int shrHeight = FRAME_HEIGHT / shrinkage; + + // decimate kernel + grid = dim3(shrWidth / 32, shrHeight / 8); + device::decimate<<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); + cudaSafeCall( cudaGetLastError() ); + + // integrate rows + block = dim3(shrWidth, 1); + grid = dim3(shrHeight * HOG_LUV_BINS, 1); + device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), shrunk.step, hogluv.step); + cudaSafeCall( cudaGetLastError() ); + + // integrate cols + block = dim3(128, 1); + grid = dim3(shrWidth * HOG_LUV_BINS, 1); + device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step); + cudaSafeCall( cudaGetLastError() ); +} + +}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 7183fc0ac..8b075beba 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -46,17 +46,19 @@ #define __OPENCV_ICF_HPP__ #if defined __CUDACC__ 
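
The intRow/intCol kernels in this patch are declared but still empty; the intended computation is a separable inclusive prefix sum (rows first, then columns) over each shrunken channel plane. A deliberately naive CUDA sketch of the row pass, only to pin down the semantics: one thread per row, no parallel scan, grid assumed sized exactly to the row count:

__global__ void intRowNaive(const unsigned char* __restrict__ src, int srcPitch,
                            unsigned short* __restrict__ sum, int sumPitch, int width)
{
    const int y = blockIdx.x * blockDim.x + threadIdx.x; // one thread per row

    const unsigned char* in = src + y * srcPitch;
    unsigned short* out = sum + y * sumPitch;

    unsigned short acc = 0;
    for (int x = 0; x < width; ++x)
    {
        acc += in[x];   // running sum along the row
        out[x] = acc;   // the column pass then accumulates these down each column
    }
}
// For the 160-wide shrunken planes a row sums to at most 160 * 255 = 40800,
// so ushort does not overflow here; full-resolution rows would.
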
-# define __hd__ __host__ __device__ __forceinline__ +# define __device __device__ __forceinline__ #else -# define __hd__ +# define __device #endif -namespace icf { +namespace cv { namespace gpu { namespace icf { using cv::gpu::PtrStepSzb; using cv::gpu::PtrStepSzf; +typedef unsigned char uchar; + struct Cascade { Cascade() {} @@ -64,7 +66,8 @@ struct Cascade const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} - void detect(const cv::gpu::PtrStepSzb& hogluv) const; + void detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const; + void __device detectAt() const; PtrStepSzb octaves; PtrStepSzf stages; @@ -83,12 +86,24 @@ struct ChannelStorage const cv::gpu::PtrStepSzb& itg, const int s) : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {} - void frame(const cv::gpu::PtrStepSz& image); + void frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream); PtrStepSzb dmem; PtrStepSzb shrunk; PtrStepSzb hogluv; + enum + { + FRAME_WIDTH = 640, + FRAME_HEIGHT = 480, + TOTAL_SCALES = 55, + CLASSIFIERS = 5, + ORIG_OBJECT_WIDTH = 64, + ORIG_OBJECT_HEIGHT = 128, + HOG_BINS = 6, + HOG_LUV_BINS = 10 + }; + int shrinkage; }; @@ -143,6 +158,6 @@ struct __align__(8) Level //is actually 24 bytes objSize.y = round(oct.size.y * relScale); } }; -} +}}} #endif \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index fb36efddc..b2419c12c 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -100,9 +100,9 @@ struct cv::gpu::SoftCascade::Filds }; bool fill(const FileNode &root, const float mins, const float maxs); - void detect() const + void detect(cudaStream_t stream) const { - cascade.detect(hogluv); + cascade.detect(hogluv, stream); } private: @@ -394,18 +394,20 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c } void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /*rois*/, - GpuMat& /*objects*/, const int /*rejectfactor*/, Stream /*stream*/) + GpuMat& /*objects*/, const int /*rejectfactor*/, Stream s) { // only color images are supperted - CV_Assert(image.type() == CV_8UC4); + CV_Assert(image.type() == CV_8UC3); // only this window size allowed CV_Assert(image.cols == 640 && image.rows == 480); Filds& flds = *filds; - flds.storage.frame(image); - flds.detect(); + cudaStream_t stream = StreamAccessor::getStream(s); + + flds.storage.frame(image, stream); + flds.detect(stream); } #endif \ No newline at end of file From 0691dc554f046b880f61ad26526a798f01c12b34 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 21 Sep 2012 18:49:51 +0400 Subject: [PATCH 013/155] fix compilation --- modules/gpu/src/cuda/texture_binder.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/gpu/src/cuda/texture_binder.hpp b/modules/gpu/src/cuda/texture_binder.hpp index 4f42b099d..391eb9a19 100644 --- a/modules/gpu/src/cuda/texture_binder.hpp +++ b/modules/gpu/src/cuda/texture_binder.hpp @@ -85,7 +85,7 @@ namespace cv namespace device { - using pcl::gpu::TextureBinder; + using cv::gpu::TextureBinder; } } From 1cf7a46f3a01c88fdbefc785943f4b4182ab4079 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 21 Sep 2012 19:42:19 +0400 Subject: [PATCH 014/155] fix data paths in performance test --- modules/gpu/perf/perf_objdetect.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index cebcbdb63..a3a6e9c6b 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -97,12 +97,12 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog { if (runOnGpu) { - cv::Mat cpu = readImage(GetParam().second); + cv::Mat cpu = readImage (GetParam().second); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(GetParam().first)); + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first))); cv::gpu::GpuMat rois, objectBoxes; cascade.detectMultiScale(colored, rois, objectBoxes); From 08b4e780deb9ff6d7f1dc31031c317ea6b3bb6e6 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 21 Sep 2012 19:44:30 +0400 Subject: [PATCH 015/155] add shrinking kernel --- modules/gpu/src/cuda/isf-sc.cu | 132 +++++++++++++++++++++++++++++++-- modules/gpu/src/icf.hpp | 1 + 2 files changed, 126 insertions(+), 7 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 89a74eeac..5cde71070 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -41,25 +41,136 @@ //M*/ #include +#include namespace cv { namespace gpu { namespace device { +enum { + HOG_BINS = 6, + HOG_LUV_BINS = 10, + WIDTH = 640, + HEIGHT = 480, + GREY_OFFSET = HEIGHT * HOG_LUV_BINS +}; + +/* Returns the nearest upper power of two, works only for +the typical GPU thread count (pert block) values */ +int power_2up(unsigned int n) +{ + if (n < 1) return 1; + else if (n < 2) return 2; + else if (n < 4) return 4; + else if (n < 8) return 8; + else if (n < 16) return 16; + else if (n < 32) return 32; + else if (n < 64) return 64; + else if (n < 128) return 128; + else if (n < 256) return 256; + else if (n < 512) return 512; + else if (n < 1024) return 1024; + return -1; // Input is too big +} + + +__device__ __forceinline__ uchar grey(const uchar3 rgb) +{ + return saturate_cast(rgb.x * 0.114f + rgb.y * 0.587f + rgb.z * 0.299f); +} + +__device__ __forceinline__ void luv(const uchar3 rgb, uchar& l, uchar& u, uchar& v) +{ + +} + __global__ void rgb2grayluv(const uchar3* __restrict__ rgb, uchar* __restrict__ hog, const int rgbPitch, const int hogPitch) { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + + const uchar3 color = rgb[rgbPitch * y + x]; + + uchar l, u, v; + luv(color, l, u, v); + + hog[hogPitch * y + x] = l; + hog[hogPitch * (y + HEIGHT) + x] = u; + hog[hogPitch * (y + 2 * HEIGHT) + x] = v; + hog[hogPitch * (y + 3 * HEIGHT) + x] = grey(color); } -__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog, - const int pitch) +__device__ __forceinline__ +int qangle(const float &y, const float &x) { + int bin = 0; +// const float2 &bin_vector_zero = const_angle_bins_vectors[0]; +// float max_dot_product = fabs(x*bin_vector_zero.x + y*bin_vector_zero.y); + +// // let us hope this gets unrolled +// #pragma unroll +// for(int i=1; i < num_angles_bin; i+=1) +// { +// const float2 &bin_vector_i = const_angle_bins_vectors[i]; +// //const float2 bin_vector_i = const_angle_bins_vectors[i]; +// //const float2 &bin_vector_i = angle_bins_vectors[i]; +// const float dot_product = fabs(x*bin_vector_i.x + y*bin_vector_i.y); +// if(dot_product > max_dot_product) +// { +// max_dot_product = dot_product; +// index = i; +// } +// } + + return bin; } +// texture tgray; +__global__ void gray2hog(const uchar* 
__restrict__ gray, uchar* __restrict__ hog, const int pitch, const float norm) +{ + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + + // derivative + float dx = gray[y * pitch + x + 1]; + dx -= gray[y * pitch + x - 1]; + + float dy = gray[(y + 1) * pitch + x]; + dy -= gray[(y -1) * pitch + x - 1]; + + // mag and angle + const uchar mag = saturate_cast(sqrtf(dy * dy + dx * dx) * norm); + const int bin = qangle(dx, dy); + +} + +template +__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x) +{ + int out = 0; +#pragma unroll + for(int dy = 0; dy < FACTOR; ++dy) +#pragma unroll + for(int dx = 0; dx < FACTOR; ++dx) + { + out += ptr[dy * pitch + dx]; + } + + return saturate_cast(out / FACTOR); +} + +template __global__ void decimate(const uchar* __restrict__ hogluv, uchar* __restrict__ shrank, const int inPitch, const int outPitch ) { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + + const uchar* ptr = hogluv + (FACTOR * y) * inPitch + (FACTOR * x); + + shrank[ y * outPitch + x]= shrink(ptr, inPitch, y, x); } __global__ void intRow(const uchar* __restrict__ hogluv, ushort* __restrict__ sum, @@ -89,6 +200,11 @@ void __device icf::Cascade::detectAt() const void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const { // detection kernel + dim3 block(32, 8, 1); + dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 64); + device::detect<<>>(*this, hogluv, hogluv.step / sizeof(ushort)); + if (!stream) + cudaSafeCall( cudaDeviceSynchronize() ); } @@ -99,12 +215,13 @@ void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& rgb, cudaStrea dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8); uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS); - device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, rgb.step, dmem.step); + device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, + rgb.step / sizeof(uchar3), dmem.step); cudaSafeCall( cudaGetLastError()); // hog calculation kernel channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS); - device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step); + device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling); cudaSafeCall( cudaGetLastError() ); const int shrWidth = FRAME_WIDTH / shrinkage; @@ -112,19 +229,20 @@ void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& rgb, cudaStrea // decimate kernel grid = dim3(shrWidth / 32, shrHeight / 8); - device::decimate<<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); + device::decimate<4><<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); cudaSafeCall( cudaGetLastError() ); // integrate rows block = dim3(shrWidth, 1); grid = dim3(shrHeight * HOG_LUV_BINS, 1); - device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), shrunk.step, hogluv.step); + device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), + shrunk.step, hogluv.step / sizeof(ushort)); cudaSafeCall( cudaGetLastError() ); // integrate cols block = dim3(128, 1); grid = dim3(shrWidth * HOG_LUV_BINS, 1); - device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step); + device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort)); cudaSafeCall( cudaGetLastError() ); } diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 8b075beba..69d21fdd9 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -105,6 
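
The shrink/decimate pair above averages each FACTOR x FACTOR input cell into one output pixel; note that the patch divides the cell sum by FACTOR rather than FACTOR * FACTOR, so as written it produces a scaled sum that mostly saturates, not a mean. A corrected standalone sketch, assuming one thread per output pixel:

// FACTOR x FACTOR box average + subsample (a true mean divides by FACTOR * FACTOR).
template <int FACTOR>
__global__ void shrinkMean(const unsigned char* __restrict__ src, int srcPitch,
                           unsigned char* __restrict__ dst, int dstPitch)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    const unsigned char* cell = src + (y * FACTOR) * srcPitch + (x * FACTOR);

    int sum = 0;
    for (int dy = 0; dy < FACTOR; ++dy)
        for (int dx = 0; dx < FACTOR; ++dx)
            sum += cell[dy * srcPitch + dx];

    dst[y * dstPitch + x] = (unsigned char)(sum / (FACTOR * FACTOR));
}
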
+105,7 @@ struct ChannelStorage }; int shrinkage; + static const float magnitudeScaling = 1.f ;// / sqrt(2); }; struct __align__(16) Octave From 1bf85996b37d4d8995041050f6e0d03f2beca4bc Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 24 Sep 2012 17:59:44 +0400 Subject: [PATCH 016/155] add detections vector initialization in tests --- modules/gpu/perf/perf_objdetect.cpp | 2 +- modules/gpu/test/test_softcascade.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index a3a6e9c6b..48a355d6a 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first))); - cv::gpu::GpuMat rois, objectBoxes; + cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC1); cascade.detectMultiScale(colored, rois, objectBoxes); TEST_CYCLE() diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index f94b0b726..c7e3a1f77 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -63,7 +63,7 @@ TEST(SoftCascade, detect) cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/bahnhof/image_00000000_0.png"); ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes, rois; + GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois; // ASSERT_NO_THROW( // { From ba50d193412677954cff9a4a53c4ebe31e2e9661 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 24 Sep 2012 18:00:47 +0400 Subject: [PATCH 017/155] first version of soft cascade on GPU --- modules/gpu/src/cuda/isf-sc.cu | 219 ++++++++++++++++++++++++++------ modules/gpu/src/icf.hpp | 78 ++++++------ modules/gpu/src/softcascade.cpp | 98 ++++++++++++-- 3 files changed, 311 insertions(+), 84 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 5cde71070..37c6e3023 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -42,11 +42,48 @@ #include #include +#include +#include -namespace cv { namespace gpu { +namespace cv { namespace gpu { namespace device { +namespace icf { - namespace device { + enum { + HOG_BINS = 6, + HOG_LUV_BINS = 10, + WIDTH = 640, + HEIGHT = 480, + GREY_OFFSET = HEIGHT * HOG_LUV_BINS + }; + + __global__ void magToHist(const uchar* __restrict__ mag, + const float* __restrict__ angle, const int angPitch, + uchar* __restrict__ hog, const int hogPitch) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + + const int bin = (int)(angle[y * angPitch + x]); + const uchar val = mag[y * angPitch + x]; + + hog[((HEIGHT * bin) + y) * hogPitch + x] = val; + } + + void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle) + { + const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS); + uchar* hog = (uchar*)hogluv.ptr(); + const float* angle = (const float*)nangle.ptr(); + + dim3 block(32, 8); + dim3 grid(WIDTH / 32, HEIGHT / 8); + + magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step); + cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaDeviceSynchronize() ); + } +} enum { HOG_BINS = 6, @@ -185,65 +222,175 @@ __global__ void intCol(ushort* __restrict__ sum, const int pitch) } -__global__ void detect(const cv::gpu::icf::Cascade 
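
magToHist above scatters each pixel's gradient magnitude into one of six orientation planes; the bin index arrives precomputed because the host scales the cartToPolar angle (in degrees) by 1/60 beforehand (see detectMultiScale in PATCH 017 below). The binning arithmetic, restated standalone:

#include <cstdio>

// 6 bins of 60 degrees over the full [0, 360) orientation range, matching the
// 1/60 scaling applied before magToHist; the % 6 only guards angle == 360.
static int hogBin(float angleDegrees) { return (int)(angleDegrees / 60.f) % 6; }

int main()
{
    std::printf("%d %d %d\n", hogBin(0.f), hogBin(119.f), hogBin(359.f)); // 0 1 5
    return 0;
}
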
cascade, const uchar* __restrict__ hogluv, const int pitch) +__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch, + PtrStepSz objects) { - cascade.detectAt(); + cascade.detectAt(hogluv, pitch, objects); } } -void __device icf::Cascade::detectAt() const +float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, + const int channel, const float threshold) const { + float relScale = level.relScale; + float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + // rescale + scaledRect.x = __float2int_rn(relScale * scaledRect.x); + scaledRect.y = __float2int_rn(relScale * scaledRect.y); + scaledRect.z = __float2int_rn(relScale * scaledRect.z); + scaledRect.w = __float2int_rn(relScale * scaledRect.w); + + float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + + float approx = 1.f; + if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) + { + const float expected_new_area = farea * relScale * relScale; + approx = expected_new_area / sarea; + } + + // compensation areas rounding + float rootThreshold = threshold / approx; + rootThreshold *= level.scaling[(int)(channel > 6)]; + + return rootThreshold; } -void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const +typedef unsigned char uchar; +float __device get(const uchar* __restrict__ hogluv, const int pitch, + const int x, const int y, int channel, uchar4 area) +{ + const uchar* curr = hogluv + ((channel * 121) + y) * pitch; + + int a = curr[area.y * pitch + x + area.x]; + int b = curr[area.y * pitch + x + area.z]; + int c = curr[area.w * pitch + x + area.z]; + int d = curr[area.w * pitch + x + area.x]; + + return (a - b + c - d); +} + + +void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int pitch, + PtrStepSz& objects) const +{ + const icf::Level* lls = (const icf::Level*)levels.ptr(); + Level level = lls[0]; + + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + + if (x >= level.workRect.x || y >= level.workRect.y) return; + + const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; + const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages; + + float detectionScore = 0.f; + + int st = stBegin; + for(; st < stEnd; ++st) + { + const float stage = stages(0, st); + { + const int nId = st * 3; + + // work with root node + const Node node = ((const Node*)nodes.ptr())[nId]; + const Feature feature = ((const Feature*)features.ptr())[node.feature]; + + uchar4 scaledRect = feature.rect; + float threshold = rescale(level, scaledRect, feature.channel, node.threshold); + + float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); + + int next = 1 + (int)(sum >= threshold); + + // leaves + const Node leaf = ((const Node*)nodes.ptr())[nId + next]; + const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; + + scaledRect = fLeaf.rect; + threshold = rescale(level, scaledRect, feature.channel, node.threshold); + sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect); + + const int lShift = (next - 1) * 2 + (int)(sum >= threshold); + float impact = leaves(0, (st * 4) + lShift); + + detectionScore += impact; + } + + if (detectionScore <= stage) break; + } + + // if (!threadIdx.x && !threadIdx.y)// printf("%f %d\n", detectionScore, st); + // printf("x %d y %d: %d\n", x, y, st); + + if (st == stEnd) + { + // printf(" got %d\n", st); + uchar4 a; + a.x = 
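
get() above is the classic four-read integral-image rectangle sum; the patch's a - b + c - d ordering is the standard tl - tr + br - bl identity. An equivalent host-side restatement (addressing simplified: the patch folds the channel offset into the base pointer before indexing):

// Rectangle sum from an integral image: S = I(tl) - I(tr) + I(br) - I(bl).
static int rectSum(const int* integral, int pitch, int x, int y,
                   int x0, int y0, int x1, int y1)
{
    const int a = integral[(y + y0) * pitch + (x + x0)]; // top-left
    const int b = integral[(y + y0) * pitch + (x + x1)]; // top-right
    const int c = integral[(y + y1) * pitch + (x + x1)]; // bottom-right
    const int d = integral[(y + y1) * pitch + (x + x0)]; // bottom-left
    return a - b + c - d; // sum over the half-open rectangle [x0,x1) x [y0,y1)
}
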
level.workRect.x; + a.y = level.workRect.y; + objects(0, threadIdx.x) = a; + } +} + +void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz objects, + cudaStream_t stream) const { // detection kernel dim3 block(32, 8, 1); - dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 64); - device::detect<<>>(*this, hogluv, hogluv.step / sizeof(ushort)); + // dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1); + dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1); + device::detect<<>>(*this, hogluv, hogluv.step / sizeof(ushort), objects); + cudaSafeCall( cudaGetLastError() ); if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); } +//////////////////////////////////////////////////// + + + void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream) { - // color convertin kernel - dim3 block(32, 8); - dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8); +// // // color convertin kernel +// // dim3 block(32, 8); +// // dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8); - uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS); - device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, - rgb.step / sizeof(uchar3), dmem.step); - cudaSafeCall( cudaGetLastError()); +// // uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS); +// // device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, +// // rgb.step / sizeof(uchar3), dmem.step); +// // cudaSafeCall( cudaGetLastError()); - // hog calculation kernel - channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS); - device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling); - cudaSafeCall( cudaGetLastError() ); +// // // hog calculation kernel +// // channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS); +// // device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling); +// // cudaSafeCall( cudaGetLastError() ); - const int shrWidth = FRAME_WIDTH / shrinkage; - const int shrHeight = FRAME_HEIGHT / shrinkage; +// // const int shrWidth = FRAME_WIDTH / shrinkage; +// // const int shrHeight = FRAME_HEIGHT / shrinkage; - // decimate kernel - grid = dim3(shrWidth / 32, shrHeight / 8); - device::decimate<4><<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); - cudaSafeCall( cudaGetLastError() ); +// // // decimate kernel +// // grid = dim3(shrWidth / 32, shrHeight / 8); +// // device::decimate<4><<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); +// // cudaSafeCall( cudaGetLastError() ); - // integrate rows - block = dim3(shrWidth, 1); - grid = dim3(shrHeight * HOG_LUV_BINS, 1); - device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), - shrunk.step, hogluv.step / sizeof(ushort)); - cudaSafeCall( cudaGetLastError() ); +// // // integrate rows +// // block = dim3(shrWidth, 1); +// // grid = dim3(shrHeight * HOG_LUV_BINS, 1); +// // device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), +// // shrunk.step, hogluv.step / sizeof(ushort)); +// // cudaSafeCall( cudaGetLastError() ); - // integrate cols - block = dim3(128, 1); - grid = dim3(shrWidth * HOG_LUV_BINS, 1); - device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort)); - cudaSafeCall( cudaGetLastError() ); +// // // integrate cols +// // block = dim3(128, 1); +// // grid = dim3(shrWidth * HOG_LUV_BINS, 1); +// // device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort)); +// // cudaSafeCall( cudaGetLastError() ); } }} \ No 
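
The heart of detectAt is the soft-cascade rejection schedule: each stage adds one weak classifier's leaf value to a running score, and the window is abandoned the moment the score falls to or below that stage's threshold. Stripped of GPU addressing and the depth-2 tree evaluation, the control flow is:

// Scalar restatement of the per-window loop in detectAt.
static bool windowPasses(const float* votes, const float* rejection, int nstages)
{
    float score = 0.f;
    for (int st = 0; st < nstages; ++st)
    {
        score += votes[st];                        // leaf value chosen by the depth-2 tree
        if (score <= rejection[st]) return false;  // early exit: most windows die early
    }
    return true; // survived all stages: report a detection at this window
}
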
newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 69d21fdd9..454dad881 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -59,6 +59,42 @@ using cv::gpu::PtrStepSzf; typedef unsigned char uchar; +struct __align__(16) Octave +{ + ushort index; + ushort stages; + ushort shrinkage; + ushort2 size; + float scale; + + Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) + : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} +}; + +struct __align__(8) Level //is actually 24 bytes +{ + int octave; + + // float origScale; //not actually used + float relScale; + float shrScale; // used for marking detection + float scaling[2]; // calculated according to Dollal paper + + // for 640x480 we can not get overflow + uchar2 workRect; + uchar2 objSize; + + Level(int idx, const Octave& oct, const float scale, const int w, const int h) + : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) + { + workRect.x = round(w / (float)oct.shrinkage); + workRect.y = round(h / (float)oct.shrinkage); + + objSize.x = round(oct.size.x * relScale); + objSize.y = round(oct.size.y * relScale); + } +}; + struct Cascade { Cascade() {} @@ -66,8 +102,10 @@ struct Cascade const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} - void detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const; - void __device detectAt() const; + void detect(const cv::gpu::PtrStepSzb& hogluv, cv::gpu::PtrStepSz objects, cudaStream_t stream) const; + void __device detectAt(const uchar* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const; + float __device rescale(const icf::Level& level, uchar4& scaledRect, + const int channel, const float threshold) const; PtrStepSzb octaves; PtrStepSzf stages; @@ -108,18 +146,6 @@ struct ChannelStorage static const float magnitudeScaling = 1.f ;// / sqrt(2); }; -struct __align__(16) Octave -{ - ushort index; - ushort stages; - ushort shrinkage; - ushort2 size; - float scale; - - Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) - : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} -}; - struct __align__(8) Node { int feature; @@ -135,30 +161,6 @@ struct __align__(8) Feature Feature(const int c, const uchar4 r) : channel(c), rect(r) {} }; - -struct __align__(8) Level //is actually 24 bytes -{ - int octave; - - // float origScale; //not actually used - float relScale; - float shrScale; // used for marking detection - float scaling[2]; // calculated according to Dollal paper - - // for 640x480 we can not get overflow - uchar2 workRect; - uchar2 objSize; - - Level(int idx, const Octave& oct, const float scale, const int w, const int h) - : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) - { - workRect.x = round(w / (float)oct.shrinkage); - workRect.y = round(h / (float)oct.shrinkage); - - objSize.x = round(oct.size.x * relScale); - objSize.y = round(oct.size.y * relScale); - } -}; }}} #endif \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index b2419c12c..abcae73dc 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -41,6 +41,7 @@ //M*/ #include +#include "opencv2/highgui/highgui.hpp" #if !defined (HAVE_CUDA) @@ -58,6 +59,12 @@ void 
cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat #include +namespace cv { namespace gpu { namespace device { +namespace icf { + void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle); +} +}}} + struct cv::gpu::SoftCascade::Filds { // scales range @@ -81,6 +88,16 @@ struct cv::gpu::SoftCascade::Filds // 161x121x10 GpuMat hogluv; + // will be removed in final version + // temporial mat for cvtColor + GpuMat luv; + + // temporial mat for integrall + GpuMat integralBuffer; + + // temp matrix for sobel and cartToPolar + GpuMat dfdx, dfdy, angle, mag, nmag, nangle; + std::vector scales; icf::Cascade cascade; @@ -100,9 +117,9 @@ struct cv::gpu::SoftCascade::Filds }; bool fill(const FileNode &root, const float mins, const float maxs); - void detect(cudaStream_t stream) const + void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const { - cascade.detect(hogluv, stream); + cascade.detect(hogluv, objects, stream); } private: @@ -284,7 +301,18 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float // allocate buffers dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); - hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1); + // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1); + hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1); + luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); + integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); + + dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + + nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage); return true; @@ -393,21 +421,71 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c return true; } -void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /*rois*/, - GpuMat& /*objects*/, const int /*rejectfactor*/, Stream s) +void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/, + GpuMat& objects, const int /*rejectfactor*/, Stream s) { // only color images are supperted - CV_Assert(image.type() == CV_8UC3); + CV_Assert(colored.type() == CV_8UC3); - // only this window size allowed - CV_Assert(image.cols == 640 && image.rows == 480); + // // only this window size allowed + CV_Assert(colored.cols == 640 && colored.rows == 480); Filds& flds = *filds; + GpuMat& dmem = flds.dmem; + cudaMemset(dmem.data, 0, dmem.step * dmem.rows); + GpuMat& shrunk = flds.shrunk; + int w = shrunk.cols; + int h = colored.rows / flds.storage.shrinkage; cudaStream_t stream = StreamAccessor::getStream(s); - flds.storage.frame(image, stream); - flds.detect(stream); + std::vector splited; + for(int i = 0; i < 3; ++i) + { + splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows))); + } + + GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) ); + + cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY); + + //create hog + cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25); + cv::gpu::Sobel(gray, flds.dfdy, 
CV_32F, 0, 1, 3, 0.25); + + cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true); + + cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag); + cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle); + + GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows)); + flds.nmag.convertTo(magCannel, CV_8UC1); + device::icf::fillBins(dmem, flds.nangle); + + // create luv + cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv); + cv::gpu::split(flds.luv, splited); + + GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS)); + cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); + // cv::Mat cpu(plane); + // cv::imshow("channels", cpu); + // cv::waitKey(0); + + // fer debug purpose + // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); + + for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) + { + GpuMat channel(shrunk, cv::Rect(0, h * i, w, h )); + GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1)); + cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); + } + + // detection + flds.detect(objects, stream); + + // flds.storage.frame(colored, stream); } #endif \ No newline at end of file From e606a0d6519eadffba18e6060ace334f408c4411 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 24 Sep 2012 18:05:08 +0400 Subject: [PATCH 018/155] remove dead code --- modules/gpu/src/cuda/isf-sc.cu | 179 --------------------------------- modules/gpu/src/icf.hpp | 2 +- 2 files changed, 1 insertion(+), 180 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 37c6e3023..b5eb5ad17 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -85,143 +85,6 @@ namespace icf { } } -enum { - HOG_BINS = 6, - HOG_LUV_BINS = 10, - WIDTH = 640, - HEIGHT = 480, - GREY_OFFSET = HEIGHT * HOG_LUV_BINS -}; - -/* Returns the nearest upper power of two, works only for -the typical GPU thread count (pert block) values */ -int power_2up(unsigned int n) -{ - if (n < 1) return 1; - else if (n < 2) return 2; - else if (n < 4) return 4; - else if (n < 8) return 8; - else if (n < 16) return 16; - else if (n < 32) return 32; - else if (n < 64) return 64; - else if (n < 128) return 128; - else if (n < 256) return 256; - else if (n < 512) return 512; - else if (n < 1024) return 1024; - return -1; // Input is too big -} - - -__device__ __forceinline__ uchar grey(const uchar3 rgb) -{ - return saturate_cast(rgb.x * 0.114f + rgb.y * 0.587f + rgb.z * 0.299f); -} - -__device__ __forceinline__ void luv(const uchar3 rgb, uchar& l, uchar& u, uchar& v) -{ - -} - -__global__ void rgb2grayluv(const uchar3* __restrict__ rgb, uchar* __restrict__ hog, - const int rgbPitch, const int hogPitch) -{ - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; - - const uchar3 color = rgb[rgbPitch * y + x]; - - uchar l, u, v; - luv(color, l, u, v); - - hog[hogPitch * y + x] = l; - hog[hogPitch * (y + HEIGHT) + x] = u; - hog[hogPitch * (y + 2 * HEIGHT) + x] = v; - hog[hogPitch * (y + 3 * HEIGHT) + x] = grey(color); -} - -__device__ __forceinline__ -int qangle(const float &y, const float &x) -{ - int bin = 0; -// const float2 &bin_vector_zero = const_angle_bins_vectors[0]; -// float max_dot_product = fabs(x*bin_vector_zero.x + y*bin_vector_zero.y); - -// // let us hope this gets unrolled -// #pragma unroll -// for(int i=1; i < num_angles_bin; i+=1) -// { -// const float2 &bin_vector_i = 
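
Among the dead code removed above, power_2up rounds a thread count up to the next power of two by enumerating every case. For reference, an equivalent branchless formulation (a sketch, not part of the patch; assumes 32-bit unsigned n >= 1) is the classic bit-smearing round-up:

unsigned int power2up(unsigned int n)
{
    n -= 1;              // so exact powers of two map to themselves
    n |= n >> 1;  n |= n >> 2;  n |= n >> 4;
    n |= n >> 8;  n |= n >> 16;  // smear the top set bit into all lower bits
    return n + 1;        // e.g. 5 -> 8, 64 -> 64
}
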
const_angle_bins_vectors[i]; -// //const float2 bin_vector_i = const_angle_bins_vectors[i]; -// //const float2 &bin_vector_i = angle_bins_vectors[i]; -// const float dot_product = fabs(x*bin_vector_i.x + y*bin_vector_i.y); -// if(dot_product > max_dot_product) -// { -// max_dot_product = dot_product; -// index = i; -// } -// } - - return bin; -} - -// texture tgray; -__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog, const int pitch, const float norm) -{ - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; - - // derivative - float dx = gray[y * pitch + x + 1]; - dx -= gray[y * pitch + x - 1]; - - float dy = gray[(y + 1) * pitch + x]; - dy -= gray[(y -1) * pitch + x - 1]; - - // mag and angle - const uchar mag = saturate_cast(sqrtf(dy * dy + dx * dx) * norm); - const int bin = qangle(dx, dy); - -} - -template -__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x) -{ - int out = 0; -#pragma unroll - for(int dy = 0; dy < FACTOR; ++dy) -#pragma unroll - for(int dx = 0; dx < FACTOR; ++dx) - { - out += ptr[dy * pitch + dx]; - } - - return saturate_cast(out / FACTOR); -} - -template -__global__ void decimate(const uchar* __restrict__ hogluv, uchar* __restrict__ shrank, - const int inPitch, const int outPitch ) -{ - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; - - const uchar* ptr = hogluv + (FACTOR * y) * inPitch + (FACTOR * x); - - shrank[ y * outPitch + x]= shrink(ptr, inPitch, y, x); -} - -__global__ void intRow(const uchar* __restrict__ hogluv, ushort* __restrict__ sum, - const int inPitch, const int outPitch) -{ - -} - -__global__ void intCol(ushort* __restrict__ sum, const int pitch) -{ - -} - - __global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch, PtrStepSz objects) { @@ -351,46 +214,4 @@ void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz o } -//////////////////////////////////////////////////// - - - -void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream) -{ -// // // color convertin kernel -// // dim3 block(32, 8); -// // dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8); - -// // uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS); -// // device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, -// // rgb.step / sizeof(uchar3), dmem.step); -// // cudaSafeCall( cudaGetLastError()); - -// // // hog calculation kernel -// // channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS); -// // device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling); -// // cudaSafeCall( cudaGetLastError() ); - -// // const int shrWidth = FRAME_WIDTH / shrinkage; -// // const int shrHeight = FRAME_HEIGHT / shrinkage; - -// // // decimate kernel -// // grid = dim3(shrWidth / 32, shrHeight / 8); -// // device::decimate<4><<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); -// // cudaSafeCall( cudaGetLastError() ); - -// // // integrate rows -// // block = dim3(shrWidth, 1); -// // grid = dim3(shrHeight * HOG_LUV_BINS, 1); -// // device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), -// // shrunk.step, hogluv.step / sizeof(ushort)); -// // cudaSafeCall( cudaGetLastError() ); - -// // // integrate cols -// // block = dim3(128, 1); -// // grid = dim3(shrWidth * HOG_LUV_BINS, 1); -// // device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / 
sizeof(ushort)); -// // cudaSafeCall( cudaGetLastError() ); -} - }} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 454dad881..a8ce8d483 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -124,7 +124,7 @@ struct ChannelStorage const cv::gpu::PtrStepSzb& itg, const int s) : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {} - void frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream); + void frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream){} PtrStepSzb dmem; PtrStepSzb shrunk; From dca27b4622c711f52797c126c0b2aba72f421497 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Tue, 25 Sep 2012 11:32:03 +0400 Subject: [PATCH 019/155] fix cast bug; add logging --- modules/gpu/src/cuda/isf-sc.cu | 87 +++++++++++++++++++++++++++------ modules/gpu/src/icf.hpp | 4 +- modules/gpu/src/softcascade.cpp | 34 +++++++++++-- 3 files changed, 103 insertions(+), 22 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index b5eb5ad17..6572c54fc 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -45,6 +45,8 @@ #include #include +//#define LOG_CUDA_CASCADE + namespace cv { namespace gpu { namespace device { namespace icf { @@ -85,7 +87,7 @@ namespace icf { } } -__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch, +__global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch, PtrStepSz objects) { cascade.detectAt(hogluv, pitch, objects); @@ -96,6 +98,11 @@ __global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restr float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, const int channel, const float threshold) const { +#if defined LOG_CUDA_CASCADE + printf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w); + printf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]); +#endif + float relScale = level.relScale; float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); @@ -107,6 +114,7 @@ float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + float approx = 1.f; if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) { @@ -114,40 +122,72 @@ float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect approx = expected_new_area / sarea; } +#if defined LOG_CUDA_CASCADE + printf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel, + scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); +#endif + // compensation areas rounding float rootThreshold = threshold / approx; + // printf(" approx %f\n", rootThreshold); rootThreshold *= level.scaling[(int)(channel > 6)]; +#if defined LOG_CUDA_CASCADE + printf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]); +#endif + return rootThreshold; } typedef unsigned char uchar; -float __device get(const uchar* __restrict__ hogluv, const int pitch, +float __device get(const int* __restrict__ hogluv, const int pitch, const int x, const int y, int channel, uchar4 area) { - const uchar* curr = hogluv + ((channel * 121) + y) * pitch; +#if defined LOG_CUDA_CASCADE + printf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); + printf("get for channel 
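
In summary, rescale() above corrects a node threshold for two effects: rounding the feature rectangle to the integer grid changes its area away from the ideal farea * relScale^2, and different channel families respond differently to rescaling. The adjustment is

    approx     = (farea * relScale * relScale) / sarea
    threshold' = (threshold / approx) * scaling[channel > 6]

so if, say, rounding shrinks the rectangle to 90% of its expected area, approx is about 1.11 and the threshold is lowered accordingly. (Note that the guard before the division tests farea twice; the divisor is sarea.)
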
%d\n", channel); + printf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", + x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, + x + area.x, y + area.w); + printf("at point %d %d with offset %d\n", x, y, 0); +#endif + + const int* curr = hogluv + ((channel * 121) + y) * pitch; int a = curr[area.y * pitch + x + area.x]; int b = curr[area.y * pitch + x + area.z]; int c = curr[area.w * pitch + x + area.z]; int d = curr[area.w * pitch + x + area.x]; +#if defined LOG_CUDA_CASCADE + printf(" retruved integral values: %d %d %d %d\n", a, b, c, d); +#endif + return (a - b + c - d); } -void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int pitch, +void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const { const icf::Level* lls = (const icf::Level*)levels.ptr(); - Level level = lls[0]; const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x * blockDim.x + threadIdx.x; + // if (x > 0 || y > 0) return; + Level level = lls[0]; if (x >= level.workRect.x || y >= level.workRect.y) return; +#if defined LOG_CUDA_CASCADE + printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, + level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); +#endif + const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; + // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages, + // octave.shrinkage, octave.size.x, octave.size.y, octave.scale); + const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages; float detectionScore = 0.f; @@ -156,11 +196,17 @@ void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int for(; st < stEnd; ++st) { const float stage = stages(0, st); +#if defined LOG_CUDA_CASCADE + printf("Stage: %f\n", stage); +#endif { const int nId = st * 3; // work with root node const Node node = ((const Node*)nodes.ptr())[nId]; +#if defined LOG_CUDA_CASCADE + printf("Node: %d %f\n", node.feature, node.threshold); +#endif const Feature feature = ((const Feature*)features.ptr())[node.feature]; uchar4 scaledRect = feature.rect; @@ -168,31 +214,46 @@ void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); +#if defined LOG_CUDA_CASCADE + printf("root feature %d %f\n",feature.channel, sum); +#endif int next = 1 + (int)(sum >= threshold); +#if defined LOG_CUDA_CASCADE + printf("go: %d (%f >= %f)\n\n" ,next, sum, threshold); +#endif // leaves const Node leaf = ((const Node*)nodes.ptr())[nId + next]; const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; scaledRect = fLeaf.rect; - threshold = rescale(level, scaledRect, feature.channel, node.threshold); + threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold); sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves(0, (st * 4) + lShift); detectionScore += impact; + +#if defined LOG_CUDA_CASCADE + printf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); + printf("extracted stage:\n"); + printf("ct %f\n", stage); + printf("computed score %f\n\n", detectionScore); + printf("\n\n"); +#endif + } if (detectionScore <= stage) break; } - // if (!threadIdx.x && !threadIdx.y)// printf("%f %d\n", detectionScore, st); - // printf("x %d y %d: %d\n", x, y, st); +#if 
defined LOG_CUDA_CASCADE + // printf("x %d y %d: %d\n", x, y, st - stBegin); +#endif if (st == stEnd) { - // printf(" got %d\n", st); uchar4 a; a.x = level.workRect.x; a.y = level.workRect.y; @@ -200,18 +261,14 @@ void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int } } -void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz objects, - cudaStream_t stream) const +void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz objects, cudaStream_t stream) const { - // detection kernel dim3 block(32, 8, 1); - // dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1); dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1); - device::detect<<>>(*this, hogluv, hogluv.step / sizeof(ushort), objects); + device::detect<<>>(*this, hogluv, hogluv.step / sizeof(int), objects); cudaSafeCall( cudaGetLastError() ); if (!stream) cudaSafeCall( cudaDeviceSynchronize() ); - } }} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index a8ce8d483..7d4b65980 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -102,8 +102,8 @@ struct Cascade const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} - void detect(const cv::gpu::PtrStepSzb& hogluv, cv::gpu::PtrStepSz objects, cudaStream_t stream) const; - void __device detectAt(const uchar* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const; + void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz objects, cudaStream_t stream) const; + void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const; float __device rescale(const icf::Level& level, uchar4& scaledRect, const int channel, const float threshold) const; diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index abcae73dc..7e1a5abb9 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -381,6 +381,9 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector> channel; + GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121)); + gchannel.upload(channel); + } +#else GpuMat& dmem = flds.dmem; cudaMemset(dmem.data, 0, dmem.step * dmem.rows); GpuMat& shrunk = flds.shrunk; int w = shrunk.cols; int h = colored.rows / flds.storage.shrinkage; - cudaStream_t stream = StreamAccessor::getStream(s); - std::vector splited; for(int i = 0; i < 3; ++i) { @@ -468,9 +492,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS)); cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); - // cv::Mat cpu(plane); - // cv::imshow("channels", cpu); - // cv::waitKey(0); // fer debug purpose // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); @@ -482,6 +503,9 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); } +#endif + + cudaStream_t stream = StreamAccessor::getStream(s); // detection flds.detect(objects, stream); From 4881205baee2807aeb9fe5e70551290474e80671 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Tue, 25 Sep 2012 22:43:43 +0400 Subject: [PATCH 020/155] refactor logs --- modules/gpu/src/cuda/isf-sc.cu | 82 ++++++++++++++-------------------- 1 file 
changed, 34 insertions(+), 48 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 6572c54fc..ccc1ddf30 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -47,6 +47,13 @@ //#define LOG_CUDA_CASCADE +#if defined LOG_CUDA_CASCADE +# define dprintf(format, ...) \ + do { printf(format, __VA_ARGS__); } while (0) +#else +# define dprintf(format, ...) +#endif + namespace cv { namespace gpu { namespace device { namespace icf { @@ -98,10 +105,8 @@ __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restric float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, const int channel, const float threshold) const { -#if defined LOG_CUDA_CASCADE - printf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w); - printf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]); -#endif + dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w); + dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]); float relScale = level.relScale; float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); @@ -122,19 +127,15 @@ float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect approx = expected_new_area / sarea; } -#if defined LOG_CUDA_CASCADE - printf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel, + dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); -#endif // compensation areas rounding float rootThreshold = threshold / approx; // printf(" approx %f\n", rootThreshold); rootThreshold *= level.scaling[(int)(channel > 6)]; -#if defined LOG_CUDA_CASCADE - printf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]); -#endif + dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]); return rootThreshold; } @@ -143,14 +144,12 @@ typedef unsigned char uchar; float __device get(const int* __restrict__ hogluv, const int pitch, const int x, const int y, int channel, uchar4 area) { -#if defined LOG_CUDA_CASCADE - printf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); - printf("get for channel %d\n", channel); - printf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", + dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); + dprintf("get for channel %d\n", channel); + dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, x + area.x, y + area.w); - printf("at point %d %d with offset %d\n", x, y, 0); -#endif + dprintf("at point %d %d with offset %d\n", x, y, 0); const int* curr = hogluv + ((channel * 121) + y) * pitch; @@ -159,9 +158,7 @@ float __device get(const int* __restrict__ hogluv, const int pitch, int c = curr[area.w * pitch + x + area.z]; int d = curr[area.w * pitch + x + area.x]; -#if defined LOG_CUDA_CASCADE - printf(" retruved integral values: %d %d %d %d\n", a, b, c, d); -#endif + dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); return (a - b + c - d); } @@ -176,13 +173,11 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p const int x = blockIdx.x * blockDim.x + threadIdx.x; // if (x > 0 || y > 0) return; - Level level = lls[0]; + 
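
The do { ... } while (0) wrapper in the dprintf macro above is the usual trick to make a multi-statement macro behave as a single statement, so it composes safely with unbraced conditionals:

    if (verbose)
        dprintf("score %f\n", score);   // still one statement after expansion

One caveat worth knowing (an observation about the macro as written, not a change made by the patch): with a strict C++ preprocessor, a call with no variadic arguments, e.g. dprintf("done\n"), expands to printf("done\n", ) and fails to compile, so every call needs at least one extra argument unless the GNU ##__VA_ARGS__ extension is available.
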
Level level = lls[blockIdx.z]; if (x >= level.workRect.x || y >= level.workRect.y) return; -#if defined LOG_CUDA_CASCADE - printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, + dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); -#endif const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages, @@ -196,17 +191,15 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p for(; st < stEnd; ++st) { const float stage = stages(0, st); -#if defined LOG_CUDA_CASCADE - printf("Stage: %f\n", stage); -#endif + dprintf("Stage: %f\n", stage); { const int nId = st * 3; // work with root node const Node node = ((const Node*)nodes.ptr())[nId]; -#if defined LOG_CUDA_CASCADE - printf("Node: %d %f\n", node.feature, node.threshold); -#endif + + dprintf("Node: %d %f\n", node.feature, node.threshold); + const Feature feature = ((const Feature*)features.ptr())[node.feature]; uchar4 scaledRect = feature.rect; @@ -214,14 +207,12 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); -#if defined LOG_CUDA_CASCADE - printf("root feature %d %f\n",feature.channel, sum); -#endif + dprintf("root feature %d %f\n",feature.channel, sum); + int next = 1 + (int)(sum >= threshold); -#if defined LOG_CUDA_CASCADE - printf("go: %d (%f >= %f)\n\n" ,next, sum, threshold); -#endif + dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold); + // leaves const Node leaf = ((const Node*)nodes.ptr())[nId + next]; const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; @@ -235,22 +226,17 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p detectionScore += impact; -#if defined LOG_CUDA_CASCADE - printf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); - printf("extracted stage:\n"); - printf("ct %f\n", stage); - printf("computed score %f\n\n", detectionScore); - printf("\n\n"); -#endif - + dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); + dprintf("extracted stage:\n"); + dprintf("ct %f\n", stage); + dprintf("computed score %f\n\n", detectionScore); + dprintf("\n\n"); } - if (detectionScore <= stage) break; + if (detectionScore <= stage || st - stBegin == 100) break; } -#if defined LOG_CUDA_CASCADE - // printf("x %d y %d: %d\n", x, y, st - stBegin); -#endif + dprintf("x %d y %d: %d\n", x, y, st - stBegin); if (st == stEnd) { @@ -264,7 +250,7 @@ void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int p void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz objects, cudaStream_t stream) const { dim3 block(32, 8, 1); - dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1); + dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47); device::detect<<>>(*this, hogluv, hogluv.step / sizeof(int), objects); cudaSafeCall( cudaGetLastError() ); if (!stream) From 1917366528d0703b7646de10931a01490bb32b4f Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 26 Sep 2012 11:18:09 +0400 Subject: [PATCH 021/155] empty cascade --- modules/gpu/src/cuda/isf-sc.cu | 326 +++++++-------- modules/gpu/src/icf.hpp | 192 ++++----- modules/gpu/src/softcascade.cpp | 678 
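
Two structural changes land in this patch: each z-slice of the 47-deep launch grid now evaluates its own pyramid level (lls[blockIdx.z] instead of lls[0]), and the per-pixel stage loop gains a debug cap of 100 stages. The overall per-window control flow, as a host-side sketch (weakResponse and stageThreshold are hypothetical stand-ins for the node/leaf evaluation and the stages(0, st) lookup):

float weakResponse(int st);     // hypothetical: response of stage st's depth-2 tree
float stageThreshold(int st);   // hypothetical: stages(0, st)

bool evaluateWindow(int stBegin, int stEnd)
{
    float detectionScore = 0.f;
    int st = stBegin;
    for (; st < stEnd; ++st)
    {
        detectionScore += weakResponse(st);
        if (detectionScore <= stageThreshold(st) || st - stBegin == 100)
            break;                       // soft-cascade early rejection
    }
    return st == stEnd;                  // survived every stage
}
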
++++++++++++++++---------------- 3 files changed, 596 insertions(+), 600 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index ccc1ddf30..33b2222c7 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -40,221 +40,221 @@ // //M*/ -#include -#include -#include -#include +// #include +// #include +// #include +// #include -//#define LOG_CUDA_CASCADE +// //#define LOG_CUDA_CASCADE -#if defined LOG_CUDA_CASCADE -# define dprintf(format, ...) \ - do { printf(format, __VA_ARGS__); } while (0) -#else -# define dprintf(format, ...) -#endif +// #if defined LOG_CUDA_CASCADE +// # define dprintf(format, ...) \ +// do { printf(format, __VA_ARGS__); } while (0) +// #else +// # define dprintf(format, ...) +// #endif -namespace cv { namespace gpu { namespace device { +// namespace cv { namespace gpu { namespace device { -namespace icf { +// namespace icf { - enum { - HOG_BINS = 6, - HOG_LUV_BINS = 10, - WIDTH = 640, - HEIGHT = 480, - GREY_OFFSET = HEIGHT * HOG_LUV_BINS - }; +// enum { +// HOG_BINS = 6, +// HOG_LUV_BINS = 10, +// WIDTH = 640, +// HEIGHT = 480, +// GREY_OFFSET = HEIGHT * HOG_LUV_BINS +// }; - __global__ void magToHist(const uchar* __restrict__ mag, - const float* __restrict__ angle, const int angPitch, - uchar* __restrict__ hog, const int hogPitch) - { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; +// __global__ void magToHist(const uchar* __restrict__ mag, +// const float* __restrict__ angle, const int angPitch, +// uchar* __restrict__ hog, const int hogPitch) +// { +// const int y = blockIdx.y * blockDim.y + threadIdx.y; +// const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int bin = (int)(angle[y * angPitch + x]); - const uchar val = mag[y * angPitch + x]; +// const int bin = (int)(angle[y * angPitch + x]); +// const uchar val = mag[y * angPitch + x]; - hog[((HEIGHT * bin) + y) * hogPitch + x] = val; - } +// hog[((HEIGHT * bin) + y) * hogPitch + x] = val; +// } - void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle) - { - const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS); - uchar* hog = (uchar*)hogluv.ptr(); - const float* angle = (const float*)nangle.ptr(); +// void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle) +// { +// const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS); +// uchar* hog = (uchar*)hogluv.ptr(); +// const float* angle = (const float*)nangle.ptr(); - dim3 block(32, 8); - dim3 grid(WIDTH / 32, HEIGHT / 8); +// dim3 block(32, 8); +// dim3 grid(WIDTH / 32, HEIGHT / 8); - magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step); - cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - } -} +// magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step); +// cudaSafeCall( cudaGetLastError() ); +// cudaSafeCall( cudaDeviceSynchronize() ); +// } +// } -__global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch, - PtrStepSz objects) -{ - cascade.detectAt(hogluv, pitch, objects); -} +// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch, +// PtrStepSz objects) +// { +// cascade.detectAt(hogluv, pitch, objects); +// } -} +// } -float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, - const int channel, const float threshold) const -{ - dprintf("feature %d box %d %d %d %d\n", 
channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w); - dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]); +// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, +// const int channel, const float threshold) const +// { +// dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w); +// dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]); - float relScale = level.relScale; - float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); +// float relScale = level.relScale; +// float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - // rescale - scaledRect.x = __float2int_rn(relScale * scaledRect.x); - scaledRect.y = __float2int_rn(relScale * scaledRect.y); - scaledRect.z = __float2int_rn(relScale * scaledRect.z); - scaledRect.w = __float2int_rn(relScale * scaledRect.w); +// // rescale +// scaledRect.x = __float2int_rn(relScale * scaledRect.x); +// scaledRect.y = __float2int_rn(relScale * scaledRect.y); +// scaledRect.z = __float2int_rn(relScale * scaledRect.z); +// scaledRect.w = __float2int_rn(relScale * scaledRect.w); - float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); +// float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - float approx = 1.f; - if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) - { - const float expected_new_area = farea * relScale * relScale; - approx = expected_new_area / sarea; - } +// float approx = 1.f; +// if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) +// { +// const float expected_new_area = farea * relScale * relScale; +// approx = expected_new_area / sarea; +// } - dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel, - scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); +// dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel, +// scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - // compensation areas rounding - float rootThreshold = threshold / approx; - // printf(" approx %f\n", rootThreshold); - rootThreshold *= level.scaling[(int)(channel > 6)]; +// // compensation areas rounding +// float rootThreshold = threshold / approx; +// // printf(" approx %f\n", rootThreshold); +// rootThreshold *= level.scaling[(int)(channel > 6)]; - dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]); +// dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]); - return rootThreshold; -} +// return rootThreshold; +// } -typedef unsigned char uchar; -float __device get(const int* __restrict__ hogluv, const int pitch, - const int x, const int y, int channel, uchar4 area) -{ - dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); - dprintf("get for channel %d\n", channel); - dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", - x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, - x + area.x, y + area.w); - dprintf("at point %d %d with offset %d\n", x, y, 0); +// typedef unsigned char uchar; +// float __device get(const int* __restrict__ hogluv, const int pitch, +// const int x, const int y, int channel, uchar4 area) +// { +// dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); +// dprintf("get for channel 
%d\n", channel); +// dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", +// x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, +// x + area.x, y + area.w); +// dprintf("at point %d %d with offset %d\n", x, y, 0); - const int* curr = hogluv + ((channel * 121) + y) * pitch; +// const int* curr = hogluv + ((channel * 121) + y) * pitch; - int a = curr[area.y * pitch + x + area.x]; - int b = curr[area.y * pitch + x + area.z]; - int c = curr[area.w * pitch + x + area.z]; - int d = curr[area.w * pitch + x + area.x]; +// int a = curr[area.y * pitch + x + area.x]; +// int b = curr[area.y * pitch + x + area.z]; +// int c = curr[area.w * pitch + x + area.z]; +// int d = curr[area.w * pitch + x + area.x]; - dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); +// dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); - return (a - b + c - d); -} +// return (a - b + c - d); +// } -void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch, - PtrStepSz& objects) const -{ - const icf::Level* lls = (const icf::Level*)levels.ptr(); +// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch, +// PtrStepSz& objects) const +// { +// const icf::Level* lls = (const icf::Level*)levels.ptr(); - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; - // if (x > 0 || y > 0) return; +// const int y = blockIdx.y * blockDim.y + threadIdx.y; +// const int x = blockIdx.x * blockDim.x + threadIdx.x; +// // if (x > 0 || y > 0) return; - Level level = lls[blockIdx.z]; - if (x >= level.workRect.x || y >= level.workRect.y) return; +// Level level = lls[blockIdx.z]; +// if (x >= level.workRect.x || y >= level.workRect.y) return; - dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, - level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); +// dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, +// level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); - const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; - // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages, - // octave.shrinkage, octave.size.x, octave.size.y, octave.scale); +// const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; +// // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages, +// // octave.shrinkage, octave.size.x, octave.size.y, octave.scale); - const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages; +// const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages; - float detectionScore = 0.f; +// float detectionScore = 0.f; - int st = stBegin; - for(; st < stEnd; ++st) - { - const float stage = stages(0, st); - dprintf("Stage: %f\n", stage); - { - const int nId = st * 3; +// int st = stBegin; +// for(; st < stEnd; ++st) +// { +// const float stage = stages(0, st); +// dprintf("Stage: %f\n", stage); +// { +// const int nId = st * 3; - // work with root node - const Node node = ((const Node*)nodes.ptr())[nId]; +// // work with root node +// const Node node = ((const Node*)nodes.ptr())[nId]; - dprintf("Node: %d %f\n", node.feature, node.threshold); +// dprintf("Node: %d %f\n", node.feature, node.threshold); - const Feature feature = ((const Feature*)features.ptr())[node.feature]; +// const 
Feature feature = ((const Feature*)features.ptr())[node.feature]; - uchar4 scaledRect = feature.rect; - float threshold = rescale(level, scaledRect, feature.channel, node.threshold); +// uchar4 scaledRect = feature.rect; +// float threshold = rescale(level, scaledRect, feature.channel, node.threshold); - float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); +// float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); - dprintf("root feature %d %f\n",feature.channel, sum); +// dprintf("root feature %d %f\n",feature.channel, sum); - int next = 1 + (int)(sum >= threshold); +// int next = 1 + (int)(sum >= threshold); - dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold); +// dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold); - // leaves - const Node leaf = ((const Node*)nodes.ptr())[nId + next]; - const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; +// // leaves +// const Node leaf = ((const Node*)nodes.ptr())[nId + next]; +// const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; - scaledRect = fLeaf.rect; - threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold); - sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect); +// scaledRect = fLeaf.rect; +// threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold); +// sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect); - const int lShift = (next - 1) * 2 + (int)(sum >= threshold); - float impact = leaves(0, (st * 4) + lShift); +// const int lShift = (next - 1) * 2 + (int)(sum >= threshold); +// float impact = leaves(0, (st * 4) + lShift); - detectionScore += impact; +// detectionScore += impact; - dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); - dprintf("extracted stage:\n"); - dprintf("ct %f\n", stage); - dprintf("computed score %f\n\n", detectionScore); - dprintf("\n\n"); - } +// dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); +// dprintf("extracted stage:\n"); +// dprintf("ct %f\n", stage); +// dprintf("computed score %f\n\n", detectionScore); +// dprintf("\n\n"); +// } - if (detectionScore <= stage || st - stBegin == 100) break; - } +// if (detectionScore <= stage || st - stBegin == 100) break; +// } - dprintf("x %d y %d: %d\n", x, y, st - stBegin); +// dprintf("x %d y %d: %d\n", x, y, st - stBegin); - if (st == stEnd) - { - uchar4 a; - a.x = level.workRect.x; - a.y = level.workRect.y; - objects(0, threadIdx.x) = a; - } -} +// if (st == stEnd) +// { +// uchar4 a; +// a.x = level.workRect.x; +// a.y = level.workRect.y; +// objects(0, threadIdx.x) = a; +// } +// } -void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz objects, cudaStream_t stream) const -{ - dim3 block(32, 8, 1); - dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47); - device::detect<<>>(*this, hogluv, hogluv.step / sizeof(int), objects); - cudaSafeCall( cudaGetLastError() ); - if (!stream) - cudaSafeCall( cudaDeviceSynchronize() ); -} +// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz objects, cudaStream_t stream) const +// { +// dim3 block(32, 8, 1); +// dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47); +// device::detect<<>>(*this, hogluv, hogluv.step / sizeof(int), objects); +// cudaSafeCall( cudaGetLastError() ); +// if (!stream) +// cudaSafeCall( cudaDeviceSynchronize() ); +// } -}} \ No newline at end of file +// }} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp 
b/modules/gpu/src/icf.hpp index 7d4b65980..cf1348007 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -40,127 +40,127 @@ // //M*/ -#include +// #include -#ifndef __OPENCV_ICF_HPP__ -#define __OPENCV_ICF_HPP__ +// #ifndef __OPENCV_ICF_HPP__ +// #define __OPENCV_ICF_HPP__ -#if defined __CUDACC__ -# define __device __device__ __forceinline__ -#else -# define __device -#endif +// #if defined __CUDACC__ +// # define __device __device__ __forceinline__ +// #else +// # define __device +// #endif -namespace cv { namespace gpu { namespace icf { +// namespace cv { namespace gpu { namespace icf { -using cv::gpu::PtrStepSzb; -using cv::gpu::PtrStepSzf; +// using cv::gpu::PtrStepSzb; +// using cv::gpu::PtrStepSzf; -typedef unsigned char uchar; +// typedef unsigned char uchar; -struct __align__(16) Octave -{ - ushort index; - ushort stages; - ushort shrinkage; - ushort2 size; - float scale; +// struct __align__(16) Octave +// { +// ushort index; +// ushort stages; +// ushort shrinkage; +// ushort2 size; +// float scale; - Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) - : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} -}; +// Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) +// : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} +// }; -struct __align__(8) Level //is actually 24 bytes -{ - int octave; +// struct __align__(8) Level //is actually 24 bytes +// { +// int octave; - // float origScale; //not actually used - float relScale; - float shrScale; // used for marking detection - float scaling[2]; // calculated according to Dollal paper +// // float origScale; //not actually used +// float relScale; +// float shrScale; // used for marking detection +// float scaling[2]; // calculated according to Dollal paper - // for 640x480 we can not get overflow - uchar2 workRect; - uchar2 objSize; +// // for 640x480 we can not get overflow +// uchar2 workRect; +// uchar2 objSize; - Level(int idx, const Octave& oct, const float scale, const int w, const int h) - : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) - { - workRect.x = round(w / (float)oct.shrinkage); - workRect.y = round(h / (float)oct.shrinkage); +// Level(int idx, const Octave& oct, const float scale, const int w, const int h) +// : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) +// { +// workRect.x = round(w / (float)oct.shrinkage); +// workRect.y = round(h / (float)oct.shrinkage); - objSize.x = round(oct.size.x * relScale); - objSize.y = round(oct.size.y * relScale); - } -}; +// objSize.x = round(oct.size.x * relScale); +// objSize.y = round(oct.size.y * relScale); +// } +// }; -struct Cascade -{ - Cascade() {} - Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds, - const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) - : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} +// struct Cascade +// { +// Cascade() {} +// Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds, +// const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) +// : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} - void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz objects, cudaStream_t stream) const; - void __device 
detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const; - float __device rescale(const icf::Level& level, uchar4& scaledRect, - const int channel, const float threshold) const; +// void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz objects, cudaStream_t stream) const; +// void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const; +// float __device rescale(const icf::Level& level, uchar4& scaledRect, +// const int channel, const float threshold) const; - PtrStepSzb octaves; - PtrStepSzf stages; - PtrStepSzb nodes; - PtrStepSzf leaves; - PtrStepSzb features; +// PtrStepSzb octaves; +// PtrStepSzf stages; +// PtrStepSzb nodes; +// PtrStepSzf leaves; +// PtrStepSzb features; - PtrStepSzb levels; +// PtrStepSzb levels; -}; +// }; -struct ChannelStorage -{ - ChannelStorage(){} - ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr, - const cv::gpu::PtrStepSzb& itg, const int s) - : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {} +// struct ChannelStorage +// { +// ChannelStorage(){} +// ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr, +// const cv::gpu::PtrStepSzb& itg, const int s) +// : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {} - void frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream){} +// void frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream){} - PtrStepSzb dmem; - PtrStepSzb shrunk; - PtrStepSzb hogluv; +// PtrStepSzb dmem; +// PtrStepSzb shrunk; +// PtrStepSzb hogluv; - enum - { - FRAME_WIDTH = 640, - FRAME_HEIGHT = 480, - TOTAL_SCALES = 55, - CLASSIFIERS = 5, - ORIG_OBJECT_WIDTH = 64, - ORIG_OBJECT_HEIGHT = 128, - HOG_BINS = 6, - HOG_LUV_BINS = 10 - }; +// enum +// { +// FRAME_WIDTH = 640, +// FRAME_HEIGHT = 480, +// TOTAL_SCALES = 55, +// CLASSIFIERS = 5, +// ORIG_OBJECT_WIDTH = 64, +// ORIG_OBJECT_HEIGHT = 128, +// HOG_BINS = 6, +// HOG_LUV_BINS = 10 +// }; - int shrinkage; - static const float magnitudeScaling = 1.f ;// / sqrt(2); -}; +// int shrinkage; +// static const float magnitudeScaling = 1.f ;// / sqrt(2); +// }; -struct __align__(8) Node -{ - int feature; - float threshold; +// struct __align__(8) Node +// { +// int feature; +// float threshold; - Node(const int f, const float t) : feature(f), threshold(t) {} -}; +// Node(const int f, const float t) : feature(f), threshold(t) {} +// }; -struct __align__(8) Feature -{ - int channel; - uchar4 rect; +// struct __align__(8) Feature +// { +// int channel; +// uchar4 rect; - Feature(const int c, const uchar4 r) : channel(c), rect(r) {} -}; -}}} +// Feature(const int c, const uchar4 r) : channel(c), rect(r) {} +// }; +// }}} -#endif \ No newline at end of file +// #endif \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 7e1a5abb9..c4334ca1d 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -41,361 +41,365 @@ //M*/ #include -#include "opencv2/highgui/highgui.hpp" +#include #if !defined (HAVE_CUDA) cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); } - cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); } - cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); } - -bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); } - +bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; } void 
cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu(); } #else -#include +// #include -namespace cv { namespace gpu { namespace device { -namespace icf { - void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle); -} -}}} +// namespace cv { namespace gpu { namespace device { +// namespace icf { +// void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle); +// } +// }}} + +// namespace { +// char *itoa(long i, char* s, int /*dummy_radix*/) +// { +// sprintf(s, "%ld", i); +// return s; +// } +// } struct cv::gpu::SoftCascade::Filds { - // scales range - float minScale; - float maxScale; +// // scales range +// float minScale; +// float maxScale; - int origObjWidth; - int origObjHeight; +// int origObjWidth; +// int origObjHeight; - GpuMat octaves; - GpuMat stages; - GpuMat nodes; - GpuMat leaves; - GpuMat features; - GpuMat levels; +// GpuMat octaves; +// GpuMat stages; +// GpuMat nodes; +// GpuMat leaves; +// GpuMat features; +// GpuMat levels; - // preallocated buffer 640x480x10 + 640x480 - GpuMat dmem; - // 160x120x10 - GpuMat shrunk; - // 161x121x10 - GpuMat hogluv; +// // preallocated buffer 640x480x10 + 640x480 +// GpuMat dmem; +// // 160x120x10 +// GpuMat shrunk; +// // 161x121x10 +// GpuMat hogluv; - // will be removed in final version - // temporial mat for cvtColor - GpuMat luv; +// // will be removed in final version +// // temporial mat for cvtColor +// GpuMat luv; - // temporial mat for integrall - GpuMat integralBuffer; +// // temporial mat for integrall +// GpuMat integralBuffer; - // temp matrix for sobel and cartToPolar - GpuMat dfdx, dfdy, angle, mag, nmag, nangle; +// // temp matrix for sobel and cartToPolar +// GpuMat dfdx, dfdy, angle, mag, nmag, nangle; - std::vector scales; +// std::vector scales; - icf::Cascade cascade; - icf::ChannelStorage storage; +// icf::Cascade cascade; +// icf::ChannelStorage storage; - enum { BOOST = 0 }; - enum - { - FRAME_WIDTH = 640, - FRAME_HEIGHT = 480, - TOTAL_SCALES = 55, - CLASSIFIERS = 5, - ORIG_OBJECT_WIDTH = 64, - ORIG_OBJECT_HEIGHT = 128, - HOG_BINS = 6, - HOG_LUV_BINS = 10 - }; +// enum { BOOST = 0 }; +// enum +// { +// FRAME_WIDTH = 640, +// FRAME_HEIGHT = 480, +// TOTAL_SCALES = 55, +// CLASSIFIERS = 5, +// ORIG_OBJECT_WIDTH = 64, +// ORIG_OBJECT_HEIGHT = 128, +// HOG_BINS = 6, +// HOG_LUV_BINS = 10 +// }; - bool fill(const FileNode &root, const float mins, const float maxs); - void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const - { - cascade.detect(hogluv, objects, stream); - } +// bool fill(const FileNode &root, const float mins, const float maxs); +// void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const +// { +// cascade.detect(hogluv, objects, stream); +// } -private: - void calcLevels(const std::vector& octs, - int frameW, int frameH, int nscales); +// private: +// void calcLevels(const std::vector& octs, +// int frameW, int frameH, int nscales); - typedef std::vector::const_iterator octIt_t; - int fitOctave(const std::vector& octs, const float& logFactor) const - { - float minAbsLog = FLT_MAX; - int res = 0; - for (int oct = 0; oct < (int)octs.size(); ++oct) - { - const icf::Octave& octave =octs[oct]; - float logOctave = ::log(octave.scale); - float logAbsScale = ::fabs(logFactor - logOctave); +// typedef std::vector::const_iterator octIt_t; +// int fitOctave(const std::vector& octs, const float& logFactor) const +// { +// float minAbsLog = FLT_MAX; +// int res = 0; +// for (int oct = 0; oct < 
(int)octs.size(); ++oct) +// { +// const icf::Octave& octave =octs[oct]; +// float logOctave = ::log(octave.scale); +// float logAbsScale = ::fabs(logFactor - logOctave); - if(logAbsScale < minAbsLog) - { - res = oct; - minAbsLog = logAbsScale; - } - } - return res; - } +// if(logAbsScale < minAbsLog) +// { +// res = oct; +// minAbsLog = logAbsScale; +// } +// } +// return res; +// } }; -inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) -{ - minScale = mins; - maxScale = maxs; +// inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) +// { +// minScale = mins; +// maxScale = maxs; - // cascade properties - static const char *const SC_STAGE_TYPE = "stageType"; - static const char *const SC_BOOST = "BOOST"; +// // cascade properties +// static const char *const SC_STAGE_TYPE = "stageType"; +// static const char *const SC_BOOST = "BOOST"; - static const char *const SC_FEATURE_TYPE = "featureType"; - static const char *const SC_ICF = "ICF"; +// static const char *const SC_FEATURE_TYPE = "featureType"; +// static const char *const SC_ICF = "ICF"; - static const char *const SC_ORIG_W = "width"; - static const char *const SC_ORIG_H = "height"; +// static const char *const SC_ORIG_W = "width"; +// static const char *const SC_ORIG_H = "height"; - static const char *const SC_OCTAVES = "octaves"; - static const char *const SC_STAGES = "stages"; - static const char *const SC_FEATURES = "features"; +// static const char *const SC_OCTAVES = "octaves"; +// static const char *const SC_STAGES = "stages"; +// static const char *const SC_FEATURES = "features"; - static const char *const SC_WEEK = "weakClassifiers"; - static const char *const SC_INTERNAL = "internalNodes"; - static const char *const SC_LEAF = "leafValues"; +// static const char *const SC_WEEK = "weakClassifiers"; +// static const char *const SC_INTERNAL = "internalNodes"; +// static const char *const SC_LEAF = "leafValues"; - static const char *const SC_OCT_SCALE = "scale"; - static const char *const SC_OCT_STAGES = "stageNum"; - static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor"; +// static const char *const SC_OCT_SCALE = "scale"; +// static const char *const SC_OCT_STAGES = "stageNum"; +// static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor"; - static const char *const SC_STAGE_THRESHOLD = "stageThreshold"; +// static const char *const SC_STAGE_THRESHOLD = "stageThreshold"; - static const char * const SC_F_CHANNEL = "channel"; - static const char * const SC_F_RECT = "rect"; +// static const char * const SC_F_CHANNEL = "channel"; +// static const char * const SC_F_RECT = "rect"; - // only Ada Boost supported - std::string stageTypeStr = (string)root[SC_STAGE_TYPE]; - CV_Assert(stageTypeStr == SC_BOOST); +// // only Ada Boost supported +// std::string stageTypeStr = (string)root[SC_STAGE_TYPE]; +// CV_Assert(stageTypeStr == SC_BOOST); - // only HOG-like integral channel features cupported - string featureTypeStr = (string)root[SC_FEATURE_TYPE]; - CV_Assert(featureTypeStr == SC_ICF); +// // only HOG-like integral channel features cupported +// string featureTypeStr = (string)root[SC_FEATURE_TYPE]; +// CV_Assert(featureTypeStr == SC_ICF); - origObjWidth = (int)root[SC_ORIG_W]; - CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH); +// origObjWidth = (int)root[SC_ORIG_W]; +// CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH); - origObjHeight = (int)root[SC_ORIG_H]; - CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT); +// origObjHeight = 
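
fitOctave above (kept commented out in this patch) assigns a level to the trained octave whose scale is nearest in log-space. A standalone sketch of the same search, with a plain scale list standing in for the icf::Octave array (assumes the list is non-empty):

#include <cfloat>
#include <cmath>
#include <vector>

int fitOctave(const std::vector<float>& octaveScales, float logFactor)
{
    float minAbsLog = FLT_MAX;
    int res = 0;
    for (int oct = 0; oct < (int)octaveScales.size(); ++oct)
    {
        const float logAbsScale = std::fabs(logFactor - std::log(octaveScales[oct]));
        if (logAbsScale < minAbsLog)   // nearest octave in log-scale space
        {
            res = oct;
            minAbsLog = logAbsScale;
        }
    }
    return res;
}
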
(int)root[SC_ORIG_H]; +// CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT); - FileNode fn = root[SC_OCTAVES]; - if (fn.empty()) return false; +// FileNode fn = root[SC_OCTAVES]; +// if (fn.empty()) return false; - std::vector voctaves; - std::vector vstages; - std::vector vnodes; - std::vector vleaves; - std::vector vfeatures; - scales.clear(); +// std::vector voctaves; +// std::vector vstages; +// std::vector vnodes; +// std::vector vleaves; +// std::vector vfeatures; +// scales.clear(); - // std::vector levels; +// // std::vector levels; - FileNodeIterator it = fn.begin(), it_end = fn.end(); - int feature_offset = 0; - ushort octIndex = 0; - ushort shrinkage = 1; +// FileNodeIterator it = fn.begin(), it_end = fn.end(); +// int feature_offset = 0; +// ushort octIndex = 0; +// ushort shrinkage = 1; - for (; it != it_end; ++it) - { - FileNode fns = *it; - float scale = (float)fns[SC_OCT_SCALE]; - scales.push_back(scale); - ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); - ushort2 size; - size.x = cvRound(ORIG_OBJECT_WIDTH * scale); - size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); - shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); +// for (; it != it_end; ++it) +// { +// FileNode fns = *it; +// float scale = (float)fns[SC_OCT_SCALE]; +// scales.push_back(scale); +// ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); +// ushort2 size; +// size.x = cvRound(ORIG_OBJECT_WIDTH * scale); +// size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); +// shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); - icf::Octave octave(octIndex, nstages, shrinkage, size, scale); - CV_Assert(octave.stages > 0); - voctaves.push_back(octave); +// icf::Octave octave(octIndex, nstages, shrinkage, size, scale); +// CV_Assert(octave.stages > 0); +// voctaves.push_back(octave); - FileNode ffs = fns[SC_FEATURES]; - if (ffs.empty()) return false; +// FileNode ffs = fns[SC_FEATURES]; +// if (ffs.empty()) return false; - fns = fns[SC_STAGES]; - if (fn.empty()) return false; +// fns = fns[SC_STAGES]; +// if (fn.empty()) return false; - // for each stage (~ decision tree with H = 2) - FileNodeIterator st = fns.begin(), st_end = fns.end(); - for (; st != st_end; ++st ) - { - fns = *st; - vstages.push_back((float)fns[SC_STAGE_THRESHOLD]); +// // for each stage (~ decision tree with H = 2) +// FileNodeIterator st = fns.begin(), st_end = fns.end(); +// for (; st != st_end; ++st ) +// { +// fns = *st; +// vstages.push_back((float)fns[SC_STAGE_THRESHOLD]); - fns = fns[SC_WEEK]; - FileNodeIterator ftr = fns.begin(), ft_end = fns.end(); - for (; ftr != ft_end; ++ftr) - { - fns = (*ftr)[SC_INTERNAL]; - FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); - for (; inIt != inIt_end;) - { - int feature = (int)(*(inIt +=2)++) + feature_offset; - float th = (float)(*(inIt++)); - vnodes.push_back(icf::Node(feature, th)); - } +// fns = fns[SC_WEEK]; +// FileNodeIterator ftr = fns.begin(), ft_end = fns.end(); +// for (; ftr != ft_end; ++ftr) +// { +// fns = (*ftr)[SC_INTERNAL]; +// FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); +// for (; inIt != inIt_end;) +// { +// int feature = (int)(*(inIt +=2)++) + feature_offset; +// float th = (float)(*(inIt++)); +// vnodes.push_back(icf::Node(feature, th)); +// } - fns = (*ftr)[SC_LEAF]; - inIt = fns.begin(), inIt_end = fns.end(); - for (; inIt != inIt_end; ++inIt) - vleaves.push_back((float)(*inIt)); - } - } +// fns = (*ftr)[SC_LEAF]; +// inIt = fns.begin(), inIt_end = fns.end(); +// for (; inIt != inIt_end; ++inIt) +// vleaves.push_back((float)(*inIt)); +// } +// } - st 
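
The parsing loop above, read together with detectAt() from the earlier patches, fixes the in-memory layout of the cascade: every stage is one depth-2 decision tree stored as 3 consecutive Nodes and 4 consecutive leaf values, which is why detectAt uses nId = st * 3, indexes the leaf row with st * 4, and fill() advances feature_offset by octave.stages * 3. Schematically:

//                nodes[st*3 + 0]              (root)
//               /               \
//   nodes[st*3 + 1]           nodes[st*3 + 2]
//    /         \                /         \
// leaf[st*4+0] leaf[st*4+1] leaf[st*4+2] leaf[st*4+3]
//
// next   = 1 + (rootSum >= rootThreshold)      selects the child node (1 or 2)
// lShift = (next - 1) * 2 + (sum >= threshold) selects one of the four leaves
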
= ffs.begin(), st_end = ffs.end(); - for (; st != st_end; ++st ) - { - cv::FileNode rn = (*st)[SC_F_RECT]; - cv::FileNodeIterator r_it = rn.begin(); - uchar4 rect; - rect.x = saturate_cast((int)*(r_it++)); - rect.y = saturate_cast((int)*(r_it++)); - rect.z = saturate_cast((int)*(r_it++)); - rect.w = saturate_cast((int)*(r_it++)); - vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect)); - } +// st = ffs.begin(), st_end = ffs.end(); +// for (; st != st_end; ++st ) +// { +// cv::FileNode rn = (*st)[SC_F_RECT]; +// cv::FileNodeIterator r_it = rn.begin(); +// uchar4 rect; +// rect.x = saturate_cast((int)*(r_it++)); +// rect.y = saturate_cast((int)*(r_it++)); +// rect.z = saturate_cast((int)*(r_it++)); +// rect.w = saturate_cast((int)*(r_it++)); +// vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect)); +// } - feature_offset += octave.stages * 3; - ++octIndex; - } +// feature_offset += octave.stages * 3; +// ++octIndex; +// } - // upload in gpu memory - octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) )); - CV_Assert(!octaves.empty()); +// // upload in gpu memory +// octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) )); +// CV_Assert(!octaves.empty()); - stages.upload(cv::Mat(vstages).reshape(1,1)); - CV_Assert(!stages.empty()); +// stages.upload(cv::Mat(vstages).reshape(1,1)); +// CV_Assert(!stages.empty()); - nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) )); - CV_Assert(!nodes.empty()); +// nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) )); +// CV_Assert(!nodes.empty()); - leaves.upload(cv::Mat(vleaves).reshape(1,1)); - CV_Assert(!leaves.empty()); +// leaves.upload(cv::Mat(vleaves).reshape(1,1)); +// CV_Assert(!leaves.empty()); - features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) )); - CV_Assert(!features.empty()); +// features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) )); +// CV_Assert(!features.empty()); - // compute levels - calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); - CV_Assert(!levels.empty()); +// // compute levels +// calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); +// CV_Assert(!levels.empty()); - //init Cascade - cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels); +// //init Cascade +// cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels); - // allocate buffers - dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); - shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); - // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1); - hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1); - luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); - integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); +// // allocate buffers +// dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); +// shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); +// // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1); +// hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1); +// luv.create(FRAME_HEIGHT, FRAME_WIDTH, 
CV_8UC3); +// integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); - dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); - dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); - angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); - mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); +// dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); +// dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); +// angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); +// mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); - nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); - nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); +// nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); +// nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); - storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage); - return true; -} +// storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage); +// return true; +// } -namespace { - struct CascadeIntrinsics - { - static const float lambda = 1.099f, a = 0.89f; +// namespace { +// struct CascadeIntrinsics +// { +// static const float lambda = 1.099f, a = 0.89f; - static float getFor(int channel, float scaling) - { - CV_Assert(channel < 10); +// static float getFor(int channel, float scaling) +// { +// CV_Assert(channel < 10); - if (fabs(scaling - 1.f) < FLT_EPSILON) - return 1.f; +// if (fabs(scaling - 1.f) < FLT_EPSILON) +// return 1.f; - // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers - static const float A[2][2] = - { //channel <= 6, otherwise - { 0.89f, 1.f}, // down - { 1.00f, 1.f} // up - }; +// // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers +// static const float A[2][2] = +// { //channel <= 6, otherwise +// { 0.89f, 1.f}, // down +// { 1.00f, 1.f} // up +// }; - static const float B[2][2] = - { //channel <= 6, otherwise - { 1.099f / log(2), 2.f}, // down - { 0.f, 2.f} // up - }; +// static const float B[2][2] = +// { //channel <= 6, otherwise +// { 1.099f / log(2), 2.f}, // down +// { 0.f, 2.f} // up +// }; - float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; - float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; +// float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; +// float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; - // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); - return a * pow(scaling, b); - } - }; -} +// // printf("!!! 
scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); +// return a * pow(scaling, b); +// } +// }; +// } -inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector& octs, - int frameW, int frameH, int nscales) -{ - CV_Assert(nscales > 1); +// inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector& octs, +// int frameW, int frameH, int nscales) +// { +// CV_Assert(nscales > 1); - std::vector vlevels; - float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1); +// std::vector vlevels; +// float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1); - float scale = minScale; - for (int sc = 0; sc < nscales; ++sc) - { - int width = ::std::max(0.0f, frameW - (origObjWidth * scale)); - int height = ::std::max(0.0f, frameH - (origObjHeight * scale)); +// float scale = minScale; +// for (int sc = 0; sc < nscales; ++sc) +// { +// int width = ::std::max(0.0f, frameW - (origObjWidth * scale)); +// int height = ::std::max(0.0f, frameH - (origObjHeight * scale)); - float logScale = ::log(scale); - int fit = fitOctave(octs, logScale); +// float logScale = ::log(scale); +// int fit = fitOctave(octs, logScale); - icf::Level level(fit, octs[fit], scale, width, height); - level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale); - level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale); +// icf::Level level(fit, octs[fit], scale, width, height); +// level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale); +// level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale); - if (!width || !height) - break; - else - vlevels.push_back(level); +// if (!width || !height) +// break; +// else +// vlevels.push_back(level); - if (::fabs(scale - maxScale) < FLT_EPSILON) break; - scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor)); +// if (::fabs(scale - maxScale) < FLT_EPSILON) break; +// scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor)); - // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, - // level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); +// // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, +// // level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); - // std::cout << "level " << sc - // << " octeve " - // << vlevels[sc].octave - // << " relScale " - // << vlevels[sc].relScale - // << " " << vlevels[sc].shrScale - // << " [" << (int)vlevels[sc].objSize.x - // << " " << (int)vlevels[sc].objSize.y << "] [" - // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; - } - levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) )); -} +// // std::cout << "level " << sc +// // << " octeve " +// // << vlevels[sc].octave +// // << " relScale " +// // << vlevels[sc].relScale +// // << " " << vlevels[sc].shrScale +// // << " [" << (int)vlevels[sc].objSize.x +// // << " " << (int)vlevels[sc].objSize.y << "] [" +// // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; +// } +// levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) )); +// } cv::gpu::SoftCascade::SoftCascade() : filds(0) {} @@ -419,97 +423,89 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c if (!fs.isOpened()) return false; filds = new Filds; - Filds& flds = 
*filds; - if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false; +// Filds& flds = *filds; +// if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false; return true; } -namespace { - char *itoa(long i, char* s, int /*dummy_radix*/) - { - sprintf(s, "%ld", i); - return s; - } -} - -#define USE_REFERENCE_VALUES +// #define USE_REFERENCE_VALUES void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/, GpuMat& objects, const int /*rejectfactor*/, Stream s) { - // only color images are supperted - CV_Assert(colored.type() == CV_8UC3); +// // only color images are supperted +// CV_Assert(colored.type() == CV_8UC3); - // // only this window size allowed - CV_Assert(colored.cols == 640 && colored.rows == 480); +// // // only this window size allowed +// CV_Assert(colored.cols == 640 && colored.rows == 480); - Filds& flds = *filds; +// Filds& flds = *filds; -#if defined USE_REFERENCE_VALUES - cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); - cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ); - char buff[33]; +// #if defined USE_REFERENCE_VALUES +// cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); +// cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ); +// char buff[33]; - for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) - { - cv::Mat channel; - imgs[std::string("channel") + itoa(i, buff, 10)] >> channel; - GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121)); - gchannel.upload(channel); - } -#else - GpuMat& dmem = flds.dmem; - cudaMemset(dmem.data, 0, dmem.step * dmem.rows); - GpuMat& shrunk = flds.shrunk; - int w = shrunk.cols; - int h = colored.rows / flds.storage.shrinkage; +// for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) +// { +// cv::Mat channel; +// imgs[std::string("channel") + itoa(i, buff, 10)] >> channel; +// GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121)); +// gchannel.upload(channel); +// } +// #else +// GpuMat& dmem = flds.dmem; +// cudaMemset(dmem.data, 0, dmem.step * dmem.rows); +// GpuMat& shrunk = flds.shrunk; +// int w = shrunk.cols; +// int h = colored.rows / flds.storage.shrinkage; - std::vector splited; - for(int i = 0; i < 3; ++i) - { - splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows))); - } +// std::vector splited; +// for(int i = 0; i < 3; ++i) +// { +// splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows))); +// } - GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) ); +// GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) ); - cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY); +// cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY); - //create hog - cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25); - cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25); +// //create hog +// cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25); +// cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25); - cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true); +// cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true); - cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag); - cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle); +// cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag); +// cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle); - GpuMat magCannel(dmem, 
cv::Rect(0, colored.rows * 6, colored.cols, colored.rows)); - flds.nmag.convertTo(magCannel, CV_8UC1); - device::icf::fillBins(dmem, flds.nangle); +// GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows)); +// flds.nmag.convertTo(magCannel, CV_8UC1); +// device::icf::fillBins(dmem, flds.nangle); - // create luv - cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv); - cv::gpu::split(flds.luv, splited); +// // create luv +// cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv); +// cv::gpu::split(flds.luv, splited); - GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS)); - cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); +// GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS)); +// cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); - // fer debug purpose - // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); +// // fer debug purpose +// // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); - for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) - { - GpuMat channel(shrunk, cv::Rect(0, h * i, w, h )); - GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1)); - cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); - } +// for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) +// { +// GpuMat channel(shrunk, cv::Rect(0, h * i, w, h )); +// GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1)); +// cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); + // } -#endif +// #endif - cudaStream_t stream = StreamAccessor::getStream(s); - // detection - flds.detect(objects, stream); +// cudaStream_t stream = StreamAccessor::getStream(s); +// // detection +// flds.detect(objects, stream); - // flds.storage.frame(colored, stream); +// // flds.storage.frame(colored, stream); } #endif \ No newline at end of file From 4d9c7c1012be346f64fb50d50fd52d6ecb16b07e Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 26 Sep 2012 13:34:21 +0400 Subject: [PATCH 022/155] preprocessing ~1.981 ms --- modules/gpu/src/cuda/isf-sc.cu | 54 +++++----- modules/gpu/src/softcascade.cpp | 175 +++++++++++++++++++------------- 2 files changed, 133 insertions(+), 96 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 33b2222c7..e4831e2e6 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -40,6 +40,7 @@ // //M*/ +#include // #include // #include // #include @@ -54,9 +55,8 @@ // # define dprintf(format, ...) // #endif -// namespace cv { namespace gpu { namespace device { - -// namespace icf { +namespace cv { namespace gpu { namespace device { +namespace icf { // enum { // HOG_BINS = 6, @@ -66,33 +66,35 @@ // GREY_OFFSET = HEIGHT * HOG_LUV_BINS // }; -// __global__ void magToHist(const uchar* __restrict__ mag, -// const float* __restrict__ angle, const int angPitch, -// uchar* __restrict__ hog, const int hogPitch) -// { -// const int y = blockIdx.y * blockDim.y + threadIdx.y; -// const int x = blockIdx.x * blockDim.x + threadIdx.x; + // ToDo: use textures or ancached load instruction. 
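+    // Editorial sketch (assumption, not part of the original commit): the buffer
+    // handed to fillBins below stacks the channel planes vertically, so channel c,
+    // row y lives at hogluv.ptr(c * fh + y). magToHist reads each pixel's already
+    // quantized angle (0..bins-1, computed on the host) and scatters the pixel's
+    // magnitude into the matching histogram plane:
+    //
+    //     hog[((fh * bin) + y) * hogPitch + x] = mag[y * hogPitch + x];
+    //
+    // One store per pixel, and no atomics are needed because every (x, y) writes
+    // to exactly one (bin, y, x) cell.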
+    __global__ void magToHist(const uchar* __restrict__ mag,
+                              const float* __restrict__ angle, const int angPitch,
+                              uchar* __restrict__ hog, const int hogPitch, const int fh)
+    {
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;

-//     const int bin = (int)(angle[y * angPitch + x]);
-//     const uchar val = mag[y * angPitch + x];
+        const int bin = (int)(angle[y * angPitch + x]);
+        const uchar val = mag[y * hogPitch + x];
+        hog[((fh * bin) + y) * hogPitch + x] = val;
+    }

-//     hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
-// }
+    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
+                  const int fw, const int fh, const int bins)
+    {
+        const uchar* mag = (const uchar*)hogluv.ptr(fh * bins);
+        uchar* hog = (uchar*)hogluv.ptr();
+        const float* angle = (const float*)nangle.ptr();

-//     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
-//     {
-//         const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
-//         uchar* hog = (uchar*)hogluv.ptr();
-//         const float* angle = (const float*)nangle.ptr();
+        dim3 block(32, 8);
+        dim3 grid(fw / 32, fh / 8);

-//         dim3 block(32, 8);
-//         dim3 grid(WIDTH / 32, HEIGHT / 8);

-//         magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
-//         cudaSafeCall( cudaGetLastError() );
-//         cudaSafeCall( cudaDeviceSynchronize() );
-//     }
-// }
+        magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+}}}

 // __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
 //                         PtrStepSz objects)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index c4334ca1d..f336fd2d9 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -49,17 +49,18 @@ cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
 cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
 bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu(); }
+void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu();}

 #else

-// #include

+#include

-// namespace cv { namespace gpu { namespace device {
-// namespace icf {
-//     void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
-// }
-// }}}
+namespace cv { namespace gpu { namespace device {
+namespace icf {
+    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
+                  const int fw, const int fh, const int bins);
+}
+}}}

 // namespace {
 //     char *itoa(long i, char* s, int /*dummy_radix*/)
@@ -71,6 +72,16 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat

 struct cv::gpu::SoftCascade::Filds
 {
+
+    Filds()
+    {
+        plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
+        fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
+        luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+        shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1);
+        integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
+        hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1);
+    }
 // // scales range
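    // Editorial note (sizes derived from the create() calls in the constructor
    // above; everything assumes the fixed 640x480 input):
    //   plane  : 480*(10+1) rows x 640 cols, CV_8UC1  - ten HOG+LUV channels plus one gray plane
    //   fplane : 480*6 rows x 640 cols, CV_32FC1      - dfdx, dfdy, mag, angle, nmag, nangle slices
    //   shrunk : 120*10 rows x 160 cols, CV_8UC1      - channels downscaled by the shrinkage factor 4
    //   hogluv : 121*10 rows x 161 cols, CV_32SC1     - integral images need one extra row and column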
// float minScale;
// float maxScale;

@@ -85,19 +96,26 @@ struct cv::gpu::SoftCascade::Filds

// GpuMat octaves;
// GpuMat stages;
// GpuMat nodes;
// GpuMat leaves;
// GpuMat features;
// GpuMat levels;

-// // preallocated buffer 640x480x10 + 640x480
-// GpuMat dmem;
-// // 160x120x10
-// GpuMat shrunk;
-// // 161x121x10
-// GpuMat hogluv;
+    // preallocated buffer 640x480x10 for hogluv + 640x480 for gray
+    GpuMat plane;
+
+    // preallocated buffer for floating point operations
+    GpuMat fplane;
+
+    // temporary mat for cvtColor
+    GpuMat luv;
+
+    // 160x120x10
+    GpuMat shrunk;
+
+    // temporary mat for the integral
+    GpuMat integralBuffer;
+
+    // 161x121x10
+    GpuMat hogluv;

// // will be removed in final version

-// // temporial mat for cvtColor
-// GpuMat luv;
-// // temporial mat for integrall
-// GpuMat integralBuffer;

// // temp matrix for sobel and cartToPolar
// GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
@@ -108,17 +126,18 @@ struct cv::gpu::SoftCascade::Filds
// icf::ChannelStorage storage;

// enum { BOOST = 0 };
-// enum
-// {
-// FRAME_WIDTH = 640,
-// FRAME_HEIGHT = 480,
+    enum
+    {
+        FRAME_WIDTH        = 640,
+        FRAME_HEIGHT       = 480,
// TOTAL_SCALES = 55,
// CLASSIFIERS = 5,
// ORIG_OBJECT_WIDTH = 64,
// ORIG_OBJECT_HEIGHT = 128,
-// HOG_BINS = 6,
-// HOG_LUV_BINS = 10
-// };
+        HOG_BINS           = 6,
+        LUV_BINS           = 3,
+        HOG_LUV_BINS       = 10
+    };

// bool fill(const FileNode &root, const float mins, const float maxs);
// void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
// {
// cascade.detect(hogluv, objects, stream);
// }

@@ -386,7 +405,8 @@ struct cv::gpu::SoftCascade::Filds
// scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));

// // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
-// // level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
+// // level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x,
+//level.objSize.y);

// // std::cout << "level " << sc
// // << " octeve "
// // << vlevels[sc].octave
// // << " relScale "
// // << vlevels[sc].relScale
// // << " " << vlevels[sc].shrScale
// // << " [" << (int)vlevels[sc].objSize.x
// // << " " << (int)vlevels[sc].objSize.y << "] ["
// // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl;

@@ -423,8 +443,8 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
 if (!fs.isOpened()) return false;

 filds = new Filds;
-// Filds& flds = *filds;
-// if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
+    Filds& flds = *filds;
+    // if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;

 return true;
}
@@ -432,15 +452,15 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
 GpuMat& objects, const int /*rejectfactor*/, Stream s)
{
-// // only color images are supperted
-// CV_Assert(colored.type() == CV_8UC3);
+    // only color images are supported
+    CV_Assert(colored.type() == CV_8UC3);

-// // // only this window size allowed
-// CV_Assert(colored.cols == 640 && colored.rows == 480);
+    // only this window size allowed
+    CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);

-// Filds& flds = *filds;
+    Filds& flds = *filds;

-// #if defined USE_REFERENCE_VALUES
+#if defined USE_REFERENCE_VALUES
// cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
// cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
// char buff[33];
@@ -452,57 +472,72 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
// GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
// gchannel.upload(channel);
// }
-// #else
+#else
+    GpuMat& plane = flds.plane;
+    GpuMat& shrunk = flds.shrunk;
+    cudaMemset(plane.data, 0, plane.step * plane.rows);

-//     GpuMat& dmem = flds.dmem;
-//     cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
-//     GpuMat& shrunk = flds.shrunk;
-//     int w = shrunk.cols;
-//     int h = colored.rows / flds.storage.shrinkage;
+    int fw = Filds::FRAME_WIDTH;
+    int fh = Filds::FRAME_HEIGHT;

-//     std::vector<GpuMat> splited;
-//     for(int i = 0; i < 3; ++i)
-//     {
-//         splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
-//     }
+    GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));

-//     GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
+    //cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
+    cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);

-//     cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
+    //create hog
+    GpuMat dfdx(flds.fplane, cv::Rect(0, 0,  fw, fh));
+    GpuMat dfdy(flds.fplane, cv::Rect(0, fh, fw, fh));

-//     //create hog
-//     cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
-//     cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
+    cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f);
+    cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f);

-//     cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
+    GpuMat mag(flds.fplane, cv::Rect(0, 2 * fh, fw, fh));
+    GpuMat ang(flds.fplane, cv::Rect(0, 3 * fh, fw, fh));

+    cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true);

+    // normalize magnitude to uchar interval and angles to 6 bins

+    GpuMat nmag(flds.fplane, cv::Rect(0, 4 * fh, fw, fh));
+    GpuMat nang(flds.fplane, cv::Rect(0, 5 * fh, fw, fh));

-//     GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
-//     flds.nmag.convertTo(magCannel, CV_8UC1);
-//     device::icf::fillBins(dmem, flds.nangle);
+    cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag);
+    cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang);

-//     cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
-//     cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);

-//     // create luv
-//     cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
-//     cv::gpu::split(flds.luv, splited);
+    //create uchar magnitude
+    GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh));
+    nmag.convertTo(cmag, CV_8UC1);

-//     GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
-//     cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+    // create luv
+    cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);

-//     // fer debug purpose
-//     // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
+    std::vector<GpuMat> splited;
+    for(int i = 0; i < Filds::LUV_BINS; ++i)
+    {
+        splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
+    }
+
+    cv::gpu::split(flds.luv, splited);
+
+    device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS);
+
+    GpuMat hogluv(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
+    cv::gpu::resize(hogluv, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+
+    fw /= 4;
+    fh /= 4;

-//     for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-//     {
-//         GpuMat channel(shrunk, cv::Rect(0, h * i, w, h ));
-//         GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
-//         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
-    // }
+    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
+    {
+        GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh ));
+        GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
+        cv::gpu::integralBuffered(channel, sum,
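                                    /* editorial note: one (fh + 1) x (fw + 1) 32-bit integral
                                       image per shrunk channel; the 'integralBuffer' scratch
                                       mat preallocated in the Filds constructor avoids a
                                       reallocation on every call */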
flds.integralBuffer); + } + +#endif + + cudaStream_t stream = StreamAccessor::getStream(s); + // detection // flds.detect(objects, stream); // // flds.storage.frame(colored, stream); From b83d4add2ea096b3481a838f9e26a038d10a93d5 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 26 Sep 2012 17:15:17 +0400 Subject: [PATCH 023/155] memory optimization --- modules/gpu/src/cuda/isf-sc.cu | 56 +++- modules/gpu/src/icf.hpp | 106 ++++--- modules/gpu/src/softcascade.cpp | 489 +++++++++++++++----------------- 3 files changed, 339 insertions(+), 312 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index e4831e2e6..714bdfa44 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -41,9 +41,9 @@ //M*/ #include -// #include +#include // #include -// #include +#include // #include // //#define LOG_CUDA_CASCADE @@ -93,6 +93,58 @@ namespace icf { cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); } + + texture tnode; + __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, + const Node* nodes, + PtrStepSz objects) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + Level level = levels[blockIdx.z]; + if(x >= level.workRect.x || y >= level.workRect.y) return; + + Octave octave = octaves[level.octave]; + int st = octave.index * octave.stages; + const int stEnd = st + 1000;//octave.stages; + + float confidence = 0.f; + +#pragma unroll 8 + for(; st < stEnd; ++st) + { + const int nId = st * 3; + const Node node = nodes[nId]; + + const float stage = stages[st]; + confidence += node.rect.x * stage; + } + + uchar4 val; + val.x = (int)confidence; + if (x == y) objects(0, threadIdx.x) = val; + + } + + void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, + const PtrStepSzb& nodes, const PtrStepSzb& features, + PtrStepSz objects) + { + int fw = 160; + int fh = 120; + dim3 block(32, 8); + dim3 grid(fw / 32, fh / 8, 47); + const Level* l = (const Level*)levels.ptr(); + const Octave* oct = ((const Octave*)octaves.ptr()); + const float* st = (const float*)stages.ptr(); + const Node* nd = (const Node*)nodes.ptr(); + // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) ); + + test_kernel<<>>(l, oct, st, nd, objects); + + cudaSafeCall( cudaGetLastError()); + cudaSafeCall( cudaDeviceSynchronize()); + } } }}} diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index cf1348007..51ea2c068 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -1,4 +1,4 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// +//M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // @@ -38,12 +38,12 @@ // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. 
// -//M*/ +//M -// #include +#include -// #ifndef __OPENCV_ICF_HPP__ -// #define __OPENCV_ICF_HPP__ +#ifndef __OPENCV_ICF_HPP__ +#define __OPENCV_ICF_HPP__ // #if defined __CUDACC__ // # define __device __device__ __forceinline__ @@ -52,49 +52,62 @@ // #endif -// namespace cv { namespace gpu { namespace icf { +namespace cv { namespace gpu { namespace device { +namespace icf { -// using cv::gpu::PtrStepSzb; -// using cv::gpu::PtrStepSzf; +struct __align__(16) Octave +{ + ushort index; + ushort stages; + ushort shrinkage; + ushort2 size; + float scale; -// typedef unsigned char uchar; + Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) + : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} +}; -// struct __align__(16) Octave -// { -// ushort index; -// ushort stages; -// ushort shrinkage; -// ushort2 size; -// float scale; +struct __align__(8) Level //is actually 24 bytes +{ + int octave; -// Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) -// : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} -// }; + float relScale; + float shrScale; // used for marking detection + float scaling[2]; // calculated according to Dollal paper -// struct __align__(8) Level //is actually 24 bytes -// { -// int octave; + // for 640x480 we can not get overflow + uchar2 workRect; + uchar2 objSize; -// // float origScale; //not actually used -// float relScale; -// float shrScale; // used for marking detection -// float scaling[2]; // calculated according to Dollal paper + Level(int idx, const Octave& oct, const float scale, const int w, const int h) + : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) + { + workRect.x = round(w / (float)oct.shrinkage); + workRect.y = round(h / (float)oct.shrinkage); -// // for 640x480 we can not get overflow -// uchar2 workRect; -// uchar2 objSize; + objSize.x = round(oct.size.x * relScale); + objSize.y = round(oct.size.y * relScale); + } +}; -// Level(int idx, const Octave& oct, const float scale, const int w, const int h) -// : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) -// { -// workRect.x = round(w / (float)oct.shrinkage); -// workRect.y = round(h / (float)oct.shrinkage); +struct __align__(8) Node +{ + // int feature; + uchar4 rect; + float threshold; -// objSize.x = round(oct.size.x * relScale); -// objSize.y = round(oct.size.y * relScale); -// } -// }; + Node(const uchar4 c, const int t) : rect(c), threshold(t) {} +}; +struct __align__(8) Feature +{ + int channel; + uchar4 rect; + + Feature(const int c, const uchar4 r) : channel(c), rect(r) {} +}; +} +}}} // struct Cascade // { // Cascade() {} @@ -146,21 +159,6 @@ // static const float magnitudeScaling = 1.f ;// / sqrt(2); // }; -// struct __align__(8) Node -// { -// int feature; -// float threshold; - -// Node(const int f, const float t) : feature(f), threshold(t) {} -// }; - -// struct __align__(8) Feature -// { -// int channel; -// uchar4 rect; - -// Feature(const int c, const uchar4 r) : channel(c), rect(r) {} -// }; // }}} -// #endif \ No newline at end of file +#endif \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index f336fd2d9..8d75176ab 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -53,12 +53,15 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat #else -// #include +#include namespace cv { namespace gpu { 
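/* editorial note (assumption based on the surrounding diffs): this forward-declaration
   block mirrors the kernels implemented in modules/gpu/src/cuda/isf-sc.cu, keeping
   softcascade.cpp a host-only translation unit that reaches the device code through
   this narrow interface */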
namespace device { namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins); + void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, + const PtrStepSzb& nodes, const PtrStepSzb& features, + PtrStepSz objects); } }}} @@ -82,19 +85,20 @@ struct cv::gpu::SoftCascade::Filds integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1); } -// // scales range -// float minScale; -// float maxScale; -// int origObjWidth; -// int origObjHeight; + // scales range + float minScale; + float maxScale; -// GpuMat octaves; -// GpuMat stages; -// GpuMat nodes; -// GpuMat leaves; -// GpuMat features; -// GpuMat levels; + int origObjWidth; + int origObjHeight; + + GpuMat octaves; + GpuMat stages; + GpuMat nodes; + GpuMat leaves; + GpuMat features; + GpuMat levels; // preallocated buffer 640x480x10 for hogluv + 640x480 got gray GpuMat plane; @@ -114,312 +118,285 @@ struct cv::gpu::SoftCascade::Filds // 161x121x10 GpuMat hogluv; -// // will be removed in final version + std::vector scales; - -// // temp matrix for sobel and cartToPolar -// GpuMat dfdx, dfdy, angle, mag, nmag, nangle; - -// std::vector scales; - -// icf::Cascade cascade; -// icf::ChannelStorage storage; - -// enum { BOOST = 0 }; + enum { BOOST = 0 }; enum { FRAME_WIDTH = 640, FRAME_HEIGHT = 480, -// TOTAL_SCALES = 55, + TOTAL_SCALES = 55, // CLASSIFIERS = 5, -// ORIG_OBJECT_WIDTH = 64, -// ORIG_OBJECT_HEIGHT = 128, + ORIG_OBJECT_WIDTH = 64, + ORIG_OBJECT_HEIGHT = 128, HOG_BINS = 6, LUV_BINS = 3, HOG_LUV_BINS = 10 }; -// bool fill(const FileNode &root, const float mins, const float maxs); -// void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const -// { -// cascade.detect(hogluv, objects, stream); -// } + bool fill(const FileNode &root, const float mins, const float maxs); + void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const + { + device::icf::detect(levels, octaves, stages, nodes, features, objects); + } -// private: -// void calcLevels(const std::vector& octs, -// int frameW, int frameH, int nscales); +private: + void calcLevels(const std::vector& octs, + int frameW, int frameH, int nscales); -// typedef std::vector::const_iterator octIt_t; -// int fitOctave(const std::vector& octs, const float& logFactor) const -// { -// float minAbsLog = FLT_MAX; -// int res = 0; -// for (int oct = 0; oct < (int)octs.size(); ++oct) -// { -// const icf::Octave& octave =octs[oct]; -// float logOctave = ::log(octave.scale); -// float logAbsScale = ::fabs(logFactor - logOctave); + typedef std::vector::const_iterator octIt_t; + int fitOctave(const std::vector& octs, const float& logFactor) const + { + float minAbsLog = FLT_MAX; + int res = 0; + for (int oct = 0; oct < (int)octs.size(); ++oct) + { + const device::icf::Octave& octave =octs[oct]; + float logOctave = ::log(octave.scale); + float logAbsScale = ::fabs(logFactor - logOctave); -// if(logAbsScale < minAbsLog) -// { -// res = oct; -// minAbsLog = logAbsScale; -// } -// } -// return res; -// } + if(logAbsScale < minAbsLog) + { + res = oct; + minAbsLog = logAbsScale; + } + } + return res; + } }; -// inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) -// { -// minScale = mins; -// maxScale = maxs; +inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) +{ + using 
namespace device::icf; + minScale = mins; + maxScale = maxs; -// // cascade properties -// static const char *const SC_STAGE_TYPE = "stageType"; -// static const char *const SC_BOOST = "BOOST"; + // cascade properties + static const char *const SC_STAGE_TYPE = "stageType"; + static const char *const SC_BOOST = "BOOST"; -// static const char *const SC_FEATURE_TYPE = "featureType"; -// static const char *const SC_ICF = "ICF"; + static const char *const SC_FEATURE_TYPE = "featureType"; + static const char *const SC_ICF = "ICF"; -// static const char *const SC_ORIG_W = "width"; -// static const char *const SC_ORIG_H = "height"; + static const char *const SC_ORIG_W = "width"; + static const char *const SC_ORIG_H = "height"; -// static const char *const SC_OCTAVES = "octaves"; -// static const char *const SC_STAGES = "stages"; -// static const char *const SC_FEATURES = "features"; + static const char *const SC_OCTAVES = "octaves"; + static const char *const SC_STAGES = "stages"; + static const char *const SC_FEATURES = "features"; -// static const char *const SC_WEEK = "weakClassifiers"; -// static const char *const SC_INTERNAL = "internalNodes"; -// static const char *const SC_LEAF = "leafValues"; + static const char *const SC_WEEK = "weakClassifiers"; + static const char *const SC_INTERNAL = "internalNodes"; + static const char *const SC_LEAF = "leafValues"; -// static const char *const SC_OCT_SCALE = "scale"; -// static const char *const SC_OCT_STAGES = "stageNum"; -// static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor"; + static const char *const SC_OCT_SCALE = "scale"; + static const char *const SC_OCT_STAGES = "stageNum"; + static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor"; -// static const char *const SC_STAGE_THRESHOLD = "stageThreshold"; + static const char *const SC_STAGE_THRESHOLD = "stageThreshold"; -// static const char * const SC_F_CHANNEL = "channel"; -// static const char * const SC_F_RECT = "rect"; + static const char * const SC_F_CHANNEL = "channel"; + static const char * const SC_F_RECT = "rect"; -// // only Ada Boost supported -// std::string stageTypeStr = (string)root[SC_STAGE_TYPE]; -// CV_Assert(stageTypeStr == SC_BOOST); + // only Ada Boost supported + std::string stageTypeStr = (string)root[SC_STAGE_TYPE]; + CV_Assert(stageTypeStr == SC_BOOST); -// // only HOG-like integral channel features cupported -// string featureTypeStr = (string)root[SC_FEATURE_TYPE]; -// CV_Assert(featureTypeStr == SC_ICF); + // only HOG-like integral channel features cupported + string featureTypeStr = (string)root[SC_FEATURE_TYPE]; + CV_Assert(featureTypeStr == SC_ICF); -// origObjWidth = (int)root[SC_ORIG_W]; -// CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH); + origObjWidth = (int)root[SC_ORIG_W]; + CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH); -// origObjHeight = (int)root[SC_ORIG_H]; -// CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT); + origObjHeight = (int)root[SC_ORIG_H]; + CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT); -// FileNode fn = root[SC_OCTAVES]; -// if (fn.empty()) return false; + FileNode fn = root[SC_OCTAVES]; + if (fn.empty()) return false; -// std::vector voctaves; -// std::vector vstages; -// std::vector vnodes; -// std::vector vleaves; -// std::vector vfeatures; -// scales.clear(); + std::vector voctaves; + std::vector vstages; + std::vector vnodes; + std::vector vleaves; + std::vector vfeatures; + scales.clear(); -// // std::vector levels; + FileNodeIterator it = fn.begin(), it_end = fn.end(); + int feature_offset = 0; + ushort octIndex = 0; + ushort 
shrinkage = 1; -// FileNodeIterator it = fn.begin(), it_end = fn.end(); -// int feature_offset = 0; -// ushort octIndex = 0; -// ushort shrinkage = 1; + for (; it != it_end; ++it) + { + FileNode fns = *it; + float scale = (float)fns[SC_OCT_SCALE]; + scales.push_back(scale); + ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); + ushort2 size; + size.x = cvRound(ORIG_OBJECT_WIDTH * scale); + size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); + shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); -// for (; it != it_end; ++it) -// { -// FileNode fns = *it; -// float scale = (float)fns[SC_OCT_SCALE]; -// scales.push_back(scale); -// ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); -// ushort2 size; -// size.x = cvRound(ORIG_OBJECT_WIDTH * scale); -// size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); -// shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); + Octave octave(octIndex, nstages, shrinkage, size, scale); + CV_Assert(octave.stages > 0); + voctaves.push_back(octave); -// icf::Octave octave(octIndex, nstages, shrinkage, size, scale); -// CV_Assert(octave.stages > 0); -// voctaves.push_back(octave); + FileNode ffs = fns[SC_FEATURES]; + if (ffs.empty()) return false; -// FileNode ffs = fns[SC_FEATURES]; -// if (ffs.empty()) return false; + fns = fns[SC_STAGES]; + if (fn.empty()) return false; -// fns = fns[SC_STAGES]; -// if (fn.empty()) return false; + // for each stage (~ decision tree with H = 2) + FileNodeIterator st = fns.begin(), st_end = fns.end(); + for (; st != st_end; ++st ) + { + fns = *st; + vstages.push_back((float)fns[SC_STAGE_THRESHOLD]); -// // for each stage (~ decision tree with H = 2) -// FileNodeIterator st = fns.begin(), st_end = fns.end(); -// for (; st != st_end; ++st ) -// { -// fns = *st; -// vstages.push_back((float)fns[SC_STAGE_THRESHOLD]); + fns = fns[SC_WEEK]; + FileNodeIterator ftr = fns.begin(), ft_end = fns.end(); + for (; ftr != ft_end; ++ftr) + { + fns = (*ftr)[SC_INTERNAL]; + FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); + for (; inIt != inIt_end;) + { + int feature = (int)(*(inIt +=2)++) + feature_offset; + float th = (float)(*(inIt++)); + uchar4 rect; + vnodes.push_back(Node(rect, th)); + } -// fns = fns[SC_WEEK]; -// FileNodeIterator ftr = fns.begin(), ft_end = fns.end(); -// for (; ftr != ft_end; ++ftr) -// { -// fns = (*ftr)[SC_INTERNAL]; -// FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); -// for (; inIt != inIt_end;) -// { -// int feature = (int)(*(inIt +=2)++) + feature_offset; -// float th = (float)(*(inIt++)); -// vnodes.push_back(icf::Node(feature, th)); -// } + fns = (*ftr)[SC_LEAF]; + inIt = fns.begin(), inIt_end = fns.end(); + for (; inIt != inIt_end; ++inIt) + vleaves.push_back((float)(*inIt)); + } + } -// fns = (*ftr)[SC_LEAF]; -// inIt = fns.begin(), inIt_end = fns.end(); -// for (; inIt != inIt_end; ++inIt) -// vleaves.push_back((float)(*inIt)); -// } -// } + st = ffs.begin(), st_end = ffs.end(); + for (; st != st_end; ++st ) + { + cv::FileNode rn = (*st)[SC_F_RECT]; + cv::FileNodeIterator r_it = rn.begin(); + uchar4 rect; + rect.x = saturate_cast((int)*(r_it++)); + rect.y = saturate_cast((int)*(r_it++)); + rect.z = saturate_cast((int)*(r_it++)); + rect.w = saturate_cast((int)*(r_it++)); + vfeatures.push_back(Feature((int)(*st)[SC_F_CHANNEL], rect)); + } -// st = ffs.begin(), st_end = ffs.end(); -// for (; st != st_end; ++st ) -// { -// cv::FileNode rn = (*st)[SC_F_RECT]; -// cv::FileNodeIterator r_it = rn.begin(); -// uchar4 rect; -// rect.x = saturate_cast((int)*(r_it++)); -// rect.y = 
saturate_cast((int)*(r_it++)); -// rect.z = saturate_cast((int)*(r_it++)); -// rect.w = saturate_cast((int)*(r_it++)); -// vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect)); -// } + feature_offset += octave.stages * 3; + ++octIndex; + } -// feature_offset += octave.stages * 3; -// ++octIndex; -// } + // upload in gpu memory + octaves.upload(cv::Mat(1, voctaves.size() * sizeof(Octave), CV_8UC1, (uchar*)&(voctaves[0]) )); + CV_Assert(!octaves.empty()); -// // upload in gpu memory -// octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) )); -// CV_Assert(!octaves.empty()); + stages.upload(cv::Mat(vstages).reshape(1,1)); + CV_Assert(!stages.empty()); -// stages.upload(cv::Mat(vstages).reshape(1,1)); -// CV_Assert(!stages.empty()); + nodes.upload(cv::Mat(1, vnodes.size() * sizeof(Node), CV_8UC1, (uchar*)&(vnodes[0]) )); + CV_Assert(!nodes.empty()); -// nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) )); -// CV_Assert(!nodes.empty()); + leaves.upload(cv::Mat(vleaves).reshape(1,1)); + CV_Assert(!leaves.empty()); -// leaves.upload(cv::Mat(vleaves).reshape(1,1)); -// CV_Assert(!leaves.empty()); + features.upload(cv::Mat(1, vfeatures.size() * sizeof(Feature), CV_8UC1, (uchar*)&(vfeatures[0]) )); + CV_Assert(!features.empty()); -// features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) )); -// CV_Assert(!features.empty()); + // compute levels + calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); + CV_Assert(!levels.empty()); -// // compute levels -// calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); -// CV_Assert(!levels.empty()); + return true; +} -// //init Cascade -// cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels); +namespace { + struct CascadeIntrinsics + { + static const float lambda = 1.099f, a = 0.89f; -// // allocate buffers -// dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); -// shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); -// // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1); -// hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1); -// luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); -// integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); + static float getFor(int channel, float scaling) + { + CV_Assert(channel < 10); -// dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); -// dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); -// angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); -// mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + if (fabs(scaling - 1.f) < FLT_EPSILON) + return 1.f; -// nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); -// nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + // according to R. Benenson, M. Mathias, R. Timofte and L. 
Van Gool's and Dallal's papers + static const float A[2][2] = + { //channel <= 6, otherwise + { 0.89f, 1.f}, // down + { 1.00f, 1.f} // up + }; -// storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage); -// return true; -// } + static const float B[2][2] = + { //channel <= 6, otherwise + { 1.099f / log(2), 2.f}, // down + { 0.f, 2.f} // up + }; -// namespace { -// struct CascadeIntrinsics -// { -// static const float lambda = 1.099f, a = 0.89f; + float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; + float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; -// static float getFor(int channel, float scaling) -// { -// CV_Assert(channel < 10); + // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); + return a * pow(scaling, b); + } + }; +} -// if (fabs(scaling - 1.f) < FLT_EPSILON) -// return 1.f; +inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector& octs, + int frameW, int frameH, int nscales) +{ + CV_Assert(nscales > 1); + using device::icf::Level; -// // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers -// static const float A[2][2] = -// { //channel <= 6, otherwise -// { 0.89f, 1.f}, // down -// { 1.00f, 1.f} // up -// }; + std::vector vlevels; + float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1); -// static const float B[2][2] = -// { //channel <= 6, otherwise -// { 1.099f / log(2), 2.f}, // down -// { 0.f, 2.f} // up -// }; + float scale = minScale; + for (int sc = 0; sc < nscales; ++sc) + { + int width = ::std::max(0.0f, frameW - (origObjWidth * scale)); + int height = ::std::max(0.0f, frameH - (origObjHeight * scale)); -// float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; -// float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; + float logScale = ::log(scale); + int fit = fitOctave(octs, logScale); -// // printf("!!! 
scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); -// return a * pow(scaling, b); -// } -// }; -// } + Level level(fit, octs[fit], scale, width, height); + level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale); + level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale); -// inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector& octs, -// int frameW, int frameH, int nscales) -// { -// CV_Assert(nscales > 1); + if (!width || !height) + break; + else + vlevels.push_back(level); -// std::vector vlevels; -// float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1); + if (::fabs(scale - maxScale) < FLT_EPSILON) break; + scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor)); -// float scale = minScale; -// for (int sc = 0; sc < nscales; ++sc) -// { -// int width = ::std::max(0.0f, frameW - (origObjWidth * scale)); -// int height = ::std::max(0.0f, frameH - (origObjHeight * scale)); + // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, + // level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, + //level.objSize.y); -// float logScale = ::log(scale); -// int fit = fitOctave(octs, logScale); + std::cout << "level " << sc + << " octeve " + << vlevels[sc].octave + << " relScale " + << vlevels[sc].relScale + << " " << vlevels[sc].shrScale + << " [" << (int)vlevels[sc].objSize.x + << " " << (int)vlevels[sc].objSize.y << "] [" + << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; + } -// icf::Level level(fit, octs[fit], scale, width, height); -// level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale); -// level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale); - -// if (!width || !height) -// break; -// else -// vlevels.push_back(level); - -// if (::fabs(scale - maxScale) < FLT_EPSILON) break; -// scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor)); - -// // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, -// // level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, -//level.objSize.y); - -// // std::cout << "level " << sc -// // << " octeve " -// // << vlevels[sc].octave -// // << " relScale " -// // << vlevels[sc].relScale -// // << " " << vlevels[sc].shrScale -// // << " [" << (int)vlevels[sc].objSize.x -// // << " " << (int)vlevels[sc].objSize.y << "] [" -// // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; -// } -// levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) )); -// } + levels.upload(cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) )); +} cv::gpu::SoftCascade::SoftCascade() : filds(0) {} @@ -444,7 +421,7 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c filds = new Filds; Filds& flds = *filds; - // if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false; + if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false; return true; } @@ -538,7 +515,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& cudaStream_t stream = StreamAccessor::getStream(s); // detection -// flds.detect(objects, stream); + flds.detect(objects, stream); // // flds.storage.frame(colored, stream); } From 8108bd30febf17e81f8329ccb65f695dd335a471 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 27 
Sep 2012 12:44:06 +0400 Subject: [PATCH 024/155] optimize memory usage --- modules/gpu/src/cuda/isf-sc.cu | 306 +++++++++++++------------------- modules/gpu/src/icf.hpp | 29 +-- modules/gpu/src/softcascade.cpp | 86 +++++---- 3 files changed, 180 insertions(+), 241 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 714bdfa44..c8dff34bd 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -42,18 +42,17 @@ #include #include -// #include #include -// #include +#include -// //#define LOG_CUDA_CASCADE +// #define LOG_CUDA_CASCADE -// #if defined LOG_CUDA_CASCADE -// # define dprintf(format, ...) \ -// do { printf(format, __VA_ARGS__); } while (0) -// #else -// # define dprintf(format, ...) -// #endif +#if defined LOG_CUDA_CASCADE +# define dprintf(format, ...) \ + do { printf(format, __VA_ARGS__); } while (0) +#else +# define dprintf(format, ...) +#endif namespace cv { namespace gpu { namespace device { namespace icf { @@ -94,32 +93,128 @@ namespace icf { cudaSafeCall( cudaDeviceSynchronize() ); } - texture tnode; + texture thogluv; + // ToDo: do it in load time + // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) + // { + // scaledRect = node.rect; + // return (float)(node.threshold & 0x0FFFFFFFU); + // } + + __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) + { + float relScale = level.relScale; + float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + + dprintf("feature %d box %d %d %d %d\n", (node.threshold >> 28), scaledRect.x, scaledRect.y, + scaledRect.z, scaledRect.w); + dprintf("rescale: %f [%f %f] selected %f\n",level.relScale, level.scaling[0], level.scaling[1], + level.scaling[(node.threshold >> 28) > 6]); + + // rescale + scaledRect.x = __float2int_rn(relScale * scaledRect.x); + scaledRect.y = __float2int_rn(relScale * scaledRect.y); + scaledRect.z = __float2int_rn(relScale * scaledRect.z); + scaledRect.w = __float2int_rn(relScale * scaledRect.w); + + float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + + float approx = 1.f; + // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) + { + const float expected_new_area = farea * relScale * relScale; + approx = sarea / expected_new_area; + } + + dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", (node.threshold >> 28), + scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); + + + float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; + rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; + + dprintf("approximation %f %d -> %f %f\n", approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, + level.scaling[(node.threshold >> 28) > 6]); + + return rootThreshold; + } + + __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area) + { + + dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); + dprintf("get for channel %d\n", channel); + dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", + x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, + x + area.x, y + area.w); + dprintf("at point %d %d with offset %d\n", x, y, 0); + + int offset = channel * 121; + y += offset; + + int a = tex2D(thogluv, x + area.x, y + area.y); + int b = tex2D(thogluv, x + area.z, y + area.y); + int c = tex2D(thogluv, x + area.z, y + area.w); + int d = tex2D(thogluv, x + area.x, y + area.w); + + 
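+        // Editorial sketch (not in the original commit): a, b, c and d sample the
+        // integral image at the four corners (x1,y1), (x2,y1), (x2,y2), (x1,y2) of
+        // the scaled rectangle, so by inclusion-exclusion the box sum below is
+        //
+        //     sum = a - b + c - d
+        //
+        // The 'channel * 121' offset above assumes 121-row (120 + 1) integral
+        // planes stacked per channel inside the thogluv texture.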
dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); + + return (a - b + c - d); + } + __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, - PtrStepSz objects) + const Node* nodes, const float* leaves, PtrStepSz objects) { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x * blockDim.x + threadIdx.x; Level level = levels[blockIdx.z]; + + // if (x > 0 || y > 0 || blockIdx.z > 0) return; if(x >= level.workRect.x || y >= level.workRect.y) return; Octave octave = octaves[level.octave]; + int st = octave.index * octave.stages; const int stEnd = st + 1000;//octave.stages; float confidence = 0.f; -#pragma unroll 8 +// #pragma unroll 8 for(; st < stEnd; ++st) { + dprintf("\n\nstage: %d\n", st); const int nId = st * 3; - const Node node = nodes[nId]; + Node node = nodes[nId]; - const float stage = stages[st]; - confidence += node.rect.x * stage; + dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, + node.threshold >> 28, node.threshold & 0x0FFFFFFFU); + + float threshold = rescale(level, node.rect, node); + int sum = get(x, y, (node.threshold >> 28), node.rect); + + dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, + node.rect.w, threshold); + + int next = 1 + (int)(sum >= threshold); + dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); + + node = nodes[nId + next]; + threshold = rescale(level, node.rect, node); + sum = get(x, y, (node.threshold >> 28), node.rect); + + const int lShift = (next - 1) * 2 + (int)(sum >= threshold); + float impact = leaves[st * 4 + lShift]; + confidence += impact; + + if (confidence <= stages[st]) st = stEnd + 1; + dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); + dprintf("extracted stage: %f\n", stages[st]); + dprintf("computed score: %f\n\n", confidence); } + // if (st == stEnd) + // printf("%d %d %d\n", x, y, st); + uchar4 val; val.x = (int)confidence; if (x == y) objects(0, threadIdx.x) = val; @@ -127,188 +222,27 @@ namespace icf { } void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzb& features, - PtrStepSz objects) + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects) { int fw = 160; int fh = 120; + dim3 block(32, 8); dim3 grid(fw / 32, fh / 8, 47); + const Level* l = (const Level*)levels.ptr(); const Octave* oct = ((const Octave*)octaves.ptr()); const float* st = (const float*)stages.ptr(); const Node* nd = (const Node*)nodes.ptr(); - // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) ); + const float* lf = (const float*)leaves.ptr(); - test_kernel<<>>(l, oct, st, nd, objects); + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); + + test_kernel<<>>(l, oct, st, nd, lf, objects); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); } } -}}} - -// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch, -// PtrStepSz objects) -// { -// cascade.detectAt(hogluv, pitch, objects); -// } - -// } - -// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, -// const int channel, const float threshold) const -// { -// dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, 
scaledRect.w); -// dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]); - -// float relScale = level.relScale; -// float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - -// // rescale -// scaledRect.x = __float2int_rn(relScale * scaledRect.x); -// scaledRect.y = __float2int_rn(relScale * scaledRect.y); -// scaledRect.z = __float2int_rn(relScale * scaledRect.z); -// scaledRect.w = __float2int_rn(relScale * scaledRect.w); - -// float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - - -// float approx = 1.f; -// if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) -// { -// const float expected_new_area = farea * relScale * relScale; -// approx = expected_new_area / sarea; -// } - -// dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel, -// scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - -// // compensation areas rounding -// float rootThreshold = threshold / approx; -// // printf(" approx %f\n", rootThreshold); -// rootThreshold *= level.scaling[(int)(channel > 6)]; - -// dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]); - -// return rootThreshold; -// } - -// typedef unsigned char uchar; -// float __device get(const int* __restrict__ hogluv, const int pitch, -// const int x, const int y, int channel, uchar4 area) -// { -// dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); -// dprintf("get for channel %d\n", channel); -// dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", -// x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, -// x + area.x, y + area.w); -// dprintf("at point %d %d with offset %d\n", x, y, 0); - -// const int* curr = hogluv + ((channel * 121) + y) * pitch; - -// int a = curr[area.y * pitch + x + area.x]; -// int b = curr[area.y * pitch + x + area.z]; -// int c = curr[area.w * pitch + x + area.z]; -// int d = curr[area.w * pitch + x + area.x]; - -// dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); - -// return (a - b + c - d); -// } - - -// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch, -// PtrStepSz& objects) const -// { -// const icf::Level* lls = (const icf::Level*)levels.ptr(); - -// const int y = blockIdx.y * blockDim.y + threadIdx.y; -// const int x = blockIdx.x * blockDim.x + threadIdx.x; -// // if (x > 0 || y > 0) return; - -// Level level = lls[blockIdx.z]; -// if (x >= level.workRect.x || y >= level.workRect.y) return; - -// dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, -// level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); - -// const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; -// // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages, -// // octave.shrinkage, octave.size.x, octave.size.y, octave.scale); - -// const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages; - -// float detectionScore = 0.f; - -// int st = stBegin; -// for(; st < stEnd; ++st) -// { -// const float stage = stages(0, st); -// dprintf("Stage: %f\n", stage); -// { -// const int nId = st * 3; - -// // work with root node -// const Node node = ((const Node*)nodes.ptr())[nId]; - -// dprintf("Node: %d %f\n", node.feature, node.threshold); - -// const Feature feature = ((const 
Feature*)features.ptr())[node.feature]; - -// uchar4 scaledRect = feature.rect; -// float threshold = rescale(level, scaledRect, feature.channel, node.threshold); - -// float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); - -// dprintf("root feature %d %f\n",feature.channel, sum); - -// int next = 1 + (int)(sum >= threshold); - -// dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold); - -// // leaves -// const Node leaf = ((const Node*)nodes.ptr())[nId + next]; -// const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; - -// scaledRect = fLeaf.rect; -// threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold); -// sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect); - -// const int lShift = (next - 1) * 2 + (int)(sum >= threshold); -// float impact = leaves(0, (st * 4) + lShift); - -// detectionScore += impact; - -// dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); -// dprintf("extracted stage:\n"); -// dprintf("ct %f\n", stage); -// dprintf("computed score %f\n\n", detectionScore); -// dprintf("\n\n"); -// } - -// if (detectionScore <= stage || st - stBegin == 100) break; -// } - -// dprintf("x %d y %d: %d\n", x, y, st - stBegin); - -// if (st == stEnd) -// { -// uchar4 a; -// a.x = level.workRect.x; -// a.y = level.workRect.y; -// objects(0, threadIdx.x) = a; -// } -// } - -// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz objects, cudaStream_t stream) const -// { -// dim3 block(32, 8, 1); -// dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47); -// device::detect<<>>(*this, hogluv, hogluv.step / sizeof(int), objects); -// cudaSafeCall( cudaGetLastError() ); -// if (!stream) -// cudaSafeCall( cudaDeviceSynchronize() ); -// } - -// }} \ No newline at end of file +}}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 51ea2c068..ecd1886d3 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -40,11 +40,13 @@ // //M -#include #ifndef __OPENCV_ICF_HPP__ #define __OPENCV_ICF_HPP__ +#include +#include + // #if defined __CUDACC__ // # define __device __device__ __forceinline__ // #else @@ -92,20 +94,27 @@ struct __align__(8) Level //is actually 24 bytes struct __align__(8) Node { - // int feature; uchar4 rect; - float threshold; + // ushort channel; + uint threshold; - Node(const uchar4 c, const int t) : rect(c), threshold(t) {} + enum { THRESHOLD_MASK = 0x0FFFFFFF }; + + Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28)) + { + // printf("%d\n", t); + // printf("[%d %d %d %d] %d, %d\n",rect.x, rect.y, rect.z, rect.w, (int)(threshold >> 28), + // (int)(0x0FFFFFFF & threshold)); + } }; -struct __align__(8) Feature -{ - int channel; - uchar4 rect; +// struct __align__(8) Feature +// { +// int channel; +// uchar4 rect; - Feature(const int c, const uchar4 r) : channel(c), rect(r) {} -}; +// Feature(const int c, const uchar4 r) : channel(c), rect(r) {} +// }; } }}} // struct Cascade diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 8d75176ab..ffbf380c6 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -60,19 +60,10 @@ namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins); void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzb& features, - 
PtrStepSz objects); + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects); } }}} -// namespace { -// char *itoa(long i, char* s, int /*dummy_radix*/) -// { -// sprintf(s, "%ld", i); -// return s; -// } -// } - struct cv::gpu::SoftCascade::Filds { @@ -97,7 +88,6 @@ struct cv::gpu::SoftCascade::Filds GpuMat stages; GpuMat nodes; GpuMat leaves; - GpuMat features; GpuMat levels; // preallocated buffer 640x480x10 for hogluv + 640x480 got gray @@ -137,7 +127,7 @@ struct cv::gpu::SoftCascade::Filds bool fill(const FileNode &root, const float mins, const float maxs); void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const { - device::icf::detect(levels, octaves, stages, nodes, features, objects); + device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects); } private: @@ -216,10 +206,9 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float if (fn.empty()) return false; std::vector voctaves; - std::vector vstages; + std::vector vstages; std::vector vnodes; - std::vector vleaves; - std::vector vfeatures; + std::vector vleaves; scales.clear(); FileNodeIterator it = fn.begin(), it_end = fn.end(); @@ -245,6 +234,8 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float FileNode ffs = fns[SC_FEATURES]; if (ffs.empty()) return false; + FileNodeIterator ftrs = ffs.begin(); + fns = fns[SC_STAGES]; if (fn.empty()) return false; @@ -263,10 +254,21 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); for (; inIt != inIt_end;) { - int feature = (int)(*(inIt +=2)++) + feature_offset; - float th = (float)(*(inIt++)); + // int feature = (int)(*(inIt +=2)) + feature_offset; + inIt +=3; + // extract feature, Todo:check it + uint th = saturate_cast((float)(*(inIt++))); + cv::FileNode ftn = (*ftrs)[SC_F_RECT]; + cv::FileNodeIterator r_it = ftn.begin(); uchar4 rect; - vnodes.push_back(Node(rect, th)); + rect.x = saturate_cast((int)*(r_it++)); + rect.y = saturate_cast((int)*(r_it++)); + rect.z = saturate_cast((int)*(r_it++)); + rect.w = saturate_cast((int)*(r_it++)); + + uint channel = saturate_cast((int)(*ftrs)[SC_F_CHANNEL]); + vnodes.push_back(Node(rect, channel, th)); + ++ftrs; } fns = (*ftr)[SC_LEAF]; @@ -276,19 +278,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float } } - st = ffs.begin(), st_end = ffs.end(); - for (; st != st_end; ++st ) - { - cv::FileNode rn = (*st)[SC_F_RECT]; - cv::FileNodeIterator r_it = rn.begin(); - uchar4 rect; - rect.x = saturate_cast((int)*(r_it++)); - rect.y = saturate_cast((int)*(r_it++)); - rect.z = saturate_cast((int)*(r_it++)); - rect.w = saturate_cast((int)*(r_it++)); - vfeatures.push_back(Feature((int)(*st)[SC_F_CHANNEL], rect)); - } - feature_offset += octave.stages * 3; ++octIndex; } @@ -306,9 +295,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float leaves.upload(cv::Mat(vleaves).reshape(1,1)); CV_Assert(!leaves.empty()); - features.upload(cv::Mat(1, vfeatures.size() * sizeof(Feature), CV_8UC1, (uchar*)&(vfeatures[0]) )); - CV_Assert(!features.empty()); - // compute levels calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); CV_Assert(!levels.empty()); @@ -425,7 +411,14 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c return true; } -// #define USE_REFERENCE_VALUES +#define USE_REFERENCE_VALUES +namespace { + char *itoa(long i, char* s, int /*dummy_radix*/) 
+ { + sprintf(s, "%ld", i); + return s; + } +} void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/, GpuMat& objects, const int /*rejectfactor*/, Stream s) { @@ -438,17 +431,20 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& Filds& flds = *filds; #if defined USE_REFERENCE_VALUES -// cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); -// cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ); -// char buff[33]; + cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); -// for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) -// { -// cv::Mat channel; -// imgs[std::string("channel") + itoa(i, buff, 10)] >> channel; -// GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121)); -// gchannel.upload(channel); -// } + cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ); + char buff[33]; + + for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) + { + cv::Mat channel; + imgs[std::string("channel") + itoa(i, buff, 10)] >> channel; + + // std::cout << "channel " << i << std::endl << channel << std::endl; + GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121)); + gchannel.upload(channel); + } #else GpuMat& plane = flds.plane; GpuMat& shrunk = flds.shrunk; From 72b499df006a39185b529861f40e335611ebccc8 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 27 Sep 2012 14:40:13 +0400 Subject: [PATCH 025/155] add detection storing --- modules/gpu/perf/perf_objdetect.cpp | 4 +- modules/gpu/src/cuda/isf-sc.cu | 59 ++++++++++------------------- modules/gpu/src/softcascade.cpp | 14 ++++--- 3 files changed, 30 insertions(+), 47 deletions(-) diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 48a355d6a..e272d6535 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first))); - cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC1); + cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC4); cascade.detectMultiScale(colored, rois, objectBoxes); TEST_CYCLE() @@ -117,7 +117,7 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog ASSERT_FALSE(colored.empty()); cv::SoftCascade cascade; - ASSERT_TRUE(cascade.load(GetParam().first)); + ASSERT_TRUE(cascade.load(getDataPath(GetParam().first))); std::vector rois, objectBoxes; cascade.detectMultiScale(colored, rois, objectBoxes); diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index c8dff34bd..4bf410fc5 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -57,14 +57,6 @@ namespace cv { namespace gpu { namespace device { namespace icf { -// enum { -// HOG_BINS = 6, -// HOG_LUV_BINS = 10, -// WIDTH = 640, -// HEIGHT = 480, -// GREY_OFFSET = HEIGHT * HOG_LUV_BINS -// }; - // ToDo: use textures or ancached load instruction. 
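A note on the data layout every kernel in this file leans on: the ten HOG+LUV channel integrals are stacked vertically, 121 rows per channel (120 shrunk rows plus the extra integral row), so evaluating a feature is a plain four-corner box sum inside one channel slice, which is what the get() helper further down computes through the integral-image texture. A minimal host-side restatement of the same arithmetic (the helper name and the cv::Mat packing are illustrative, not part of the patch):

    #include <opencv2/core/core.hpp>

    // Host-side analogue of the device-side get(): box sum over one channel
    // slice of the stacked integral image. 'r' holds the scaled rectangle
    // corners (x0, y0, x1, y1) relative to the detection window origin.
    inline int boxSum(const cv::Mat& integrals, int channel, int x, int y, cv::Vec4i r)
    {
        y += channel * 121; // jump to the requested channel slice
        const int a = integrals.at<int>(y + r[1], x + r[0]); // top-left
        const int b = integrals.at<int>(y + r[1], x + r[2]); // top-right
        const int c = integrals.at<int>(y + r[3], x + r[2]); // bottom-right
        const int d = integrals.at<int>(y + r[3], x + r[0]); // bottom-left
        return a - b + c - d; // inclusion-exclusion over the four corners
    }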
__global__ void magToHist(const uchar* __restrict__ mag, const float* __restrict__ angle, const int angPitch, @@ -94,13 +86,6 @@ namespace icf { } texture thogluv; - // ToDo: do it in load time - // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) - // { - // scaledRect = node.rect; - // return (float)(node.threshold & 0x0FFFFFFFU); - // } - __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) { float relScale = level.relScale; @@ -119,17 +104,12 @@ namespace icf { float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - float approx = 1.f; - // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) - { - const float expected_new_area = farea * relScale * relScale; - approx = sarea / expected_new_area; - } + const float expected_new_area = farea * relScale * relScale; + float approx = sarea / expected_new_area; dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", (node.threshold >> 28), scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; @@ -139,7 +119,7 @@ namespace icf { return rootThreshold; } - __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area) + __device__ __forceinline__ int get(const int x, int y, uchar4 area) { dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); @@ -149,9 +129,6 @@ namespace icf { x + area.x, y + area.w); dprintf("at point %d %d with offset %d\n", x, y, 0); - int offset = channel * 121; - y += offset; - int a = tex2D(thogluv, x + area.x, y + area.y); int b = tex2D(thogluv, x + area.z, y + area.y); int c = tex2D(thogluv, x + area.z, y + area.w); @@ -163,7 +140,7 @@ namespace icf { } __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, PtrStepSz objects) + const Node* nodes, const float* leaves, PtrStepSz objects, uint* ctr) { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -179,7 +156,7 @@ namespace icf { float confidence = 0.f; -// #pragma unroll 8 +// #pragma unroll 2 for(; st < stEnd; ++st) { dprintf("\n\nstage: %d\n", st); @@ -190,7 +167,7 @@ namespace icf { node.threshold >> 28, node.threshold & 0x0FFFFFFFU); float threshold = rescale(level, node.rect, node); - int sum = get(x, y, (node.threshold >> 28), node.rect); + int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, threshold); @@ -200,29 +177,30 @@ namespace icf { node = nodes[nId + next]; threshold = rescale(level, node.rect, node); - sum = get(x, y, (node.threshold >> 28), node.rect); + sum = get(x, y + (node.threshold >> 28) * 121, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[st * 4 + lShift]; confidence += impact; - if (confidence <= stages[st]) st = stEnd + 1; + if (confidence <= stages[st]) st = stEnd + 10; dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); dprintf("extracted stage: %f\n", stages[st]); dprintf("computed score: %f\n\n", confidence); } - // if (st == stEnd) - // printf("%d %d %d\n", x, y, st); - - uchar4 val; - val.x = (int)confidence; - if (x == y) objects(0, threadIdx.x) = val; - + if(st == stEnd) + { + int idx = 
atomicInc(ctr, objects.cols); + uchar4 val; + val.x = x * 4; + objects(0, idx) = val; + } } void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects) + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, + PtrStepSz objects, PtrStepSzi counter) { int fw = 160; int fh = 120; @@ -235,11 +213,12 @@ namespace icf { const float* st = (const float*)stages.ptr(); const Node* nd = (const Node*)nodes.ptr(); const float* lf = (const float*)leaves.ptr(); + uint* ctr = (uint*)counter.ptr(); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel<<>>(l, oct, st, nd, lf, objects); + test_kernel<<>>(l, oct, st, nd, lf, objects, ctr); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index ffbf380c6..320fbb343 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -60,7 +60,8 @@ namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins); void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects); + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, + PtrStepSzi counter); } }}} @@ -75,6 +76,7 @@ struct cv::gpu::SoftCascade::Filds shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1); integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1); + detCounter.create(1,1, CV_32SC1); } // scales range @@ -90,6 +92,8 @@ struct cv::gpu::SoftCascade::Filds GpuMat leaves; GpuMat levels; + GpuMat detCounter; + // preallocated buffer 640x480x10 for hogluv + 640x480 got gray GpuMat plane; @@ -127,7 +131,8 @@ struct cv::gpu::SoftCascade::Filds bool fill(const FileNode &root, const float mins, const float maxs); void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const { - device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects); + cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); + device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter); } private: @@ -506,14 +511,13 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1)); cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); } - #endif cudaStream_t stream = StreamAccessor::getStream(s); - // detection flds.detect(objects, stream); -// // flds.storage.frame(colored, stream); + // cv::Mat out(flds.detCounter); + // std::cout << out << std::endl; } #endif \ No newline at end of file From c0359ed5c5987a3e58151cec20fd866fab3776ca Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 27 Sep 2012 16:50:47 +0400 Subject: [PATCH 026/155] fix test: enough size for detection matrix --- modules/gpu/perf/perf_objdetect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index e272d6535..8531372b0 100644 --- 
a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first))); - cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC4); + cv::gpu::GpuMat rois, objectBoxes(1, 16384, CV_8UC1); cascade.detectMultiScale(colored, rois, objectBoxes); TEST_CYCLE() From 0314e0e5d74e0e6c979505448c5eb103c2181989 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 27 Sep 2012 16:54:37 +0400 Subject: [PATCH 027/155] add kind in detection representation --- modules/gpu/src/cuda/isf-sc.cu | 17 +++---- modules/gpu/src/icf.hpp | 81 ++++++++-------------------------- 2 files changed, 27 insertions(+), 71 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 4bf410fc5..adfc9edcb 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -123,7 +123,6 @@ namespace icf { { dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); - dprintf("get for channel %d\n", channel); dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, x + area.x, y + area.w); @@ -140,13 +139,13 @@ namespace icf { } __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, PtrStepSz objects, uint* ctr) + const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr) { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x * blockDim.x + threadIdx.x; Level level = levels[blockIdx.z]; - // if (x > 0 || y > 0 || blockIdx.z > 0) return; + // if (blockIdx.z != 31) return; if(x >= level.workRect.x || y >= level.workRect.y) return; Octave octave = octaves[level.octave]; @@ -191,10 +190,10 @@ namespace icf { if(st == stEnd) { - int idx = atomicInc(ctr, objects.cols); - uchar4 val; - val.x = x * 4; - objects(0, idx) = val; + int idx = atomicInc(ctr, ndetections); + // store detection + objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), + __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); } } @@ -214,11 +213,13 @@ namespace icf { const Node* nd = (const Node*)nodes.ptr(); const float* lf = (const float*)leaves.ptr(); uint* ctr = (uint*)counter.ptr(); + Detection* det = (Detection*)objects.ptr(); + uint max_det = objects.cols / sizeof(Detection); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel<<>>(l, oct, st, nd, lf, objects, ctr); + test_kernel<<>>(l, oct, st, nd, lf, det, max_det, ctr); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index ecd1886d3..35658892f 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -47,11 +47,11 @@ #include #include -// #if defined __CUDACC__ -// # define __device __device__ __forceinline__ -// #else -// # define __device -// #endif +#if defined __CUDACC__ +# define __device __device__ __forceinline__ +#else +# define __device +#endif namespace cv { namespace gpu { namespace device { @@ -108,66 +108,21 @@ struct __align__(8) Node } }; -// struct __align__(8) Feature -// { -// int channel; -// uchar4 rect; +struct __align__(16) Detection +{ + ushort x; + 
ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + Detection(){} + __device Detection(int _x, int _y, uchar _w, uchar _h, float c) + : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {}; +}; -// Feature(const int c, const uchar4 r) : channel(c), rect(r) {} -// }; } }}} -// struct Cascade -// { -// Cascade() {} -// Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds, -// const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) -// : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} - -// void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz objects, cudaStream_t stream) const; -// void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const; -// float __device rescale(const icf::Level& level, uchar4& scaledRect, -// const int channel, const float threshold) const; - -// PtrStepSzb octaves; -// PtrStepSzf stages; -// PtrStepSzb nodes; -// PtrStepSzf leaves; -// PtrStepSzb features; - -// PtrStepSzb levels; - -// }; - -// struct ChannelStorage -// { -// ChannelStorage(){} -// ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr, -// const cv::gpu::PtrStepSzb& itg, const int s) -// : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {} - -// void frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream){} - -// PtrStepSzb dmem; -// PtrStepSzb shrunk; -// PtrStepSzb hogluv; - -// enum -// { -// FRAME_WIDTH = 640, -// FRAME_HEIGHT = 480, -// TOTAL_SCALES = 55, -// CLASSIFIERS = 5, -// ORIG_OBJECT_WIDTH = 64, -// ORIG_OBJECT_HEIGHT = 128, -// HOG_BINS = 6, -// HOG_LUV_BINS = 10 -// }; - -// int shrinkage; -// static const float magnitudeScaling = 1.f ;// / sqrt(2); -// }; - -// }}} #endif \ No newline at end of file From 612a258506aec30531e3db9dd1157253a6a4bc23 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Fri, 28 Sep 2012 19:10:29 +0400 Subject: [PATCH 028/155] kepler specific version --- modules/gpu/src/cuda/isf-sc.cu | 130 ++++++++++++++++++++++++--------- 1 file changed, 97 insertions(+), 33 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index adfc9edcb..f3c92cc6a 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -91,9 +91,9 @@ namespace icf { float relScale = level.relScale; float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - dprintf("feature %d box %d %d %d %d\n", (node.threshold >> 28), scaledRect.x, scaledRect.y, + dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w); - dprintf("rescale: %f [%f %f] selected %f\n",level.relScale, level.scaling[0], level.scaling[1], + dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1], level.scaling[(node.threshold >> 28) > 6]); // rescale @@ -107,13 +107,13 @@ namespace icf { const float expected_new_area = farea * relScale * relScale; float approx = sarea / expected_new_area; - dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", (node.threshold >> 28), + dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; - 
dprintf("approximation %f %d -> %f %f\n", approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, + dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, level.scaling[(node.threshold >> 28) > 6]); return rootThreshold; @@ -122,73 +122,137 @@ namespace icf { __device__ __forceinline__ int get(const int x, int y, uchar4 area) { - dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); - dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", + dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); + dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x, x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, x + area.x, y + area.w); - dprintf("at point %d %d with offset %d\n", x, y, 0); + dprintf("%d: at point %d %d with offset %d\n", x, y, 0); int a = tex2D(thogluv, x + area.x, y + area.y); int b = tex2D(thogluv, x + area.z, y + area.y); int c = tex2D(thogluv, x + area.z, y + area.w); int d = tex2D(thogluv, x + area.x, y + area.w); - dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); + dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d); return (a - b + c - d); } - __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, +// __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, +// const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr) +// { +// const int y = blockIdx.y * blockDim.y + threadIdx.y; +// const int x = blockIdx.x * blockDim.x + threadIdx.x; +// Level level = levels[blockIdx.z]; + +// // if (blockIdx.z != 31) return; +// if(x >= level.workRect.x || y >= level.workRect.y) return; + +// Octave octave = octaves[level.octave]; + +// int st = octave.index * octave.stages; +// const int stEnd = st + 1000;//octave.stages; + +// float confidence = 0.f; + +// // #pragma unroll 2 +// for(; st < stEnd; ++st) +// { +// dprintf("\n\nstage: %d\n", st); +// const int nId = st * 3; +// Node node = nodes[nId]; + +// dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, +// node.threshold >> 28, node.threshold & 0x0FFFFFFFU); + +// float threshold = rescale(level, node.rect, node); +// int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + +// dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, +// node.rect.w, threshold); + +// int next = 1 + (int)(sum >= threshold); +// dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); + +// node = nodes[nId + next]; +// threshold = rescale(level, node.rect, node); +// sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + +// const int lShift = (next - 1) * 2 + (int)(sum >= threshold); +// float impact = leaves[st * 4 + lShift]; +// confidence += impact; + +// if (confidence <= stages[st]) st = stEnd + 10; +// dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); +// dprintf("extracted stage: %f\n", stages[st]); +// dprintf("computed score: %f\n\n", confidence); +// } + +// if(st == stEnd) +// { +// int idx = atomicInc(ctr, ndetections); +// // store detection +// objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), +// __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); +// } +// } + + __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, const Node* nodes, 
const float* leaves, Detection* objects, const uint ndetections, uint* ctr) { const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int x = blockIdx.x; + Level level = levels[blockIdx.z]; - // if (blockIdx.z != 31) return; if(x >= level.workRect.x || y >= level.workRect.y) return; Octave octave = octaves[level.octave]; - int st = octave.index * octave.stages; - const int stEnd = st + 1000;//octave.stages; + const int stEnd = st + 1024; float confidence = 0.f; -// #pragma unroll 2 - for(; st < stEnd; ++st) + for(; st < stEnd; st += 32) { - dprintf("\n\nstage: %d\n", st); - const int nId = st * 3; - Node node = nodes[nId]; - dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, - node.threshold >> 28, node.threshold & 0x0FFFFFFFU); + const int nId = (st + threadIdx.x) * 3; + dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId); + Node node = nodes[nId]; float threshold = rescale(level, node.rect, node); int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); - dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, - node.rect.w, threshold); - int next = 1 + (int)(sum >= threshold); - dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); + dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold); node = nodes[nId + next]; threshold = rescale(level, node.rect, node); sum = get(x, y + (node.threshold >> 28) * 121, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); - float impact = leaves[st * 4 + lShift]; - confidence += impact; + float impact = leaves[(st + threadIdx.x) * 4 + lShift]; - if (confidence <= stages[st]) st = stEnd + 10; - dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); - dprintf("extracted stage: %f\n", stages[st]); - dprintf("computed score: %f\n\n", confidence); + dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact); + dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]); + dprintf("%d: computed score: %f\n",threadIdx.x, impact); + + // scan on shuffl functions + for (int i = 1; i < 32; i *= 2) + { + const float n = __shfl_up(impact, i, 32); + + if (threadIdx.x >= i) + impact += n; + } + + dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact); + + confidence += impact; + if(__any((confidence <= stages[(st + threadIdx.x)]))) break; } - if(st == stEnd) + if(st == stEnd && !threadIdx.x) { int idx = atomicInc(ctr, ndetections); // store detection @@ -205,7 +269,7 @@ namespace icf { int fh = 120; dim3 block(32, 8); - dim3 grid(fw / 32, fh / 8, 47); + dim3 grid(fw, fh / 8, 47); const Level* l = (const Level*)levels.ptr(); const Octave* oct = ((const Octave*)octaves.ptr()); @@ -219,7 +283,7 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel<<>>(l, oct, st, nd, lf, det, max_det, ctr); + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); From b52fea7fae6ec38096dcd57458e0ee6be87da996 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 1 Oct 2012 13:48:16 +0400 Subject: [PATCH 029/155] update soft cascade interface: - add class Detection in interface, - split sync- and async- versions, - add support for detecting at the specific scale. 
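Two notes before the diffstat. First, the Kepler-specific kernel introduced by the previous patch evaluates one weak classifier per warp lane and combines the per-stage impacts with an inclusive warp scan over shuffle instructions, with no shared memory involved. Pulled out as a standalone sketch (assuming blockDim.x == 32 so that threadIdx.x is the lane id, as in the launch configuration used there):

    // Hillis-Steele inclusive scan across a warp: after the loop, lane k
    // holds the sum of the inputs of lanes 0..k. Requires sm_30 or newer.
    __device__ __forceinline__ float warpScanInclusive(float impact)
    {
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
            const float n = __shfl_up(impact, i, 32); // value from lane (lane - i)
            if (threadIdx.x >= i) impact += n;        // lower lanes keep their own value
        }
        return impact;
    }

Second, a rough sketch of how the reworked interface below is driven, distilled from the test added in the next patch (the cascade and image paths are placeholders):

    #include <opencv2/gpu/gpu.hpp>
    #include <opencv2/highgui/highgui.hpp>
    #include <iostream>

    int main()
    {
        cv::gpu::SoftCascade cascade;
        if (!cascade.load("sc_cvpr_2012_to_opencv.xml")) return 1; // placeholder path

        cv::gpu::GpuMat colored(cv::imread("frame.png"));          // placeholder input
        cv::gpu::GpuMat rois, objectBoxes(1, 16384, CV_8UC1);

        // synchronous version; specificScale = -1 evaluates all pyramid levels
        cascade.detectMultiScale(colored, rois, objectBoxes, 1, -1);

        // objects is shrunk to the filled prefix, so cols counts raw bytes
        typedef cv::gpu::SoftCascade::Detection Detection;
        cv::Mat dt(objectBoxes);
        const Detection* dts = (const Detection*)dt.data;
        for (int i = 0; i < (int)(dt.cols / sizeof(Detection)); ++i)
            std::cout << dts[i].x << " " << dts[i].y << " " << dts[i].w << " "
                      << dts[i].h << " " << dts[i].confidence << std::endl;
        return 0;
    }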
--- modules/gpu/include/opencv2/gpu/gpu.hpp | 26 ++++++++++++- modules/gpu/src/cuda/isf-sc.cu | 45 ++++++++++++++++++---- modules/gpu/src/softcascade.cpp | 50 ++++++++++++++++++++----- 3 files changed, 103 insertions(+), 18 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 5008e1027..f171ad904 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1537,6 +1537,18 @@ public: class CV_EXPORTS SoftCascade { public: + + struct CV_EXPORTS Detection + { + ushort x; + ushort y; + ushort w; + ushort h; + float confidence; + int kind; + + enum {PEDESTRIAN = 0}; + }; //! An empty cascade will be created. SoftCascade(); @@ -1559,9 +1571,19 @@ public: //! Param rois is a mask //! Param objects 4-channel matrix thet contain detected rectangles //! Param rejectfactor used for final object box computing - //! Param stream virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects, - int rejectfactor = 1, Stream stream = Stream::Null()); + int rejectfactor = 1, int specificScale = -1); + + //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values. + //! asynchronous version. + //! Param image is input frame for detector. Cascade will be applied to it. + //! Param rois is a mask + //! Param objects 4-channel matrix thet contain detected rectangles + //! Param rejectfactor used for final object box computing + //! Param ndet retrieves number of detections + //! Param stream wrapper for CUDA stream + virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects, + int rejectfactor, GpuMat& ndet, Stream stream); private: struct Filds; diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index f3c92cc6a..3d9a1e10f 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -105,7 +105,7 @@ namespace icf { float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); const float expected_new_area = farea * relScale * relScale; - float approx = sarea / expected_new_area; + float approx = __fdividef(sarea, expected_new_area); dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); @@ -198,12 +198,13 @@ namespace icf { // } __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr) + const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, + const int downscales) { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x; - Level level = levels[blockIdx.z]; + Level level = levels[downscales + blockIdx.z]; if(x >= level.workRect.x || y >= level.workRect.y) return; @@ -236,7 +237,7 @@ namespace icf { dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact); dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]); dprintf("%d: computed score: %f\n",threadIdx.x, impact); - +#pragma unroll // scan on shuffl functions for (int i = 1; i < 32; i *= 2) { @@ -263,13 +264,13 @@ namespace icf { void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, - 
PtrStepSz objects, PtrStepSzi counter) + PtrStepSz objects, PtrStepSzi counter, const int downscales) { int fw = 160; int fh = 120; dim3 block(32, 8); - dim3 grid(fw, fh / 8, 47); + dim3 grid(fw, fh / 8, downscales); const Level* l = (const Level*)levels.ptr(); const Octave* oct = ((const Octave*)octaves.ptr()); @@ -283,8 +284,38 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr); + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); + cudaSafeCall( cudaGetLastError()); + grid = dim3(fw, fh / 8, 47 - downscales); + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, downscales); + cudaSafeCall( cudaGetLastError()); + cudaSafeCall( cudaDeviceSynchronize()); + } + + void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, + PtrStepSzi counter) + { + int fw = 160; + int fh = 120; + + dim3 block(32, 8); + dim3 grid(fw, fh / 8, 1); + + const Level* l = (const Level*)levels.ptr(); + const Octave* oct = ((const Octave*)octaves.ptr()); + const float* st = (const float*)stages.ptr(); + const Node* nd = (const Node*)nodes.ptr(); + const float* lf = (const float*)leaves.ptr(); + uint* ctr = (uint*)counter.ptr(); + Detection* det = (Detection*)objects.ptr(); + uint max_det = objects.cols / sizeof(Detection); + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); + + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 320fbb343..fd94909cf 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -49,7 +49,11 @@ cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); } cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); } cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); } bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; } -void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu();} +void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, int) { throw_nogpu();} +void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) +{ + throw_nogpu(); +} #else @@ -60,6 +64,9 @@ namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins); void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, + PtrStepSzi counter, const int downscales); + void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, PtrStepSzi counter); } @@ -86,6 +93,8 @@ struct cv::gpu::SoftCascade::Filds int origObjWidth; int origObjHeight; + int downscales; + GpuMat 
octaves; GpuMat stages; GpuMat nodes; @@ -120,7 +129,6 @@ struct cv::gpu::SoftCascade::Filds FRAME_WIDTH = 640, FRAME_HEIGHT = 480, TOTAL_SCALES = 55, -// CLASSIFIERS = 5, ORIG_OBJECT_WIDTH = 64, ORIG_OBJECT_HEIGHT = 128, HOG_BINS = 6, @@ -132,7 +140,14 @@ struct cv::gpu::SoftCascade::Filds void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); - device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter); + device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales); + } + + void detectAtScale(int scale, cv::gpu::GpuMat objects, cudaStream_t stream) const + { + cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); + device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects, + detCounter); } private: @@ -160,7 +175,7 @@ private: } }; -inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) +bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) { using namespace device::icf; minScale = mins; @@ -351,6 +366,7 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector Date: Mon, 1 Oct 2012 13:49:21 +0400 Subject: [PATCH 030/155] add test that stores detections on the specific scale --- modules/gpu/test/test_softcascade.cpp | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index c7e3a1f77..0da07298d 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -71,4 +71,63 @@ TEST(SoftCascade, detect) // }); } +class SCSpecific : public ::testing::TestWithParam > { +}; + +namespace { +std::string itoa(long i) +{ + static char s[65]; + sprintf(s, "%ld", i); + return std::string(s); +} +} + +TEST_P(SCSpecific, detect) +{ + std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(xml)); + + std::string path = GET_PARAM(0); + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + path); + + ASSERT_FALSE(coloredCpu.empty()); + GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois; + + int level = GET_PARAM(1); + cascade.detectMultiScale(colored, rois, objectBoxes, 1, level); + + cv::Mat dt(objectBoxes); + typedef cv::gpu::SoftCascade::Detection detection_t; + + detection_t* dts = (detection_t*)dt.data; + cv::Mat result(coloredCpu); + + + std::cout << "Total detections " << (dt.cols / sizeof(detection_t)) << std::endl; + for(int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) + { + detection_t d = dts[i]; + std::cout << "detection: [" << std::setw(4) << d.x << " " << std::setw(4) << d.y + << "] [" << std::setw(4) << d.w << " " << std::setw(4) << d.h << "] " + << std::setw(12) << d.confidence << std::endl; + + cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); + } + + std::cout << "Result stored in " << "/home/kellan/gpu_res_1_oct_" + itoa(level) << "_" + + itoa((dt.cols / sizeof(detection_t))) + ".png" << std::endl; + cv::imwrite("/home/kellan/gpu_res_1_oct_" + itoa(level) + "_" + itoa((dt.cols / sizeof(detection_t))) + ".png", + result); + cv::imshow("res", result); + cv::waitKey(0); +} + +INSTANTIATE_TEST_CASE_P(inLevel, SCSpecific, + testing::Combine( + 
testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 47) + )); + #endif \ No newline at end of file From 56c7ef06e7a5e40e65c6a4d14775aa8defbc2dfb Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 1 Oct 2012 14:08:48 +0400 Subject: [PATCH 031/155] integrate Kepler version --- modules/gpu/src/cuda/isf-sc.cu | 120 +++++++++++++++++---------------- 1 file changed, 61 insertions(+), 59 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 3d9a1e10f..cc4502494 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -138,65 +138,7 @@ namespace icf { return (a - b + c - d); } -// __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, -// const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr) -// { -// const int y = blockIdx.y * blockDim.y + threadIdx.y; -// const int x = blockIdx.x * blockDim.x + threadIdx.x; -// Level level = levels[blockIdx.z]; - -// // if (blockIdx.z != 31) return; -// if(x >= level.workRect.x || y >= level.workRect.y) return; - -// Octave octave = octaves[level.octave]; - -// int st = octave.index * octave.stages; -// const int stEnd = st + 1000;//octave.stages; - -// float confidence = 0.f; - -// // #pragma unroll 2 -// for(; st < stEnd; ++st) -// { -// dprintf("\n\nstage: %d\n", st); -// const int nId = st * 3; -// Node node = nodes[nId]; - -// dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, -// node.threshold >> 28, node.threshold & 0x0FFFFFFFU); - -// float threshold = rescale(level, node.rect, node); -// int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); - -// dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, -// node.rect.w, threshold); - -// int next = 1 + (int)(sum >= threshold); -// dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); - -// node = nodes[nId + next]; -// threshold = rescale(level, node.rect, node); -// sum = get(x, y + (node.threshold >> 28) * 121, node.rect); - -// const int lShift = (next - 1) * 2 + (int)(sum >= threshold); -// float impact = leaves[st * 4 + lShift]; -// confidence += impact; - -// if (confidence <= stages[st]) st = stEnd + 10; -// dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); -// dprintf("extracted stage: %f\n", stages[st]); -// dprintf("computed score: %f\n\n", confidence); -// } - -// if(st == stEnd) -// { -// int idx = atomicInc(ctr, ndetections); -// // store detection -// objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), -// __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); -// } -// } - +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, const int downscales) @@ -261,6 +203,66 @@ namespace icf { __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); } } +#else + __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, + const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, + const int downscales) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + Level level = levels[blockIdx.z]; + + // if 
(blockIdx.z != 31) return; + if(x >= level.workRect.x || y >= level.workRect.y) return; + + Octave octave = octaves[level.octave]; + + int st = octave.index * octave.stages; + const int stEnd = st + 1000;//octave.stages; + + float confidence = 0.f; + + for(; st < stEnd; ++st) + { + dprintf("\n\nstage: %d\n", st); + const int nId = st * 3; + Node node = nodes[nId]; + + dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, + node.threshold >> 28, node.threshold & 0x0FFFFFFFU); + + float threshold = rescale(level, node.rect, node); + int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + + dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, + node.rect.w, threshold); + + int next = 1 + (int)(sum >= threshold); + dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); + + node = nodes[nId + next]; + threshold = rescale(level, node.rect, node); + sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + + const int lShift = (next - 1) * 2 + (int)(sum >= threshold); + float impact = leaves[st * 4 + lShift]; + confidence += impact; + + if (confidence <= stages[st]) st = stEnd + 10; + dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); + dprintf("extracted stage: %f\n", stages[st]); + dprintf("computed score: %f\n\n", confidence); + } + + if(st == stEnd) + { + int idx = atomicInc(ctr, ndetections); + // store detection + objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), + __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); + } + } +#endif void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, From 672cf1f44576bb82f1202c0c15c947b47469304b Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 1 Oct 2012 14:50:28 +0400 Subject: [PATCH 032/155] implement different behaviour for up- and down-scaling --- modules/gpu/src/cuda/isf-sc.cu | 98 +++++++++++++++++++++++---- modules/gpu/src/softcascade.cpp | 13 +++- modules/gpu/test/test_softcascade.cpp | 2 +- 3 files changed, 95 insertions(+), 18 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index cc4502494..c9a92e379 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -86,8 +86,11 @@ namespace icf { } texture thogluv; - __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) + + template + __device__ __forceinline__ float rescale(const Level& level, Node& node) { + uchar4& scaledRect = node.rect; float relScale = level.relScale; float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); @@ -119,7 +122,44 @@ namespace icf { return rootThreshold; } - __device__ __forceinline__ int get(const int x, int y, uchar4 area) + template<> + __device__ __forceinline__ float rescale(const Level& level, Node& node) + { + uchar4& scaledRect = node.rect; + float relScale = level.relScale; + float farea = scaledRect.z * scaledRect.w; + + dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, + scaledRect.z, scaledRect.w); + dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1], + level.scaling[(node.threshold >> 28) > 6]); + + // rescale + scaledRect.x = __float2int_rn(relScale * scaledRect.x); + scaledRect.y = __float2int_rn(relScale * scaledRect.y); + scaledRect.z = 
__float2int_rn(relScale * scaledRect.z); + scaledRect.w = __float2int_rn(relScale * scaledRect.w); + + float sarea = scaledRect.z * scaledRect.w; + + const float expected_new_area = farea * relScale * relScale; + float approx = __fdividef(sarea, expected_new_area); + + dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28), + scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); + + float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; + + rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; + + dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, + level.scaling[(node.threshold >> 28) > 6]); + + return rootThreshold; + } + + template + __device__ __forceinline__ int get(int x, int y, uchar4 area) { dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); @@ -138,7 +178,30 @@ namespace icf { return (a - b + c - d); } + template<> + __device__ __forceinline__ int get(int x, int y, uchar4 area) + { + + dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); + dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x, + x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, + x + area.x, y + area.w); + dprintf("%d: at point %d %d with offset %d\n", x, y, 0); + + x += area.x; + y += area.y; + int a = tex2D(thogluv, x, y); + int b = tex2D(thogluv, x + area.z, y); + int c = tex2D(thogluv, x + area.z, y + area.w); + int d = tex2D(thogluv, x, y + area.w); + + dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d); + + return (a - b + c - d); + } + #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 + template __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, const int downscales) @@ -163,15 +226,15 @@ namespace icf { dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId); Node node = nodes[nId]; - float threshold = rescale(level, node.rect, node); - int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + float threshold = rescale(level, node); + int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); int next = 1 + (int)(sum >= threshold); dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold); node = nodes[nId + next]; - threshold = rescale(level, node.rect, node); - sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + threshold = rescale(level, node); + sum = get(x, y + (node.threshold >> 28) * 121, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[(st + threadIdx.x) * 4 + lShift]; @@ -192,7 +255,7 @@ namespace icf { dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact); confidence += impact; - if(__any((confidence <= stages[(st + threadIdx.x)]))) break; + if(__any((confidence <= stages[(st + threadIdx.x)]))) st += stEnd; } if(st == stEnd && !threadIdx.x) @@ -204,6 +267,7 @@ namespace icf { } } #else + template __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, const int downscales) @@ -231,8 +295,8 @@ namespace icf { dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, node.threshold >> 28, node.threshold & 0x0FFFFFFFU); 
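Stepping back from the hunk for a moment: the two rescale() specializations above differ only in how the rectangle is stored (corner pairs for the down-octaves, x/y plus width/height for the up-octaves, produced by the loader change later in this patch), while the threshold correction itself is shared. A host-side restatement of that correction (the helper name is illustrative):

    // Thresholds are trained at the octave's native scale; after the node
    // rectangle is scaled by relScale and rounded to integers, the threshold
    // is corrected by the ratio of the area actually obtained to the area an
    // exact real-valued scaling would give, then by the per-channel scaling.
    inline float rescaleThreshold(float threshold, float relScale,
                                  float origArea, float roundedArea,
                                  float channelScaling)
    {
        const float expected = origArea * relScale * relScale; // exact scaled area
        const float approx   = roundedArea / expected;         // rounding compensation
        return threshold * approx * channelScaling;
    }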
- float threshold = rescale(level, node.rect, node); - int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + float threshold = rescale(level, node); + int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, threshold); @@ -241,8 +305,8 @@ namespace icf { dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); node = nodes[nId + next]; - threshold = rescale(level, node.rect, node); - sum = get(x, y + (node.threshold >> 28) * 121, node.rect); + threshold = rescale(level, node); + sum = get(x, y + (node.threshold >> 28) * 121, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[st * 4 + lShift]; @@ -286,18 +350,18 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); cudaSafeCall( cudaGetLastError()); grid = dim3(fw, fh / 8, 47 - downscales); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, downscales); + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, downscales); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); } void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter) + PtrStepSzi counter, const int downscales) { int fw = 160; int fh = 120; @@ -317,7 +381,11 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + if (scale >= downscales) + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + else + test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index fd94909cf..8868aa5b1 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -68,7 +68,7 @@ namespace icf { PtrStepSzi counter, const int downscales); void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter); + PtrStepSzi counter, const int downscales); } }}} @@ -147,7 +147,7 @@ struct cv::gpu::SoftCascade::Filds { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects, - detCounter); + detCounter, downscales); } private: @@ -240,6 +240,9 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c { FileNode fns = *it; float scale = (float)fns[SC_OCT_SCALE]; + + bool isUPOctave = scale >= 1; + scales.push_back(scale); ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); ushort2 size; @@ -286,6 +289,12 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c rect.z = saturate_cast((int)*(r_it++)); rect.w = saturate_cast((int)*(r_it++)); + if (isUPOctave) + { + rect.z -= 
rect.x;
+                        rect.w -= rect.y;
+                    }
+
                     uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
                     vnodes.push_back(Node(rect, channel, th));
                     ++ftrs;
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 0da07298d..4d1a4b7a6 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -63,7 +63,7 @@ TEST(SoftCascade, detect)
     cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
                                   + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
     ASSERT_FALSE(coloredCpu.empty());
-    GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois;
+    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois;

     // ASSERT_NO_THROW(
     // {

From 64d6e6a48d4580964a8ce8b589a94e973804c3b9 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Tue, 2 Oct 2012 16:45:40 +0400
Subject: [PATCH 033/155] add getROISize

---
 modules/gpu/include/opencv2/gpu/gpu.hpp |  6 ++++--
 modules/gpu/src/softcascade.cpp         | 21 ++++++++++++++++-----
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f171ad904..9b59c6004 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1572,7 +1572,7 @@ public:
     //! Param objects 4-channel matrix that contains detected rectangles
     //! Param rejectfactor used for final object box computing
     virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-                                  int rejectfactor = 1, int specificScale = -1);
+                                  int rejectfactor = 1, int specificScale = -1) const;

     //! detect specific objects in the input frame for all scales computed from minScale and maxScale values.
     //! asynchronous version.
     //! Param image is input frame for detector. Cascade will be applied to it.
     //! Param rois is a mask
     //! Param objects 4-channel matrix that contains detected rectangles
     //! Param rejectfactor used for final object box computing
     //! Param ndet retrieves number of detections
     //!
From eb91593c08daca478663d7d0805f0ce664fd5d9b Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Tue, 2 Oct 2012 17:25:26 +0400
Subject: [PATCH 034/155] add roi support

---
 modules/gpu/perf/perf_objdetect.cpp   |  6 +++-
 modules/gpu/src/cuda/isf-sc.cu        | 21 ++++++++++---
 modules/gpu/src/softcascade.cpp       | 44 +++++++++++++++++--------
 modules/gpu/test/test_softcascade.cpp | 16 ++++++----
 4 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index 8531372b0..a86337112 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -104,7 +104,11 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog
     cv::gpu::SoftCascade cascade;
     ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));

-    cv::gpu::GpuMat rois, objectBoxes(1, 16384, CV_8UC1);
+    cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+
+    rois.setTo(0);
+    cv::gpu::GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
+    sub.setTo(cv::Scalar::all(1));

     cascade.detectMultiScale(colored, rois, objectBoxes);

     TEST_CYCLE()
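Aside — the isf-sc.cu hunk below gates every candidate position on the ROI mask before the cascade is evaluated. A simplified standalone version of that gating idea, using a plain pointer instead of the texture reference the real kernel binds:

    __global__ void gatedCascade(const unsigned char* roiMask, size_t maskStep)
    {
        const int x = blockIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        // positions outside the ROI are skipped before any cascade work is done
        if (!roiMask[y * maskStep + x]) return;

        // ... evaluate the soft cascade at (x, y)
    }
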
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index c9a92e379..4bde7f7ea 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -86,6 +86,7 @@ namespace icf {
     }

     texture<int, cudaTextureType2D, cudaReadModeElementType> thogluv;
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> troi;

     template<bool isUp>
     __device__ __forceinline__ float rescale(const Level& level, Node& node)
@@ -213,6 +214,8 @@ namespace icf {
         if(x >= level.workRect.x || y >= level.workRect.y) return;

+        if (!tex2D(troi, x, y)) return;
+
         Octave octave = octaves[level.octave];
         int st = octave.index * octave.stages;
         const int stEnd = st + 1024;
@@ -279,6 +282,10 @@ namespace icf {
         // if (blockIdx.z != 31) return;
         if(x >= level.workRect.x || y >= level.workRect.y) return;

+        int roi = tex2D(troi, x, y);
+        printf("%d\n", roi);
+        if (!roi) return;
+
         Octave octave = octaves[level.octave];

         int st = octave.index * octave.stages;
@@ -328,7 +335,7 @@ namespace icf {
     }
 #endif

-    void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
+    void detect(const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
         const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
         PtrStepSzi counter, const int downscales)
     {
@@ -350,6 +357,9 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));

+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<uchar>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+
         test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
         cudaSafeCall( cudaGetLastError());

@@ -359,9 +369,9 @@ namespace icf {
         cudaSafeCall( cudaDeviceSynchronize());
     }

-    void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
-        PtrStepSzi counter, const int downscales)
+    void detectAtScale(const int scale, const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves,
+        const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
+        PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
     {
         int fw = 160;
         int fh = 120;
@@ -381,6 +391,9 @@ namespace icf {
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));

+        cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<uchar>();
+        cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step));
+
         if (scale >= downscales)
             test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
         else
             test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index af836695a..9ea365c5e 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -69,12 +69,29 @@ namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
-    void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const
PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter, const int downscales); - void detectAtScale(const int scale, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter, const int downscales); + + void detect(const PtrStepSzb& rois, + const PtrStepSzb& levels, + const PtrStepSzb& octaves, + const PtrStepSzf& stages, + const PtrStepSzb& nodes, + const PtrStepSzf& leaves, + const PtrStepSzi& hogluv, + PtrStepSz objects, + PtrStepSzi counter, + const int downscales); + + void detectAtScale(const int scale, + const PtrStepSzb& rois, + const PtrStepSzb& levels, + const PtrStepSzb& octaves, + const PtrStepSzf& stages, + const PtrStepSzb& nodes, + const PtrStepSzf& leaves, + const PtrStepSzi& hogluv, + PtrStepSz objects, + PtrStepSzi counter, + const int downscales); } }}} @@ -143,16 +160,16 @@ struct cv::gpu::SoftCascade::Filds }; bool fill(const FileNode &root, const float mins, const float maxs); - void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const + void detect(cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); - device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales); + device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales); } - void detectAtScale(int scale, cv::gpu::GpuMat objects, cudaStream_t stream) const + void detectAtScale(int scale, cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); - device::icf::detectAtScale(scale, levels, octaves, stages, nodes, leaves, hogluv, objects, + device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects, detCounter, downscales); } @@ -467,6 +484,9 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& // only color images are supperted CV_Assert(colored.type() == CV_8UC3); + // we guess user knows about shrincage + CV_Assert((rois.size() == getRoiSize()) && (rois.type() == CV_8UC1)); + // only this window size allowed CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT); @@ -551,9 +571,9 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& #endif if (specificScale == -1) - flds.detect(objects, 0); + flds.detect(rois,objects, 0); else - flds.detectAtScale(specificScale, objects, 0); + flds.detectAtScale(specificScale, rois, objects, 0); cv::Mat out(flds.detCounter); int ndetections = *(out.data); diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 4d1a4b7a6..84602915f 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -63,12 +63,13 @@ TEST(SoftCascade, detect) cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/bahnhof/image_00000000_0.png"); ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois; - // ASSERT_NO_THROW( - // { - cascade.detectMultiScale(colored, rois, objectBoxes); - // }); + GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(0); + GpuMat sub(rois, cv::Rect(rois.cols 
/ 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); + sub.setTo(cv::Scalar::all(1)); + + cascade.detectMultiScale(colored, rois, objectBoxes); } class SCSpecific : public ::testing::TestWithParam > { @@ -93,7 +94,10 @@ TEST_P(SCSpecific, detect) cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + path); ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois; + GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(0); + GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); + sub.setTo(cv::Scalar::all(1)); int level = GET_PARAM(1); cascade.detectMultiScale(colored, rois, objectBoxes, 1, level); From dd595376ba861ce13f5015abc8297bd92bf5938c Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 3 Oct 2012 14:26:26 +0400 Subject: [PATCH 035/155] Add performance test for detection in ROI; refactor soft cascade performance tests --- modules/gpu/test/test_softcascade.cpp | 248 +++++++++++++++++++------- 1 file changed, 181 insertions(+), 67 deletions(-) diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 84602915f..0b266f827 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -41,20 +41,197 @@ //M*/ #include +#include #ifdef HAVE_CUDA - using cv::gpu::GpuMat; -TEST(SoftCascade, readCascade) +// show detection results on input image with cv::imshow +//#define SHOW_DETECTIONS + +#if defined SHOW_DETECTIONS +# define SHOW(res) \ + cv::imshow(#res, result);\ + cv::waitKey(0); +#else +# define SHOW(res) +#endif + +#define GPU_TEST_P(fixture, name, params) \ + class fixture##_##name : public fixture { \ + public: \ + fixture##_##name() {} \ + protected: \ + virtual void body(); \ + }; \ + TEST_P(fixture##_##name, name /*none*/){ body();} \ + INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params); \ + void fixture##_##name::body() + + +typedef std::tr1::tuple roi_fixture_t; + +struct SoftCascadeTest : public ::testing::TestWithParam +{ + typedef cv::gpu::SoftCascade::Detection detection_t; + static cv::Rect getFromTable(int idx) + { + static const cv::Rect rois[] = + { + cv::Rect( 65, 20, 35, 80), + cv::Rect( 95, 35, 45, 40), + cv::Rect( 45, 35, 45, 40), + cv::Rect( 25, 27, 50, 45), + cv::Rect(100, 50, 45, 40), + + cv::Rect( 60, 30, 45, 40), + cv::Rect( 40, 55, 50, 40), + cv::Rect( 48, 37, 72, 80), + cv::Rect( 48, 32, 85, 58), + cv::Rect( 48, 0, 32, 27) + }; + + return rois[idx]; + } + + static std::string itoa(long i) + { + static char s[65]; + sprintf(s, "%ld", i); + return std::string(s); + } + + static std::string getImageName(int level) + { + time_t rawtime; + struct tm * timeinfo; + char buffer [80]; + + time ( &rawtime ); + timeinfo = localtime ( &rawtime ); + + strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo); + return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png"; + } + + static void print(std::ostream &out, const detection_t& d) + { + out << "\x1b[32m[ detection]\x1b[0m (" + << std::setw(4) << d.x + << " " + << std::setw(4) << d.y + << ") (" + << std::setw(4) << d.w + << " " + << std::setw(4) << d.h + << ") " + << std::setw(12) << d.confidence + << std::endl; + } + + static void printTotal(std::ostream &out, int detbytes) + { + out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl; + } + + static void writeResult(const cv::Mat& result, const int level) + { + std::string path = 
cv::tempfile(getImageName(level).c_str()); + cv::imwrite(path, result); + std::cout << "\x1b[32m" << "[ ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl; + } +}; + +GPU_TEST_P(SoftCascadeTest, detectInROI, + testing::Combine( + testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 5))) +{ + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1)); + ASSERT_FALSE(coloredCpu.empty()); + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0))); + + GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(0); + + int nroi = GET_PARAM(2); + cv::RNG rng; + for (int i = 0; i < nroi; ++i) + { + cv::Rect r = getFromTable(rng(10)); + GpuMat sub(rois, r); + sub.setTo(1); + } + + cascade.detectMultiScale(colored, rois, objectBoxes); + + /// + cv::Mat dt(objectBoxes); + typedef cv::gpu::SoftCascade::Detection detection_t; + + detection_t* dts = (detection_t*)dt.data; + cv::Mat result(coloredCpu); + + printTotal(std::cout, dt.cols); + for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) + { + detection_t d = dts[i]; + print(std::cout, d); + cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); + } + + SHOW(result); +} + +GPU_TEST_P(SoftCascadeTest, detectInLevel, + testing::Combine( + testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 47) + )) +{ + std::string xml = cvtest::TS::ptr()->get_data_path() + GET_PARAM(0); + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(xml)); + + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1)); + ASSERT_FALSE(coloredCpu.empty()); + + typedef cv::gpu::SoftCascade::Detection detection_t; + GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(1); + + int level = GET_PARAM(2); + cascade.detectMultiScale(colored, rois, objectBoxes, 1, level); + + cv::Mat dt(objectBoxes); + + detection_t* dts = (detection_t*)dt.data; + cv::Mat result(coloredCpu); + + printTotal(std::cout, dt.cols); + for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) + { + detection_t d = dts[i]; + print(std::cout, d); + cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); + } + + writeResult(result, level); + SHOW(result); +} + +TEST(SoftCascadeTest, readCascade) { std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml"; cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(xml)); - } -TEST(SoftCascade, detect) +TEST(SoftCascadeTest, detect) { std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; cv::gpu::SoftCascade cascade; @@ -71,67 +248,4 @@ TEST(SoftCascade, detect) cascade.detectMultiScale(colored, rois, objectBoxes); } - -class SCSpecific : public ::testing::TestWithParam > { -}; - -namespace { -std::string itoa(long i) -{ - static char s[65]; - sprintf(s, "%ld", i); - return std::string(s); -} -} - -TEST_P(SCSpecific, detect) -{ - std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(xml)); - - 
std::string path = GET_PARAM(0); - cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + path); - - ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes(1, 1000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); - rois.setTo(0); - GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); - sub.setTo(cv::Scalar::all(1)); - - int level = GET_PARAM(1); - cascade.detectMultiScale(colored, rois, objectBoxes, 1, level); - - cv::Mat dt(objectBoxes); - typedef cv::gpu::SoftCascade::Detection detection_t; - - detection_t* dts = (detection_t*)dt.data; - cv::Mat result(coloredCpu); - - - std::cout << "Total detections " << (dt.cols / sizeof(detection_t)) << std::endl; - for(int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) - { - detection_t d = dts[i]; - std::cout << "detection: [" << std::setw(4) << d.x << " " << std::setw(4) << d.y - << "] [" << std::setw(4) << d.w << " " << std::setw(4) << d.h << "] " - << std::setw(12) << d.confidence << std::endl; - - cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); - } - - std::cout << "Result stored in " << "/home/kellan/gpu_res_1_oct_" + itoa(level) << "_" - + itoa((dt.cols / sizeof(detection_t))) + ".png" << std::endl; - cv::imwrite("/home/kellan/gpu_res_1_oct_" + itoa(level) + "_" + itoa((dt.cols / sizeof(detection_t))) + ".png", - result); - cv::imshow("res", result); - cv::waitKey(0); -} - -INSTANTIATE_TEST_CASE_P(inLevel, SCSpecific, - testing::Combine( - testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), - testing::Range(0, 47) - )); - #endif \ No newline at end of file From 838842cc96e1f09cbb97c189a9653c46657ac107 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 3 Oct 2012 16:36:00 +0400 Subject: [PATCH 036/155] Add performance detection test in ROI; refactored soft cascade performance tests --- modules/gpu/perf/perf_objdetect.cpp | 187 +++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 2 deletions(-) diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index a86337112..2224194bb 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -89,11 +89,90 @@ PERF_TEST_P(HOG, CalTech, Values("gpu/caltech/image_00000009_0.png", "gp SANITY_CHECK(found_locations); } +//================================================= ICF SoftCascade =================================================// + typedef pair pair_string; DEF_PARAM_TEST_1(SoftCascade, pair_string); -PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml", - "cv/cascadeandhog/bahnhof/image_00000000_0.png"))) + +// struct SoftCascadeTest : public perf::TestBaseWithParam +// { +// typedef cv::gpu::SoftCascade::Detection detection_t; +// static cv::Rect getFromTable(int idx) +// { +// static const cv::Rect rois[] = +// { +// cv::Rect( 65, 20, 35, 80), +// cv::Rect( 95, 35, 45, 40), +// cv::Rect( 45, 35, 45, 40), +// cv::Rect( 25, 27, 50, 45), +// cv::Rect(100, 50, 45, 40), + +// cv::Rect( 60, 30, 45, 40), +// cv::Rect( 40, 55, 50, 40), +// cv::Rect( 48, 37, 72, 80), +// cv::Rect( 48, 32, 85, 58), +// cv::Rect( 48, 0, 32, 27) +// }; + +// return rois[idx]; +// } + +// static std::string itoa(long i) +// { +// static char s[65]; +// sprintf(s, "%ld", i); +// return std::string(s); +// } + +// static std::string getImageName(int level) +// { +// time_t rawtime; +// struct tm * timeinfo; +// char buffer [80]; + +// time ( &rawtime ); +// 
timeinfo = localtime ( &rawtime ); + +// strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo); +// return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png"; +// } + +// static void print(std::ostream &out, const detection_t& d) +// { +// out << "\x1b[32m[ detection]\x1b[0m (" +// << std::setw(4) << d.x +// << " " +// << std::setw(4) << d.y +// << ") (" +// << std::setw(4) << d.w +// << " " +// << std::setw(4) << d.h +// << ") " +// << std::setw(12) << d.confidence +// << std::endl; +// } + +// static void printTotal(std::ostream &out, int detbytes) +// { +// out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl; +// } + +// static void writeResult(const cv::Mat& result, const int level) +// { +// std::string path = cv::tempfile(getImageName(level).c_str()); +// cv::imwrite(path, result); +// std::cout << "\x1b[32m" << "[ ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl; +// } +// }; + +typedef std::tr1::tuple fixture_t; +typedef perf::TestBaseWithParam SoftCascadeTest; + +PERF_TEST_P(SoftCascadeTest, detect, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) { if (runOnGpu) { @@ -133,6 +212,110 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog } } +static cv::Rect getFromTable(int idx) +{ + static const cv::Rect rois[] = + { + cv::Rect( 65, 20, 35, 80), + cv::Rect( 95, 35, 45, 40), + cv::Rect( 45, 35, 45, 40), + cv::Rect( 25, 27, 50, 45), + cv::Rect(100, 50, 45, 40), + + cv::Rect( 60, 30, 45, 40), + cv::Rect( 40, 55, 50, 40), + cv::Rect( 48, 37, 72, 80), + cv::Rect( 48, 32, 85, 58), + cv::Rect( 48, 0, 32, 27) + }; + + return rois[idx]; +} + +typedef std::tr1::tuple roi_fixture_t; +typedef perf::TestBaseWithParam SoftCascadeTestRoi; + +PERF_TEST_P(SoftCascadeTestRoi, detectInRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 5))) +{ + if (runOnGpu) + { + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(0); + + int nroi = GET_PARAM(2); + cv::RNG rng; + for (int i = 0; i < nroi; ++i) + { + cv::Rect r = getFromTable(rng(10)); + cv::gpu::GpuMat sub(rois, r); + sub.setTo(1); + } + + cv::gpu::GpuMat curr = objectBoxes; + cascade.detectMultiScale(colored, rois, curr); + + TEST_CYCLE() + { + curr = objectBoxes; + cascade.detectMultiScale(colored, rois, curr); + } + } + else + { + FAIL(); + } +} + +PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 10))) +{ + if (runOnGpu) + { + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(0); + + int idx = GET_PARAM(2); + cv::Rect r = getFromTable(idx); + cv::gpu::GpuMat 
sub(rois, r); + sub.setTo(1); + + cv::gpu::GpuMat curr = objectBoxes; + cascade.detectMultiScale(colored, rois, curr); + + TEST_CYCLE() + { + curr = objectBoxes; + cascade.detectMultiScale(colored, rois, curr); + } + } + else + { + FAIL(); + } +} + + /////////////////////////////////////////////////////////////// // HaarClassifier From fdef0adf95052f79a600f77cfb54e41383fa2e58 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 3 Oct 2012 16:39:37 +0400 Subject: [PATCH 037/155] Corrects objects matrix in case of the absence of objects --- modules/gpu/src/softcascade.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 9ea365c5e..8b73ae639 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -422,19 +422,15 @@ inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector Date: Mon, 8 Oct 2012 15:37:28 +0400 Subject: [PATCH 038/155] optimize roi loads only one thread load roi for all block --- modules/gpu/perf/perf_objdetect.cpp | 42 ++++++++++-------- modules/gpu/src/cuda/isf-sc.cu | 62 +++++++++------------------ modules/gpu/src/softcascade.cpp | 2 +- modules/gpu/test/test_softcascade.cpp | 22 +++++++--- 4 files changed, 62 insertions(+), 66 deletions(-) diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 2224194bb..e6efcc2d6 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -176,33 +176,35 @@ PERF_TEST_P(SoftCascadeTest, detect, { if (runOnGpu) { - cv::Mat cpu = readImage (GetParam().second); + cv::Mat cpu = readImage (GET_PARAM(1)); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first))); + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); - cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); - - rois.setTo(0); - cv::gpu::GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); - sub.setTo(cv::Scalar::all(1)); - cascade.detectMultiScale(colored, rois, objectBoxes); + cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + rois.setTo(1); + cv::gpu::transpose(rois, trois); + cascade.detectMultiScale(colored, trois, objectBoxes); TEST_CYCLE() { - cascade.detectMultiScale(colored, rois, objectBoxes); + cascade.detectMultiScale(colored, trois, objectBoxes); } - } else + } + else { - cv::Mat colored = readImage(GetParam().second); + cv::Mat colored = readImage(GET_PARAM(1)); ASSERT_FALSE(colored.empty()); cv::SoftCascade cascade; - ASSERT_TRUE(cascade.load(getDataPath(GetParam().first))); + ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0)))); - std::vector rois, objectBoxes; + std::vector rois; + + typedef cv::SoftCascade::Detection Detection; + std::vectorobjectBoxes; cascade.detectMultiScale(colored, rois, objectBoxes); TEST_CYCLE() @@ -262,13 +264,16 @@ PERF_TEST_P(SoftCascadeTestRoi, detectInRoi, sub.setTo(1); } + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, rois, curr); + cascade.detectMultiScale(colored, trois, curr); TEST_CYCLE() { curr = objectBoxes; - cascade.detectMultiScale(colored, rois, curr); + cascade.detectMultiScale(colored, trois, curr); } } else @@ -301,7 +306,10 @@ PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi, 
sub.setTo(1); cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, rois, curr); + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + + cascade.detectMultiScale(colored, trois, curr); TEST_CYCLE() { @@ -372,7 +380,7 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier, cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty()); - if (PERF_RUN_GPU()) + if (runOnGpu) { cv::gpu::CascadeClassifier_GPU d_cascade; ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second))); diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 4bde7f7ea..8df6907df 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -86,7 +86,6 @@ namespace icf { } texture thogluv; - texture troi; template __device__ __forceinline__ float rescale(const Level& level, Node& node) @@ -130,11 +129,6 @@ namespace icf { float relScale = level.relScale; float farea = scaledRect.z * scaledRect.w; - dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, - scaledRect.z, scaledRect.w); - dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1], - level.scaling[(node.threshold >> 28) > 6]); - // rescale scaledRect.x = __float2int_rn(relScale * scaledRect.x); scaledRect.y = __float2int_rn(relScale * scaledRect.y); @@ -146,15 +140,7 @@ namespace icf { const float expected_new_area = farea * relScale * relScale; float approx = __fdividef(sarea, expected_new_area); - dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28), - scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - - float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; - - rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; - - dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, - level.scaling[(node.threshold >> 28) > 6]); + float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6]; return rootThreshold; } @@ -162,33 +148,17 @@ namespace icf { template __device__ __forceinline__ int get(int x, int y, uchar4 area) { - - dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); - dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x, - x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, - x + area.x, y + area.w); - dprintf("%d: at point %d %d with offset %d\n", x, y, 0); - int a = tex2D(thogluv, x + area.x, y + area.y); int b = tex2D(thogluv, x + area.z, y + area.y); int c = tex2D(thogluv, x + area.z, y + area.w); int d = tex2D(thogluv, x + area.x, y + area.w); - dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d); - return (a - b + c - d); } template<> __device__ __forceinline__ int get(int x, int y, uchar4 area) { - - dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); - dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x, - x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, - x + area.x, y + area.w); - dprintf("%d: at point %d %d with offset %d\n", x, y, 0); - x += area.x; y += area.y; int a = tex2D(thogluv, x, y); @@ -196,11 +166,10 @@ namespace icf { int c = tex2D(thogluv, x + area.z, y + area.w); int d = tex2D(thogluv, x, y + area.w); - dprintf("%d 
retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d); - return (a - b + c - d); } + texture troi; #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 template __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, @@ -210,12 +179,21 @@ namespace icf { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x; + __shared__ volatile char roiCache[8]; + + if (!threadIdx.y && !threadIdx.x) + { + ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x); + } + + __syncthreads(); + + if (!roiCache[threadIdx.y]) return; + Level level = levels[downscales + blockIdx.z]; if(x >= level.workRect.x || y >= level.workRect.y) return; - if (!tex2D(troi, x, y)) return; - Octave octave = octaves[level.octave]; int st = octave.index * octave.stages; const int stEnd = st + 1024; @@ -282,9 +260,9 @@ namespace icf { // if (blockIdx.z != 31) return; if(x >= level.workRect.x || y >= level.workRect.y) return; - int roi = tex2D(troi, x, y); - printf("%d\n", roi); - if (!roi) return; + // int roi = tex2D(troi, x, y); + // printf("%d\n", roi); + // if (!roi) return; Octave octave = octaves[level.octave]; @@ -357,8 +335,8 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step)); + cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); cudaSafeCall( cudaGetLastError()); @@ -391,8 +369,8 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step)); + cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); if (scale >= downscales) test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 8b73ae639..e7fcfff27 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -481,7 +481,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& CV_Assert(colored.type() == CV_8UC3); // we guess user knows about shrincage - CV_Assert((rois.size() == getRoiSize()) && (rois.type() == CV_8UC1)); + CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1)); // only this window size allowed CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT); diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 0b266f827..04fa9b181 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -47,7 +47,7 @@ using cv::gpu::GpuMat; // show detection results on input image with cv::imshow -//#define SHOW_DETECTIONS +#define SHOW_DETECTIONS #if defined SHOW_DETECTIONS # define SHOW(res) \ @@ -154,26 +154,30 @@ GPU_TEST_P(SoftCascadeTest, detectInROI, cv::gpu::SoftCascade cascade; 
ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0))); - GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; rois.setTo(0); int nroi = GET_PARAM(2); + cv::Mat result(coloredCpu); cv::RNG rng; for (int i = 0; i < nroi; ++i) { cv::Rect r = getFromTable(rng(10)); GpuMat sub(rois, r); sub.setTo(1); + r.x *= 4; r.y *= 4; r.width *= 4; r.height *= 4; + cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1); } - cascade.detectMultiScale(colored, rois, objectBoxes); + cv::gpu::transpose(rois, trois); + + cascade.detectMultiScale(colored, trois, objectBoxes); /// cv::Mat dt(objectBoxes); typedef cv::gpu::SoftCascade::Detection detection_t; detection_t* dts = (detection_t*)dt.data; - cv::Mat result(coloredCpu); printTotal(std::cout, dt.cols); for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) @@ -204,8 +208,11 @@ GPU_TEST_P(SoftCascadeTest, detectInLevel, GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); rois.setTo(1); + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + int level = GET_PARAM(2); - cascade.detectMultiScale(colored, rois, objectBoxes, 1, level); + cascade.detectMultiScale(colored, trois, objectBoxes, 1, level); cv::Mat dt(objectBoxes); @@ -246,6 +253,9 @@ TEST(SoftCascadeTest, detect) GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); sub.setTo(cv::Scalar::all(1)); - cascade.detectMultiScale(colored, rois, objectBoxes); + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + + cascade.detectMultiScale(colored, trois, objectBoxes); } #endif \ No newline at end of file From 1b9bccb856c7ff2f80c99b6b085db264cffcf79d Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 8 Oct 2012 20:20:57 +0400 Subject: [PATCH 039/155] move Level into shared memory --- modules/gpu/src/cuda/isf-sc.cu | 24 +++++++----------------- modules/gpu/src/icf.hpp | 2 ++ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 8df6907df..f755f8549 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -94,11 +94,6 @@ namespace icf { float relScale = level.relScale; float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, - scaledRect.z, scaledRect.w); - dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1], - level.scaling[(node.threshold >> 28) > 6]); - // rescale scaledRect.x = __float2int_rn(relScale * scaledRect.x); scaledRect.y = __float2int_rn(relScale * scaledRect.y); @@ -110,14 +105,7 @@ namespace icf { const float expected_new_area = farea * relScale * relScale; float approx = __fdividef(sarea, expected_new_area); - dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28), - scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - - float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; - rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; - - dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, - level.scaling[(node.threshold >> 28) > 6]); + float rootThreshold 
= (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];

         return rootThreshold;
     }
@@ -179,18 +167,20 @@ namespace icf {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x;

-        __shared__ volatile char roiCache[8];
+        // load Level
+        __shared__ Level level;

+        // check POI
+        __shared__ volatile char roiCache[8];
         if (!threadIdx.y && !threadIdx.x)
-        {
             ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
-        }

         __syncthreads();

         if (!roiCache[threadIdx.y]) return;

-        Level level = levels[downscales + blockIdx.z];
+        if (!threadIdx.x)
+            level = levels[downscales + blockIdx.z];

         if(x >= level.workRect.x || y >= level.workRect.y) return;
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 35658892f..a103341fb 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -90,6 +90,8 @@ struct __align__(8) Level //is actually 24 bytes
         objSize.x = round(oct.size.x * relScale);
         objSize.y = round(oct.size.y * relScale);
     }
+
+    __device__ Level(){}
 };

 struct __align__(8) Node
From 0424e2c8d2da9aefa9b8b6898216e7361e3141c3 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Thu, 11 Oct 2012 14:19:24 +0400
Subject: [PATCH 040/155] remove debug code

---
 modules/gpu/src/softcascade.cpp | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index e7fcfff27..b834e0319 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -463,15 +463,6 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
     return true;
 }

-#define USE_REFERENCE_VALUES
-namespace {
-    char *itoa(long i, char* s, int /*dummy_radix*/)
-    {
-        sprintf(s, "%ld", i);
-        return s;
-    }
-}
-
 //================================== synchronous version ============================================================//

 void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& rois,
@@ -488,22 +477,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&

     Filds& flds = *filds;

-#if defined USE_REFERENCE_VALUES
-    cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
-
-    cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
-    char buff[33];
-
-    for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-    {
-        cv::Mat channel;
-        imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
-
-        // std::cout << "channel " << i << std::endl << channel << std::endl;
-        GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
-        gchannel.upload(channel);
-    }
-#else
     GpuMat& plane = flds.plane;
     GpuMat& shrunk = flds.shrunk;
     cudaMemset(plane.data, 0, plane.step * plane.rows);
@@ -512,8 +487,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&

     GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh));
-
-    //cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
     cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY);

     //create hog
@@ -564,7 +537,6 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
         GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
     }
-#endif

     if (specificScale == -1)
         flds.detect(rois,objects, 0);
From ef431f70b640acb969a90b44ed5ffef5a2774719 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Thu, 11 Oct 2012 16:26:32 +0400
Subject: [PATCH 041/155] fix buggy threshold zeroing if feature has zero
area --- modules/gpu/src/cuda/isf-sc.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index f755f8549..f74673c05 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -103,7 +103,7 @@ namespace icf { float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); const float expected_new_area = farea * relScale * relScale; - float approx = __fdividef(sarea, expected_new_area); + float approx = (sarea == 0)? 1: __fdividef(sarea, expected_new_area); float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6]; @@ -226,7 +226,7 @@ namespace icf { dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact); confidence += impact; - if(__any((confidence <= stages[(st + threadIdx.x)]))) st += stEnd; + if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; } if(st == stEnd && !threadIdx.x) From 312a58fcec5d8478cf95352e990ab24601c0108a Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 11 Oct 2012 16:27:23 +0400 Subject: [PATCH 042/155] fix performance test --- modules/gpu/perf/perf_objdetect.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index e6efcc2d6..ced8ee17d 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -186,11 +186,14 @@ PERF_TEST_P(SoftCascadeTest, detect, cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; rois.setTo(1); cv::gpu::transpose(rois, trois); - cascade.detectMultiScale(colored, trois, objectBoxes); + + cv::gpu::GpuMat curr = objectBoxes; + cascade.detectMultiScale(colored, trois, curr); TEST_CYCLE() { - cascade.detectMultiScale(colored, trois, objectBoxes); + curr = objectBoxes; + cascade.detectMultiScale(colored, trois, curr); } } else From fa62e2b72f5d1d84b1d056d0e42b80486e210f71 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 11 Oct 2012 16:47:49 +0400 Subject: [PATCH 043/155] move preprocessing into separate function --- modules/gpu/src/softcascade.cpp | 134 +++++++++++++++++--------------- 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index b834e0319..8fa82867f 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -93,6 +93,10 @@ namespace icf { PtrStepSzi counter, const int downscales); } +namespace imgproc +{ + void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream); +} }}} struct cv::gpu::SoftCascade::Filds @@ -104,8 +108,8 @@ struct cv::gpu::SoftCascade::Filds fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1); luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1); - integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); - hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1); + integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1); + hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 64, CV_32SC1); detCounter.create(1,1, CV_32SC1); } @@ -146,6 +150,8 @@ struct cv::gpu::SoftCascade::Filds std::vector scales; + static const int shrinkage = 4; + enum { BOOST = 0 }; enum { @@ -160,19 +166,80 @@ struct 
cv::gpu::SoftCascade::Filds }; bool fill(const FileNode &root, const float mins, const float maxs); - void detect(cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const + void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales); } - void detectAtScale(int scale, cv::gpu::GpuMat roi, cv::gpu::GpuMat objects, cudaStream_t stream) const + void detectAtScale(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects, detCounter, downscales); } + void preprocess(const cv::gpu::GpuMat& colored) + { + cudaMemset(plane.data, 0, plane.step * plane.rows); + + int fw = Filds::FRAME_WIDTH; + int fh = Filds::FRAME_HEIGHT; + + GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh)); + cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY); + + //create hog + GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); + GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); + + cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f); + cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f); + + GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); + GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); + + cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true); + + // normolize magnitude to uchar interval and angles to 6 bins + + GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); + GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); + + cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag); + cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); + + //create uchar magnitude + GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh)); + nmag.convertTo(cmag, CV_8UC1); + + // create luv + cv::gpu::cvtColor(colored, luv, CV_BGR2Luv); + + std::vector splited; + for(int i = 0; i < Filds::LUV_BINS; ++i) + { + splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh))); + } + + cv::gpu::split(luv, splited); + + device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS); + + GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS)); + cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); + + fw /= shrinkage; + fh /= shrinkage; + + for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) + { + GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh )); + GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1)); + cv::gpu::integralBuffered(channel, sum, integralBuffer); + } + } + private: void calcLevels(const std::vector& octs, int frameW, int frameH, int nscales); @@ -479,64 +546,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& Filds& flds = *filds; - GpuMat& plane = flds.plane; - GpuMat& shrunk = flds.shrunk; - cudaMemset(plane.data, 0, plane.step * plane.rows); - - int fw = Filds::FRAME_WIDTH; - int fh = Filds::FRAME_HEIGHT; - - GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh)); - cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY); - - //create hog - GpuMat dfdx(flds.fplane, cv::Rect(0, 0, fw, fh)); - GpuMat dfdy(flds.fplane, cv::Rect(0, fh, fw, fh)); - - cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f); - cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f); - - GpuMat mag(flds.fplane, cv::Rect(0, 2 * fh, fw, fh)); - 
GpuMat ang(flds.fplane, cv::Rect(0, 3 * fh, fw, fh)); - - cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true); - - // normolize magnitude to uchar interval and angles to 6 bins - - GpuMat nmag(flds.fplane, cv::Rect(0, 4 * fh, fw, fh)); - GpuMat nang(flds.fplane, cv::Rect(0, 5 * fh, fw, fh)); - - cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag); - cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); - - //create uchar magnitude - GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh)); - nmag.convertTo(cmag, CV_8UC1); - - // create luv - cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv); - - std::vector splited; - for(int i = 0; i < Filds::LUV_BINS; ++i) - { - splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh))); - } - - cv::gpu::split(flds.luv, splited); - - device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS); - - GpuMat hogluv(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS)); - cv::gpu::resize(hogluv, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); - - fw /= 4; - fh /= 4; - for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) - { - GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh )); - GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1)); - cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); - } + flds.preprocess(colored); if (specificScale == -1) flds.detect(rois,objects, 0); From 916ba4c0ea1493c6883e45d990a85421f975a60b Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 11 Oct 2012 17:05:23 +0400 Subject: [PATCH 044/155] refactor preprocessing --- modules/gpu/src/softcascade.cpp | 122 +++++++++++++++++++------------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 8fa82867f..c93949f1c 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -183,61 +183,16 @@ struct cv::gpu::SoftCascade::Filds { cudaMemset(plane.data, 0, plane.step * plane.rows); - int fw = Filds::FRAME_WIDTH; - int fh = Filds::FRAME_HEIGHT; + static const int fw = Filds::FRAME_WIDTH; + static const int fh = Filds::FRAME_HEIGHT; GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh)); cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY); + createHogBins(gray); - //create hog - GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); - GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); + createLuvBins(colored); - cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f); - cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f); - - GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); - GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); - - cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true); - - // normolize magnitude to uchar interval and angles to 6 bins - - GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); - GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); - - cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag); - cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); - - //create uchar magnitude - GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh)); - nmag.convertTo(cmag, CV_8UC1); - - // create luv - cv::gpu::cvtColor(colored, luv, CV_BGR2Luv); - - std::vector splited; - for(int i = 0; i < Filds::LUV_BINS; ++i) - { - splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh))); - } - - cv::gpu::split(luv, splited); - - device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS); - - GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS)); - cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, 
CV_INTER_AREA); - - fw /= shrinkage; - fh /= shrinkage; - - for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) - { - GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh )); - GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1)); - cv::gpu::integralBuffered(channel, sum, integralBuffer); - } + integrate(); } private: @@ -263,6 +218,72 @@ private: } return res; } + + void createHogBins(const cv::gpu::GpuMat& gray) + { + static const int fw = Filds::FRAME_WIDTH; + static const int fh = Filds::FRAME_HEIGHT; + + GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); + GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); + + cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f); + cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f); + + GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); + GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); + + cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true); + + // normolize magnitude to uchar interval and angles to 6 bins + + GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); + GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); + + cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag); + cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); + + //create uchar magnitude + GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh)); + nmag.convertTo(cmag, CV_8UC1); + + device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS); + } + + void createLuvBins(const cv::gpu::GpuMat& colored) + { + static const int fw = Filds::FRAME_WIDTH; + static const int fh = Filds::FRAME_HEIGHT; + + cv::gpu::cvtColor(colored, luv, CV_BGR2Luv); + + std::vector splited; + for(int i = 0; i < Filds::LUV_BINS; ++i) + { + splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh))); + } + + cv::gpu::split(luv, splited); + } + + void integrate() + { + int fw = Filds::FRAME_WIDTH; + int fh = Filds::FRAME_HEIGHT; + + GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS)); + cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); + + fw /= shrinkage; + fh /= shrinkage; + + for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) + { + GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh )); + GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1)); + cv::gpu::integralBuffered(channel, sum, integralBuffer); + } + } }; bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) @@ -572,5 +593,4 @@ cv::Size cv::gpu::SoftCascade::getRoiSize() const return cv::Size(Filds::FRAME_WIDTH / 4, Filds::FRAME_HEIGHT / 4); } - #endif \ No newline at end of file From 0898c3c651ef6196a2287d61ef9d6644b5653743 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 11 Oct 2012 18:24:48 +0400 Subject: [PATCH 045/155] kernel policy --- modules/gpu/src/cuda/isf-sc.cu | 62 +++++++++------------------------ modules/gpu/src/icf.hpp | 27 ++++++++++++++ modules/gpu/src/softcascade.cpp | 44 +++++------------------ 3 files changed, 52 insertions(+), 81 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index f74673c05..74e47ba19 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -303,21 +303,16 @@ namespace icf { } #endif - void detect(const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales) + template<> + void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, + PtrStepSz 
objects, PtrStepSzi counter, const int downscales, const int scale) const { int fw = 160; int fh = 120; dim3 block(32, 8); - dim3 grid(fw, fh / 8, downscales); + dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1); - const Level* l = (const Level*)levels.ptr(); - const Octave* oct = ((const Octave*)octaves.ptr()); - const float* st = (const float*)stages.ptr(); - const Node* nd = (const Node*)nodes.ptr(); - const float* lf = (const float*)leaves.ptr(); uint* ctr = (uint*)counter.ptr(); Detection* det = (Detection*)objects.ptr(); uint max_det = objects.cols / sizeof(Detection); @@ -328,44 +323,21 @@ namespace icf { cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); - cudaSafeCall( cudaGetLastError()); + if (scale == -1) + { + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0); + cudaSafeCall( cudaGetLastError()); - grid = dim3(fw, fh / 8, 47 - downscales); - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, downscales); - cudaSafeCall( cudaGetLastError()); - cudaSafeCall( cudaDeviceSynchronize()); - } - - void detectAtScale(const int scale, const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, - const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales) - { - int fw = 160; - int fh = 120; - - dim3 block(32, 8); - dim3 grid(fw, fh / 8, 1); - - const Level* l = (const Level*)levels.ptr(); - const Octave* oct = ((const Octave*)octaves.ptr()); - const float* st = (const float*)stages.ptr(); - const Node* nd = (const Node*)nodes.ptr(); - const float* lf = (const float*)leaves.ptr(); - uint* ctr = (uint*)counter.ptr(); - Detection* det = (Detection*)objects.ptr(); - uint max_det = objects.cols / sizeof(Detection); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); - - if (scale >= downscales) - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + grid = dim3(fw, fh / 8, 47 - downscales); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales); + } else - test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); + { + if (scale >= downscales) + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); + else + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); + } cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index a103341fb..06c81149e 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -124,6 +124,33 @@ struct __align__(16) Detection : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {}; }; +struct CascadePolicy +{ + enum {STA_X = 32, STA_Y = 8}; +}; + +template +struct CascadeInvoker +{ + CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {} + CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages, + const PtrStepSzb& _nodes, const PtrStepSzf& _leaves) + : levels((const 
Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
+      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
+    {}
+
+    const Level* levels;
+    const Octave* octaves;
+
+    const float* stages;
+
+    const Node* nodes;
+    const float* leaves;
+
+    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects,
+        PtrStepSzi counter, const int downscales, const int scale = -1) const;
+};
+
 }
 }}}

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index c93949f1c..f25c5a34d 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -69,29 +69,6 @@ namespace cv { namespace gpu { namespace device {
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
-
-    void detect(const PtrStepSzb& rois,
-                const PtrStepSzb& levels,
-                const PtrStepSzb& octaves,
-                const PtrStepSzf& stages,
-                const PtrStepSzb& nodes,
-                const PtrStepSzf& leaves,
-                const PtrStepSzi& hogluv,
-                PtrStepSz objects,
-                PtrStepSzi counter,
-                const int downscales);
-
-    void detectAtScale(const int scale,
-                const PtrStepSzb& rois,
-                const PtrStepSzb& levels,
-                const PtrStepSzb& octaves,
-                const PtrStepSzf& stages,
-                const PtrStepSzb& nodes,
-                const PtrStepSzf& leaves,
-                const PtrStepSzi& hogluv,
-                PtrStepSz objects,
-                PtrStepSzi counter,
-                const int downscales);
 }
 
 namespace imgproc {
@@ -150,6 +127,8 @@ struct cv::gpu::SoftCascade::Filds
 
     std::vector scales;
 
+    device::icf::CascadeInvoker invoker;
+
     static const int shrinkage = 4;
 
     enum { BOOST = 0 };
@@ -166,17 +145,11 @@ struct cv::gpu::SoftCascade::Filds
     };
 
     bool fill(const FileNode &root, const float mins, const float maxs);
-    void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
+    void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
     {
         cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
-    }
-
-    void detectAtScale(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
-    {
-        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
-        device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects,
-            detCounter, downscales);
+        // device::icf::CascadeInvoker invoker(levels, octaves, stages, nodes, leaves);
+        invoker(roi, hogluv, objects, detCounter, downscales, scale);
     }
 
     void preprocess(const cv::gpu::GpuMat& colored)
@@ -439,6 +412,8 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c
     calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
     CV_Assert(!levels.empty());
 
+    invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves);
+
     return true;
 }
 
@@ -569,10 +544,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     flds.preprocess(colored);
 
-    if (specificScale == -1)
-        flds.detect(rois,objects, 0);
-    else
-        flds.detectAtScale(specificScale, rois, objects, 0);
+    flds.detect(specificScale, rois, objects, 0);
 
     cv::Mat out(flds.detCounter);
     int ndetections = *(out.data);

From f196e9fda44d71d0be5081dcb0c3d618cd8f06a7 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Thu, 11 Oct 2012 19:11:39 +0400
Subject: [PATCH 046/155] add factory method for Fields structure

---
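This patch collapses the separate construct-then-fill() initialization into a
static factory, so a Filds object can no longer exist half-initialized. A
condensed sketch of the resulting call site, as it appears in
SoftCascade::load() at the end of this patch (error handling reduced to a
null check):

    cv::FileStorage fs(filename, cv::FileStorage::READ);
    if (!fs.isOpened()) return false;
    filds = Filds::parseCascade(fs.getFirstTopLevelNode(), minScale, maxScale);
    return filds != 0;  // parseCascade() yields 0 for a malformed cascade file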
modules/gpu/src/softcascade.cpp | 578 ++++++++++++++++---------------- 1 file changed, 281 insertions(+), 297 deletions(-) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index f25c5a34d..fc7114b5f 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -78,77 +78,255 @@ namespace imgproc struct cv::gpu::SoftCascade::Filds { + struct CascadeIntrinsics + { + static const float lambda = 1.099f, a = 0.89f; - Filds() + static float getFor(int channel, float scaling) + { + CV_Assert(channel < 10); + + if (fabs(scaling - 1.f) < FLT_EPSILON) + return 1.f; + + // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers + static const float A[2][2] = + { //channel <= 6, otherwise + { 0.89f, 1.f}, // down + { 1.00f, 1.f} // up + }; + + static const float B[2][2] = + { //channel <= 6, otherwise + { 1.099f / ::log(2), 2.f}, // down + { 0.f, 2.f} // up + }; + + float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; + float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; + + // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); + return a * ::pow(scaling, b); + } + }; + + static Filds* parseCascade(const FileNode &root, const float mins, const float maxs) + { + static const char *const SC_STAGE_TYPE = "stageType"; + static const char *const SC_BOOST = "BOOST"; + + static const char *const SC_FEATURE_TYPE = "featureType"; + static const char *const SC_ICF = "ICF"; + + // only Ada Boost supported + std::string stageTypeStr = (string)root[SC_STAGE_TYPE]; + CV_Assert(stageTypeStr == SC_BOOST); + + // only HOG-like integral channel features cupported + string featureTypeStr = (string)root[SC_FEATURE_TYPE]; + CV_Assert(featureTypeStr == SC_ICF); + + static const char *const SC_ORIG_W = "width"; + static const char *const SC_ORIG_H = "height"; + + int origWidth = (int)root[SC_ORIG_W]; + CV_Assert(origWidth == ORIG_OBJECT_WIDTH); + + int origHeight = (int)root[SC_ORIG_H]; + CV_Assert(origHeight == ORIG_OBJECT_HEIGHT); + + static const char *const SC_OCTAVES = "octaves"; + static const char *const SC_STAGES = "stages"; + static const char *const SC_FEATURES = "features"; + + static const char *const SC_WEEK = "weakClassifiers"; + static const char *const SC_INTERNAL = "internalNodes"; + static const char *const SC_LEAF = "leafValues"; + + static const char *const SC_OCT_SCALE = "scale"; + static const char *const SC_OCT_STAGES = "stageNum"; + static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor"; + + static const char *const SC_STAGE_THRESHOLD = "stageThreshold"; + + static const char * const SC_F_CHANNEL = "channel"; + static const char * const SC_F_RECT = "rect"; + + + FileNode fn = root[SC_OCTAVES]; + if (fn.empty()) return false; + + using namespace device::icf; + + std::vector voctaves; + std::vector vstages; + std::vector vnodes; + std::vector vleaves; + + FileNodeIterator it = fn.begin(), it_end = fn.end(); + int feature_offset = 0; + ushort octIndex = 0; + ushort shrinkage = 1; + + for (; it != it_end; ++it) + { + FileNode fns = *it; + float scale = (float)fns[SC_OCT_SCALE]; + + bool isUPOctave = scale >= 1; + + ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); + ushort2 size; + size.x = cvRound(ORIG_OBJECT_WIDTH * scale); + size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); + shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); + + Octave octave(octIndex, nstages, shrinkage, size, scale); + CV_Assert(octave.stages > 0); + voctaves.push_back(octave); + + FileNode ffs = 
fns[SC_FEATURES]; + if (ffs.empty()) return false; + + FileNodeIterator ftrs = ffs.begin(); + + fns = fns[SC_STAGES]; + if (fn.empty()) return false; + + // for each stage (~ decision tree with H = 2) + FileNodeIterator st = fns.begin(), st_end = fns.end(); + for (; st != st_end; ++st ) + { + fns = *st; + vstages.push_back((float)fns[SC_STAGE_THRESHOLD]); + + fns = fns[SC_WEEK]; + FileNodeIterator ftr = fns.begin(), ft_end = fns.end(); + for (; ftr != ft_end; ++ftr) + { + fns = (*ftr)[SC_INTERNAL]; + FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); + for (; inIt != inIt_end;) + { + // int feature = (int)(*(inIt +=2)) + feature_offset; + inIt +=3; + // extract feature, Todo:check it + uint th = saturate_cast((float)(*(inIt++))); + cv::FileNode ftn = (*ftrs)[SC_F_RECT]; + cv::FileNodeIterator r_it = ftn.begin(); + uchar4 rect; + rect.x = saturate_cast((int)*(r_it++)); + rect.y = saturate_cast((int)*(r_it++)); + rect.z = saturate_cast((int)*(r_it++)); + rect.w = saturate_cast((int)*(r_it++)); + + if (isUPOctave) + { + rect.z -= rect.x; + rect.w -= rect.y; + } + + uint channel = saturate_cast((int)(*ftrs)[SC_F_CHANNEL]); + vnodes.push_back(Node(rect, channel, th)); + ++ftrs; + } + + fns = (*ftr)[SC_LEAF]; + inIt = fns.begin(), inIt_end = fns.end(); + for (; inIt != inIt_end; ++inIt) + vleaves.push_back((float)(*inIt)); + } + } + + feature_offset += octave.stages * 3; + ++octIndex; + } + + cv::Mat hoctaves(1, voctaves.size() * sizeof(Octave), CV_8UC1, (uchar*)&(voctaves[0])); + CV_Assert(!hoctaves.empty()); + + cv::Mat hstages(cv::Mat(vstages).reshape(1,1)); + CV_Assert(!hstages.empty()); + + cv::Mat hnodes(1, vnodes.size() * sizeof(Node), CV_8UC1, (uchar*)&(vnodes[0]) ); + CV_Assert(!hnodes.empty()); + + cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1)); + CV_Assert(!hleaves.empty()); + + std::vector vlevels; + float logFactor = (::log(maxs) - ::log(mins)) / (TOTAL_SCALES -1); + + float scale = mins; + int downscales = 0; + for (int sc = 0; sc < TOTAL_SCALES; ++sc) + { + int width = ::std::max(0.0f, FRAME_WIDTH - (origWidth * scale)); + int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale)); + + float logScale = ::log(scale); + int fit = fitOctave(voctaves, logScale); + + Level level(fit, voctaves[fit], scale, width, height); + level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale); + level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale); + + if (!width || !height) + break; + else + { + vlevels.push_back(level); + if (voctaves[fit].scale < 1) ++downscales; + } + + if (::fabs(scale - maxs) < FLT_EPSILON) break; + scale = ::std::min(maxs, ::expf(::log(scale) + logFactor)); + + // std::cout << "level " << sc + // << " octeve " + // << vlevels[sc].octave + // << " relScale " + // << vlevels[sc].relScale + // << " " << vlevels[sc].shrScale + // << " [" << (int)vlevels[sc].objSize.x + // << " " << (int)vlevels[sc].objSize.y << "] [" + // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; + } + + cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ); + CV_Assert(!hlevels.empty()); + + Filds* filds = new Filds(mins, maxs, origWidth, origHeight, shrinkage, downscales, + hoctaves, hstages, hnodes, hleaves, hlevels); + + return filds; + } + + Filds( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds, + cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves, cv::Mat hlevels) + : minScale(mins), maxScale(maxs), origObjWidth(ow), origObjHeight(oh), 
shrinkage(shr), downscales(ds) { plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1); luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); - shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1); + shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1); integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1); - hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 64, CV_32SC1); + hogluv.create((FRAME_HEIGHT / shr + 1) * HOG_LUV_BINS, FRAME_WIDTH / shr + 64, CV_32SC1); detCounter.create(1,1, CV_32SC1); + + octaves.upload(hoctaves); + stages.upload(hstages); + nodes.upload(hnodes); + leaves.upload(hleaves); + levels.upload(hlevels); + + invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); + } - // scales range - float minScale; - float maxScale; - - int origObjWidth; - int origObjHeight; - - int downscales; - - GpuMat octaves; - GpuMat stages; - GpuMat nodes; - GpuMat leaves; - GpuMat levels; - - GpuMat detCounter; - - // preallocated buffer 640x480x10 for hogluv + 640x480 got gray - GpuMat plane; - - // preallocated buffer for floating point operations - GpuMat fplane; - - // temporial mat for cvtColor - GpuMat luv; - - // 160x120x10 - GpuMat shrunk; - - // temporial mat for integrall - GpuMat integralBuffer; - - // 161x121x10 - GpuMat hogluv; - - std::vector scales; - - device::icf::CascadeInvoker invoker; - - static const int shrinkage = 4; - - enum { BOOST = 0 }; - enum - { - FRAME_WIDTH = 640, - FRAME_HEIGHT = 480, - TOTAL_SCALES = 55, - ORIG_OBJECT_WIDTH = 64, - ORIG_OBJECT_HEIGHT = 128, - HOG_BINS = 6, - LUV_BINS = 3, - HOG_LUV_BINS = 10 - }; - - bool fill(const FileNode &root, const float mins, const float maxs); void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const { cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); - // device::icf::CascadeInvoker invoker(levels, octaves, stages, nodes, leaves); invoker(roi, hogluv, objects, detCounter, downscales, scale); } @@ -169,11 +347,9 @@ struct cv::gpu::SoftCascade::Filds } private: - void calcLevels(const std::vector& octs, - int frameW, int frameH, int nscales); typedef std::vector::const_iterator octIt_t; - int fitOctave(const std::vector& octs, const float& logFactor) const + static int fitOctave(const std::vector& octs, const float& logFactor) { float minAbsLog = FLT_MAX; int res = 0; @@ -257,247 +433,61 @@ private: cv::gpu::integralBuffered(channel, sum, integralBuffer); } } -}; -bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs) -{ - using namespace device::icf; - minScale = mins; - maxScale = maxs; +public: - // cascade properties - static const char *const SC_STAGE_TYPE = "stageType"; - static const char *const SC_BOOST = "BOOST"; + // scales range + float minScale; + float maxScale; - static const char *const SC_FEATURE_TYPE = "featureType"; - static const char *const SC_ICF = "ICF"; + int origObjWidth; + int origObjHeight; - static const char *const SC_ORIG_W = "width"; - static const char *const SC_ORIG_H = "height"; + const int shrinkage; + int downscales; - static const char *const SC_OCTAVES = "octaves"; - static const char *const SC_STAGES = "stages"; - static const char *const SC_FEATURES = "features"; + // preallocated buffer 640x480x10 for hogluv + 640x480 got gray + GpuMat plane; - static const char *const 
SC_WEEK = "weakClassifiers"; - static const char *const SC_INTERNAL = "internalNodes"; - static const char *const SC_LEAF = "leafValues"; + // preallocated buffer for floating point operations + GpuMat fplane; - static const char *const SC_OCT_SCALE = "scale"; - static const char *const SC_OCT_STAGES = "stageNum"; - static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor"; + // temporial mat for cvtColor + GpuMat luv; - static const char *const SC_STAGE_THRESHOLD = "stageThreshold"; + // 160x120x10 + GpuMat shrunk; - static const char * const SC_F_CHANNEL = "channel"; - static const char * const SC_F_RECT = "rect"; + // temporial mat for integrall + GpuMat integralBuffer; - // only Ada Boost supported - std::string stageTypeStr = (string)root[SC_STAGE_TYPE]; - CV_Assert(stageTypeStr == SC_BOOST); + // 161x121x10 + GpuMat hogluv; - // only HOG-like integral channel features cupported - string featureTypeStr = (string)root[SC_FEATURE_TYPE]; - CV_Assert(featureTypeStr == SC_ICF); + GpuMat detCounter; - origObjWidth = (int)root[SC_ORIG_W]; - CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH); + // Cascade from xml + GpuMat octaves; + GpuMat stages; + GpuMat nodes; + GpuMat leaves; + GpuMat levels; - origObjHeight = (int)root[SC_ORIG_H]; - CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT); + device::icf::CascadeInvoker invoker; - FileNode fn = root[SC_OCTAVES]; - if (fn.empty()) return false; - - std::vector voctaves; - std::vector vstages; - std::vector vnodes; - std::vector vleaves; - scales.clear(); - - FileNodeIterator it = fn.begin(), it_end = fn.end(); - int feature_offset = 0; - ushort octIndex = 0; - ushort shrinkage = 1; - - for (; it != it_end; ++it) + enum { BOOST = 0 }; + enum { - FileNode fns = *it; - float scale = (float)fns[SC_OCT_SCALE]; - - bool isUPOctave = scale >= 1; - - scales.push_back(scale); - ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); - ushort2 size; - size.x = cvRound(ORIG_OBJECT_WIDTH * scale); - size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); - shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); - - Octave octave(octIndex, nstages, shrinkage, size, scale); - CV_Assert(octave.stages > 0); - voctaves.push_back(octave); - - FileNode ffs = fns[SC_FEATURES]; - if (ffs.empty()) return false; - - FileNodeIterator ftrs = ffs.begin(); - - fns = fns[SC_STAGES]; - if (fn.empty()) return false; - - // for each stage (~ decision tree with H = 2) - FileNodeIterator st = fns.begin(), st_end = fns.end(); - for (; st != st_end; ++st ) - { - fns = *st; - vstages.push_back((float)fns[SC_STAGE_THRESHOLD]); - - fns = fns[SC_WEEK]; - FileNodeIterator ftr = fns.begin(), ft_end = fns.end(); - for (; ftr != ft_end; ++ftr) - { - fns = (*ftr)[SC_INTERNAL]; - FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); - for (; inIt != inIt_end;) - { - // int feature = (int)(*(inIt +=2)) + feature_offset; - inIt +=3; - // extract feature, Todo:check it - uint th = saturate_cast((float)(*(inIt++))); - cv::FileNode ftn = (*ftrs)[SC_F_RECT]; - cv::FileNodeIterator r_it = ftn.begin(); - uchar4 rect; - rect.x = saturate_cast((int)*(r_it++)); - rect.y = saturate_cast((int)*(r_it++)); - rect.z = saturate_cast((int)*(r_it++)); - rect.w = saturate_cast((int)*(r_it++)); - - if (isUPOctave) - { - rect.z -= rect.x; - rect.w -= rect.y; - } - - uint channel = saturate_cast((int)(*ftrs)[SC_F_CHANNEL]); - vnodes.push_back(Node(rect, channel, th)); - ++ftrs; - } - - fns = (*ftr)[SC_LEAF]; - inIt = fns.begin(), inIt_end = fns.end(); - for (; inIt != inIt_end; ++inIt) - vleaves.push_back((float)(*inIt)); 
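Both the old fill() being removed here and the new parseCascade() above store
each weak classifier as Node(rect, channel, th). The detection kernel in
isf-sc.cu later recovers the channel with (node.threshold >> 28) to pick a row
block in the stacked integral image, which implies the Node constructor in
icf.hpp (not shown in this series) packs the two fields into one word; a
hedged sketch of that assumed layout:

    // assumption: 4-bit channel id in the high bits, 28-bit threshold below it
    unsigned int packed    = (channel << 28) | (th & 0x0FFFFFFF);
    unsigned int chId      = packed >> 28;         // y-offset multiplier in the kernel
    unsigned int threshold = packed & 0x0FFFFFFF;  // compared against the rescaled sum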
- } - } - - feature_offset += octave.stages * 3; - ++octIndex; - } - - // upload in gpu memory - octaves.upload(cv::Mat(1, voctaves.size() * sizeof(Octave), CV_8UC1, (uchar*)&(voctaves[0]) )); - CV_Assert(!octaves.empty()); - - stages.upload(cv::Mat(vstages).reshape(1,1)); - CV_Assert(!stages.empty()); - - nodes.upload(cv::Mat(1, vnodes.size() * sizeof(Node), CV_8UC1, (uchar*)&(vnodes[0]) )); - CV_Assert(!nodes.empty()); - - leaves.upload(cv::Mat(vleaves).reshape(1,1)); - CV_Assert(!leaves.empty()); - - // compute levels - calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); - CV_Assert(!levels.empty()); - - invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); - - return true; -} - -namespace { - struct CascadeIntrinsics - { - static const float lambda = 1.099f, a = 0.89f; - - static float getFor(int channel, float scaling) - { - CV_Assert(channel < 10); - - if (fabs(scaling - 1.f) < FLT_EPSILON) - return 1.f; - - // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers - static const float A[2][2] = - { //channel <= 6, otherwise - { 0.89f, 1.f}, // down - { 1.00f, 1.f} // up - }; - - static const float B[2][2] = - { //channel <= 6, otherwise - { 1.099f / log(2), 2.f}, // down - { 0.f, 2.f} // up - }; - - float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; - float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; - - // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); - return a * pow(scaling, b); - } + FRAME_WIDTH = 640, + FRAME_HEIGHT = 480, + TOTAL_SCALES = 55, + ORIG_OBJECT_WIDTH = 64, + ORIG_OBJECT_HEIGHT = 128, + HOG_BINS = 6, + LUV_BINS = 3, + HOG_LUV_BINS = 10 }; -} - -inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector& octs, - int frameW, int frameH, int nscales) -{ - CV_Assert(nscales > 1); - using device::icf::Level; - - std::vector vlevels; - float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1); - - float scale = minScale; - downscales = 0; - for (int sc = 0; sc < nscales; ++sc) - { - int width = ::std::max(0.0f, frameW - (origObjWidth * scale)); - int height = ::std::max(0.0f, frameH - (origObjHeight * scale)); - - float logScale = ::log(scale); - int fit = fitOctave(octs, logScale); - - Level level(fit, octs[fit], scale, width, height); - level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale); - level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale); - - if (!width || !height) - break; - else - { - vlevels.push_back(level); - if (octs[fit].scale < 1) ++downscales; - } - - if (::fabs(scale - maxScale) < FLT_EPSILON) break; - scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor)); - - // std::cout << "level " << sc - // << " octeve " - // << vlevels[sc].octave - // << " relScale " - // << vlevels[sc].relScale - // << " " << vlevels[sc].shrScale - // << " [" << (int)vlevels[sc].objSize.x - // << " " << (int)vlevels[sc].objSize.y << "] [" - // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; - } - - levels.upload(cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) )); -} +}; cv::gpu::SoftCascade::SoftCascade() : filds(0) {} @@ -513,21 +503,15 @@ cv::gpu::SoftCascade::~SoftCascade() bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, const float maxScale) { - if (filds) - delete filds; - filds = 0; + if (filds) delete filds; cv::FileStorage fs(filename, FileStorage::READ); if (!fs.isOpened()) return false; - filds = new Filds; - Filds& 
flds = *filds; - if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false; - return true; + filds = Filds::parseCascade(fs.getFirstTopLevelNode(), minScale, maxScale); + return filds != 0; } -//================================== synchronous version ============================================================// - void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& rois, GpuMat& objects, const int /*rejectfactor*/, int specificScale) const { @@ -562,7 +546,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat cv::Size cv::gpu::SoftCascade::getRoiSize() const { - return cv::Size(Filds::FRAME_WIDTH / 4, Filds::FRAME_HEIGHT / 4); + return cv::Size(Filds::FRAME_WIDTH / (*filds).shrinkage, Filds::FRAME_HEIGHT / (*filds).shrinkage); } #endif \ No newline at end of file From 2bcb8dbd83a67e27868fba2101bc443d9f892f5c Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 11 Oct 2012 21:56:36 +0400 Subject: [PATCH 047/155] refactor performance tests --- modules/gpu/perf/perf_softcascade.cpp | 182 ++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 modules/gpu/perf/perf_softcascade.cpp diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp new file mode 100644 index 000000000..783089fcb --- /dev/null +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -0,0 +1,182 @@ +#include "perf_precomp.hpp" + +#define GPU_PERF_TEST_P(fixture, name, params) \ + class fixture##_##name : public fixture {\ + public:\ + fixture##_##name() {}\ + protected:\ + virtual void __cpu();\ + virtual void __gpu();\ + virtual void PerfTestBody();\ + };\ + TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (runOnGpu) __gpu(); else __cpu();}\ + INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\ + void fixture##_##name::PerfTestBody() + +#define RUN_CPU(fixture, name)\ + void fixture##_##name::__cpu() + +#define RUN_GPU(fixture, name)\ + void fixture##_##name::__gpu() + +#define FAIL_NO_CPU(fixture, name)\ +void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";} + + +typedef std::tr1::tuple fixture_t; +typedef perf::TestBaseWithParam SoftCascadeTest; + +GPU_PERF_TEST_P(SoftCascadeTest, detect, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) +{ } + +RUN_GPU(SoftCascadeTest, detect) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + + cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + rois.setTo(1); + cv::gpu::transpose(rois, trois); + + cv::gpu::GpuMat curr = objectBoxes; + cascade.detectMultiScale(colored, trois, curr); + + TEST_CYCLE() + { + curr = objectBoxes; + cascade.detectMultiScale(colored, trois, curr); + } +} + +RUN_CPU(SoftCascadeTest, detect) +{ + cv::Mat colored = readImage(GET_PARAM(1)); + ASSERT_FALSE(colored.empty()); + + cv::SoftCascade cascade; + ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0)))); + + std::vector rois; + + typedef cv::SoftCascade::Detection Detection; + std::vectorobjectBoxes; + cascade.detectMultiScale(colored, rois, objectBoxes); + + TEST_CYCLE() + { + cascade.detectMultiScale(colored, rois, objectBoxes); + } +} + +static cv::Rect getFromTable(int idx) +{ + static const 
cv::Rect rois[] = + { + cv::Rect( 65, 20, 35, 80), + cv::Rect( 95, 35, 45, 40), + cv::Rect( 45, 35, 45, 40), + cv::Rect( 25, 27, 50, 45), + cv::Rect(100, 50, 45, 40), + + cv::Rect( 60, 30, 45, 40), + cv::Rect( 40, 55, 50, 40), + cv::Rect( 48, 37, 72, 80), + cv::Rect( 48, 32, 85, 58), + cv::Rect( 48, 0, 32, 27) + }; + + return rois[idx]; +} + +typedef std::tr1::tuple roi_fixture_t; +typedef perf::TestBaseWithParam SoftCascadeTestRoi; + +GPU_PERF_TEST_P(SoftCascadeTestRoi, detectInRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 5))) +{} + +RUN_GPU(SoftCascadeTestRoi, detectInRoi) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(0); + + int nroi = GET_PARAM(2); + cv::RNG rng; + for (int i = 0; i < nroi; ++i) + { + cv::Rect r = getFromTable(rng(10)); + cv::gpu::GpuMat sub(rois, r); + sub.setTo(1); + } + + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + + cv::gpu::GpuMat curr = objectBoxes; + cascade.detectMultiScale(colored, trois, curr); + + TEST_CYCLE() + { + curr = objectBoxes; + cascade.detectMultiScale(colored, trois, curr); + } +} + +FAIL_NO_CPU(SoftCascadeTestRoi, detectInRoi) + + +GPU_PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), + testing::Range(0, 10))) +{} + +RUN_GPU(SoftCascadeTestRoi, detectEachRoi) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + rois.setTo(0); + + int idx = GET_PARAM(2); + cv::Rect r = getFromTable(idx); + cv::gpu::GpuMat sub(rois, r); + sub.setTo(1); + + cv::gpu::GpuMat curr = objectBoxes; + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + + cascade.detectMultiScale(colored, trois, curr); + + TEST_CYCLE() + { + curr = objectBoxes; + cascade.detectMultiScale(colored, trois, curr); + } +} + +FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi) \ No newline at end of file From 022a8b9698c0c488c03fdfb83b47bf1675b6712a Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 15 Oct 2012 15:27:33 +0400 Subject: [PATCH 048/155] fix rounding bug in Level creation --- modules/gpu/src/icf.hpp | 11 +---------- modules/gpu/src/imgproc.cpp | 2 +- modules/gpu/src/softcascade.cpp | 12 +++++++++++- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 06c81149e..d829012c8 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -81,16 +81,7 @@ struct __align__(8) Level //is actually 24 bytes uchar2 workRect; uchar2 objSize; - Level(int idx, const Octave& oct, const float scale, const int w, const int h) - : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) - { - workRect.x = round(w / (float)oct.shrinkage); - workRect.y = round(h / (float)oct.shrinkage); - - objSize.x = round(oct.size.x * 
relScale); - objSize.y = round(oct.size.y * relScale); - } - + Level(int idx, const Octave& oct, const float scale, const int w, const int h); __device Level(){} }; diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 0bf9c81c2..df02213b1 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -553,7 +553,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S src.locateROI(whole, offset); - if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048) + if (false && info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048) { GpuMat srcAlligned; diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index fc7114b5f..e5d8cb9fb 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -65,6 +65,16 @@ cv::Size cv::gpu::SoftCascade::getRoiSize() const { throw_nogpu(); return cv::Si #include +cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h) +: octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) +{ + workRect.x = round(w / (float)oct.shrinkage); + workRect.y = round(h / (float)oct.shrinkage); + + objSize.x = cv::saturate_cast(oct.size.x * relScale); + objSize.y = cv::saturate_cast(oct.size.y * relScale); +} + namespace cv { namespace gpu { namespace device { namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, @@ -72,7 +82,7 @@ namespace icf { } namespace imgproc { - void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream); + void shfl_integral_gpu(PtrStepSzb img, PtrStepSz integral, cudaStream_t stream); } }}} From ca81628a9a3fe8e82f5ba3348a3036e5b6475e45 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 15 Oct 2012 17:38:21 +0400 Subject: [PATCH 049/155] fix retrieval of detections count --- modules/gpu/src/cuda/isf-sc.cu | 2 +- modules/gpu/src/softcascade.cpp | 2 +- modules/gpu/test/test_softcascade.cpp | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 74e47ba19..7aef41abc 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -313,7 +313,7 @@ namespace icf { dim3 block(32, 8); dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1); - uint* ctr = (uint*)counter.ptr(); + uint* ctr = (uint*)(counter.ptr(0)); Detection* det = (Detection*)objects.ptr(); uint max_det = objects.cols / sizeof(Detection); diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index e5d8cb9fb..560c25196 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -541,7 +541,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& flds.detect(specificScale, rois, objects, 0); cv::Mat out(flds.detCounter); - int ndetections = *(out.data); + int ndetections = *(out.ptr(0)); if (! 
ndetections) objects = GpuMat(); diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 04fa9b181..04c38557c 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -257,5 +257,9 @@ TEST(SoftCascadeTest, detect) cv::gpu::transpose(rois, trois); cascade.detectMultiScale(colored, trois, objectBoxes); + + typedef cv::gpu::SoftCascade::Detection Detection; + cv::Mat detections(objectBoxes); + ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U); } #endif \ No newline at end of file From fa55d51b6ae616b90a036de7a231335f1fce475f Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 15 Oct 2012 18:13:58 +0400 Subject: [PATCH 050/155] add sanity check to performance tests for soft cascade --- modules/gpu/perf/perf_softcascade.cpp | 59 ++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp index 783089fcb..d379d7fe5 100644 --- a/modules/gpu/perf/perf_softcascade.cpp +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -5,7 +5,7 @@ public:\ fixture##_##name() {}\ protected:\ - virtual void __cpu();\ + virtual void __cpu();\ virtual void __gpu();\ virtual void PerfTestBody();\ };\ @@ -22,6 +22,44 @@ #define FAIL_NO_CPU(fixture, name)\ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";} +namespace { + struct DetectionLess + { + bool operator()(const cv::gpu::SoftCascade::Detection& a, + const cv::gpu::SoftCascade::Detection& b) const + { + if (a.x != b.x) return a.x < b.x; + else if (a.y != b.y) return a.y < b.y; + else if (a.w != b.w) return a.w < b.w; + else return a.h < b.h; + } + + bool operator()(const cv::SoftCascade::Detection& a, + const cv::SoftCascade::Detection& b) const + { + const cv::Rect& ra = a.rect; + const cv::Rect& rb = b.rect; + + if (ra.x != rb.x) return ra.x < rb.x; + else if (ra.y != rb.y) return ra.y < rb.y; + else if (ra.width != rb.width) return ra.width < rb.width; + else return ra.height < rb.height; + } + }; + + cv::Mat sortDetections(cv::gpu::GpuMat& objects) + { + cv::Mat detections(objects); + + typedef cv::gpu::SoftCascade::Detection Detection; + Detection* begin = (Detection*)(detections.ptr(0)); + Detection* end = (Detection*)(detections.ptr(0) + detections.cols); + std::sort(begin, end, DetectionLess()); + + return detections; + } +} + typedef std::tr1::tuple fixture_t; typedef perf::TestBaseWithParam SoftCascadeTest; @@ -41,7 +79,7 @@ RUN_GPU(SoftCascadeTest, detect) cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); - cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; rois.setTo(1); cv::gpu::transpose(rois, trois); @@ -53,6 +91,8 @@ RUN_GPU(SoftCascadeTest, detect) curr = objectBoxes; cascade.detectMultiScale(colored, trois, curr); } + + SANITY_CHECK(sortDetections(curr)); } RUN_CPU(SoftCascadeTest, detect) @@ -66,13 +106,16 @@ RUN_CPU(SoftCascadeTest, detect) std::vector rois; typedef cv::SoftCascade::Detection Detection; - std::vectorobjectBoxes; - cascade.detectMultiScale(colored, rois, objectBoxes); + std::vectorobjects; + cascade.detectMultiScale(colored, rois, objects); TEST_CYCLE() { - cascade.detectMultiScale(colored, rois, objectBoxes); + cascade.detectMultiScale(colored, rois, objects); } + + 
std::sort(objects.begin(), objects.end(), DetectionLess()); + SANITY_CHECK(objects); } static cv::Rect getFromTable(int idx) @@ -137,6 +180,8 @@ RUN_GPU(SoftCascadeTestRoi, detectInRoi) curr = objectBoxes; cascade.detectMultiScale(colored, trois, curr); } + + SANITY_CHECK(sortDetections(curr)); } FAIL_NO_CPU(SoftCascadeTestRoi, detectInRoi) @@ -177,6 +222,8 @@ RUN_GPU(SoftCascadeTestRoi, detectEachRoi) curr = objectBoxes; cascade.detectMultiScale(colored, trois, curr); } + + SANITY_CHECK(sortDetections(curr)); } -FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi) \ No newline at end of file +FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi) From 2bd35c4358400657b0b6b1a47905152410e01628 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 15 Oct 2012 21:55:57 +0400 Subject: [PATCH 051/155] add support for precomputed integrals --- modules/gpu/perf/perf_softcascade.cpp | 56 +++++++++++++++++++++++++-- modules/gpu/src/softcascade.cpp | 15 +++++-- modules/gpu/test/test_softcascade.cpp | 33 ++++++++++++++++ 3 files changed, 96 insertions(+), 8 deletions(-) diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp index d379d7fe5..582561c7c 100644 --- a/modules/gpu/perf/perf_softcascade.cpp +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -9,7 +9,7 @@ virtual void __gpu();\ virtual void PerfTestBody();\ };\ - TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (runOnGpu) __gpu(); else __cpu();}\ + TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (PERF_RUN_GPU()) __gpu(); else __cpu();}\ INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\ void fixture##_##name::PerfTestBody() @@ -19,7 +19,7 @@ #define RUN_GPU(fixture, name)\ void fixture##_##name::__gpu() -#define FAIL_NO_CPU(fixture, name)\ +#define NO_CPU(fixture, name)\ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";} namespace { @@ -184,7 +184,7 @@ RUN_GPU(SoftCascadeTestRoi, detectInRoi) SANITY_CHECK(sortDetections(curr)); } -FAIL_NO_CPU(SoftCascadeTestRoi, detectInRoi) +NO_CPU(SoftCascadeTestRoi, detectInRoi) GPU_PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi, @@ -226,4 +226,52 @@ RUN_GPU(SoftCascadeTestRoi, detectEachRoi) SANITY_CHECK(sortDetections(curr)); } -FAIL_NO_CPU(SoftCascadeTestRoi, detectEachRoi) +NO_CPU(SoftCascadeTestRoi, detectEachRoi) + +GPU_PERF_TEST_P(SoftCascadeTest, detectOnIntegral, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/integrals.xml")))) +{ } + + static std::string itoa(long i) + { + static char s[65]; + sprintf(s, "%ld", i); + return std::string(s); + } + +RUN_GPU(SoftCascadeTest, detectOnIntegral) +{ + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1); + for (int i = 0; i < 10; ++i) + { + cv::Mat channel; + fs[std::string("channel") + itoa(i)] >> channel; + cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121)); + gchannel.upload(channel); + } + + cv::gpu::SoftCascade cascade; + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + rois.setTo(1); + cv::gpu::transpose(rois, trois); + + cv::gpu::GpuMat curr = objectBoxes; + cascade.detectMultiScale(hogluv, trois, curr); + + TEST_CYCLE() + { + curr = objectBoxes; + 
cascade.detectMultiScale(hogluv, trois, curr);
+    }
+
+    SANITY_CHECK(sortDetections(curr));
+}
+
+NO_CPU(SoftCascadeTest, detectOnIntegral)
\ No newline at end of file

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 560c25196..d9519e873 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -526,17 +526,24 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     GpuMat& objects, const int /*rejectfactor*/, int specificScale) const
 {
     // only color images are supported
-    CV_Assert(colored.type() == CV_8UC3);
+    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
 
     // we guess user knows about shrinkage
     CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
 
-    // only this window size allowed
-    CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
 
     Filds& flds = *filds;
 
-    flds.preprocess(colored);
+    if (colored.type() == CV_8UC3)
+    {
+        // only this window size allowed
+        CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
+        flds.preprocess(colored);
+    }
+    else
+    {
+        colored.copyTo(flds.hogluv);
+    }
 
     flds.detect(specificScale, rois, objects, 0);

diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 04c38557c..bf880297b 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -262,4 +262,37 @@ TEST(SoftCascadeTest, detect)
     cv::Mat detections(objectBoxes);
     ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U);
 }
+
+TEST(SoftCascadeTest, detectOnIntegral)
+{
+    std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    cv::gpu::SoftCascade cascade;
+    ASSERT_TRUE(cascade.load(xml));
+
+    std::string intPath = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/integrals.xml";
+    cv::FileStorage fs(intPath, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    GpuMat hogluv(121 * 10, 161, CV_32SC1);
+    for (int i = 0; i < 10; ++i)
+    {
+        cv::Mat channel;
+        fs[std::string("channel") + SoftCascadeTest::itoa(i)] >> channel;
+        GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
+        gchannel.upload(channel);
+    }
+
+    GpuMat objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1);
+    rois.setTo(1);
+
+    cv::gpu::GpuMat trois;
+    cv::gpu::transpose(rois, trois);
+
+    cascade.detectMultiScale(hogluv, trois, objectBoxes);
+
+    typedef cv::gpu::SoftCascade::Detection Detection;
+    cv::Mat detections(objectBoxes);
+
+    ASSERT_EQ(detections.cols / sizeof(Detection) ,2042U);
+}
 #endif
\ No newline at end of file

From df392cc830df738cdb8f0f2a2398fad684752f3e Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Sat, 3 Nov 2012 18:03:31 +0400
Subject: [PATCH 052/155] fix compilation problem after rebase

---
 modules/gpu/perf/perf_objdetect.cpp   | 242 +------------------------
 modules/gpu/perf/perf_softcascade.cpp |  56 +++---
 2 files changed, 31 insertions(+), 267 deletions(-)

diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index ced8ee17d..6d040ac02 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -89,244 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values("gpu/caltech/image_00000009_0.png", "gp
     SANITY_CHECK(found_locations);
 }
 
-//================================================= ICF SoftCascade =================================================//
-typedef pair pair_string;
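The softcascade.cpp hunk above gives detectMultiScale() two input modes,
dispatched on the input type; a condensed usage sketch (names as in the tests
of this series):

    cv::gpu::SoftCascade cascade;                      // assumes load() succeeded
    cascade.detectMultiScale(frame,  trois, objects);  // CV_8UC3 640x480: full channel pipeline
    cascade.detectMultiScale(hogluv, trois, objects);  // CV_32SC1 integrals: preprocessing skipped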
-DEF_PARAM_TEST_1(SoftCascade, pair_string); - - -// struct SoftCascadeTest : public perf::TestBaseWithParam -// { -// typedef cv::gpu::SoftCascade::Detection detection_t; -// static cv::Rect getFromTable(int idx) -// { -// static const cv::Rect rois[] = -// { -// cv::Rect( 65, 20, 35, 80), -// cv::Rect( 95, 35, 45, 40), -// cv::Rect( 45, 35, 45, 40), -// cv::Rect( 25, 27, 50, 45), -// cv::Rect(100, 50, 45, 40), - -// cv::Rect( 60, 30, 45, 40), -// cv::Rect( 40, 55, 50, 40), -// cv::Rect( 48, 37, 72, 80), -// cv::Rect( 48, 32, 85, 58), -// cv::Rect( 48, 0, 32, 27) -// }; - -// return rois[idx]; -// } - -// static std::string itoa(long i) -// { -// static char s[65]; -// sprintf(s, "%ld", i); -// return std::string(s); -// } - -// static std::string getImageName(int level) -// { -// time_t rawtime; -// struct tm * timeinfo; -// char buffer [80]; - -// time ( &rawtime ); -// timeinfo = localtime ( &rawtime ); - -// strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo); -// return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png"; -// } - -// static void print(std::ostream &out, const detection_t& d) -// { -// out << "\x1b[32m[ detection]\x1b[0m (" -// << std::setw(4) << d.x -// << " " -// << std::setw(4) << d.y -// << ") (" -// << std::setw(4) << d.w -// << " " -// << std::setw(4) << d.h -// << ") " -// << std::setw(12) << d.confidence -// << std::endl; -// } - -// static void printTotal(std::ostream &out, int detbytes) -// { -// out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl; -// } - -// static void writeResult(const cv::Mat& result, const int level) -// { -// std::string path = cv::tempfile(getImageName(level).c_str()); -// cv::imwrite(path, result); -// std::cout << "\x1b[32m" << "[ ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl; -// } -// }; - -typedef std::tr1::tuple fixture_t; -typedef perf::TestBaseWithParam SoftCascadeTest; - -PERF_TEST_P(SoftCascadeTest, detect, - testing::Combine( - testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), - testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) -{ - if (runOnGpu) - { - cv::Mat cpu = readImage (GET_PARAM(1)); - ASSERT_FALSE(cpu.empty()); - cv::gpu::GpuMat colored(cpu); - - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); - - cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; - rois.setTo(1); - cv::gpu::transpose(rois, trois); - - cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); - - TEST_CYCLE() - { - curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); - } - } - else - { - cv::Mat colored = readImage(GET_PARAM(1)); - ASSERT_FALSE(colored.empty()); - - cv::SoftCascade cascade; - ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0)))); - - std::vector rois; - - typedef cv::SoftCascade::Detection Detection; - std::vectorobjectBoxes; - cascade.detectMultiScale(colored, rois, objectBoxes); - - TEST_CYCLE() - { - cascade.detectMultiScale(colored, rois, objectBoxes); - } - } -} - -static cv::Rect getFromTable(int idx) -{ - static const cv::Rect rois[] = - { - cv::Rect( 65, 20, 35, 80), - cv::Rect( 95, 35, 45, 40), - cv::Rect( 45, 35, 45, 40), - cv::Rect( 25, 27, 50, 45), - cv::Rect(100, 50, 45, 40), - - cv::Rect( 60, 30, 45, 40), - cv::Rect( 40, 55, 50, 40), - cv::Rect( 48, 37, 72, 80), - cv::Rect( 48, 32, 85, 58), - cv::Rect( 48, 0, 32, 27) - }; - - return rois[idx]; -} - 
-typedef std::tr1::tuple roi_fixture_t; -typedef perf::TestBaseWithParam SoftCascadeTestRoi; - -PERF_TEST_P(SoftCascadeTestRoi, detectInRoi, - testing::Combine( - testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), - testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), - testing::Range(0, 5))) -{ - if (runOnGpu) - { - cv::Mat cpu = readImage (GET_PARAM(1)); - ASSERT_FALSE(cpu.empty()); - cv::gpu::GpuMat colored(cpu); - - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); - - cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); - rois.setTo(0); - - int nroi = GET_PARAM(2); - cv::RNG rng; - for (int i = 0; i < nroi; ++i) - { - cv::Rect r = getFromTable(rng(10)); - cv::gpu::GpuMat sub(rois, r); - sub.setTo(1); - } - - cv::gpu::GpuMat trois; - cv::gpu::transpose(rois, trois); - - cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); - - TEST_CYCLE() - { - curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); - } - } - else - { - FAIL(); - } -} - -PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi, - testing::Combine( - testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), - testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), - testing::Range(0, 10))) -{ - if (runOnGpu) - { - cv::Mat cpu = readImage (GET_PARAM(1)); - ASSERT_FALSE(cpu.empty()); - cv::gpu::GpuMat colored(cpu); - - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); - - cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); - rois.setTo(0); - - int idx = GET_PARAM(2); - cv::Rect r = getFromTable(idx); - cv::gpu::GpuMat sub(rois, r); - sub.setTo(1); - - cv::gpu::GpuMat curr = objectBoxes; - cv::gpu::GpuMat trois; - cv::gpu::transpose(rois, trois); - - cascade.detectMultiScale(colored, trois, curr); - - TEST_CYCLE() - { - curr = objectBoxes; - cascade.detectMultiScale(colored, rois, curr); - } - } - else - { - FAIL(); - } -} - - /////////////////////////////////////////////////////////////// // HaarClassifier @@ -383,7 +145,7 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier, cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty()); - if (runOnGpu) + if (PERF_RUN_GPU()) { cv::gpu::CascadeClassifier_GPU d_cascade; ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second))); @@ -418,4 +180,4 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier, } } -} // namespace +} // namespace \ No newline at end of file diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp index 582561c7c..9b53b2e84 100644 --- a/modules/gpu/perf/perf_softcascade.cpp +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -34,17 +34,17 @@ namespace { else return a.h < b.h; } - bool operator()(const cv::SoftCascade::Detection& a, - const cv::SoftCascade::Detection& b) const - { - const cv::Rect& ra = a.rect; - const cv::Rect& rb = b.rect; + // bool operator()(const cv::SoftCascade::Detection& a, + // const cv::SoftCascade::Detection& b) const + // { + // const cv::Rect& ra = a.rect; + // const cv::Rect& rb = b.rect; - if (ra.x != rb.x) return ra.x < rb.x; - else if (ra.y != rb.y) return ra.y < rb.y; - else if (ra.width != rb.width) return ra.width < rb.width; - else return ra.height < rb.height; - } + // if (ra.x != rb.x) return ra.x < rb.x; + // else if (ra.y != rb.y) 
return ra.y < rb.y; + // else if (ra.width != rb.width) return ra.width < rb.width; + // else return ra.height < rb.height; + // } }; cv::Mat sortDetections(cv::gpu::GpuMat& objects) @@ -95,28 +95,30 @@ RUN_GPU(SoftCascadeTest, detect) SANITY_CHECK(sortDetections(curr)); } -RUN_CPU(SoftCascadeTest, detect) -{ - cv::Mat colored = readImage(GET_PARAM(1)); - ASSERT_FALSE(colored.empty()); +NO_CPU(SoftCascadeTest, detect) - cv::SoftCascade cascade; - ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0)))); +// RUN_CPU(SoftCascadeTest, detect) +// { +// cv::Mat colored = readImage(GET_PARAM(1)); +// ASSERT_FALSE(colored.empty()); - std::vector rois; +// cv::SoftCascade cascade; +// ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0)))); - typedef cv::SoftCascade::Detection Detection; - std::vectorobjects; - cascade.detectMultiScale(colored, rois, objects); +// std::vector rois; - TEST_CYCLE() - { - cascade.detectMultiScale(colored, rois, objects); - } +// typedef cv::SoftCascade::Detection Detection; +// std::vectorobjects; +// cascade.detectMultiScale(colored, rois, objects); - std::sort(objects.begin(), objects.end(), DetectionLess()); - SANITY_CHECK(objects); -} +// TEST_CYCLE() +// { +// cascade.detectMultiScale(colored, rois, objects); +// } + +// std::sort(objects.begin(), objects.end(), DetectionLess()); +// SANITY_CHECK(objects); +// } static cv::Rect getFromTable(int idx) { From ac5cd4827908952bf29a40879c1b4490c6d3127d Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sun, 4 Nov 2012 03:29:20 +0400 Subject: [PATCH 053/155] add DeviceInfo parameter to the soft cascade tests --- modules/gpu/test/test_softcascade.cpp | 67 ++++++++++++++++----------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index bf880297b..fb936be88 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -68,12 +68,10 @@ using cv::gpu::GpuMat; INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params); \ void fixture##_##name::body() +namespace { -typedef std::tr1::tuple roi_fixture_t; + typedef cv::gpu::SoftCascade::Detection Detection; -struct SoftCascadeTest : public ::testing::TestWithParam -{ - typedef cv::gpu::SoftCascade::Detection detection_t; static cv::Rect getFromTable(int idx) { static const cv::Rect rois[] = @@ -114,7 +112,7 @@ struct SoftCascadeTest : public ::testing::TestWithParam return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png"; } - static void print(std::ostream &out, const detection_t& d) + static void print(std::ostream &out, const Detection& d) { out << "\x1b[32m[ detection]\x1b[0m (" << std::setw(4) << d.x @@ -131,7 +129,7 @@ struct SoftCascadeTest : public ::testing::TestWithParam static void printTotal(std::ostream &out, int detbytes) { - out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(detection_t)) << std::endl; + out << "\x1b[32m[ ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl; } static void writeResult(const cv::Mat& result, const int level) @@ -140,24 +138,27 @@ struct SoftCascadeTest : public ::testing::TestWithParam cv::imwrite(path, result); std::cout << "\x1b[32m" << "[ ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl; } -}; +} -GPU_TEST_P(SoftCascadeTest, detectInROI, +typedef ::testing::TestWithParam > SoftCascadeTestRoi; +GPU_TEST_P(SoftCascadeTestRoi, detect, testing::Combine( + ALL_DEVICES, 
testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), testing::Range(0, 5))) { - cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1)); + cv::gpu::setDevice(GET_PARAM(0).deviceID()); + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2)); ASSERT_FALSE(coloredCpu.empty()); cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0))); + ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1))); GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; rois.setTo(0); - int nroi = GET_PARAM(2); + int nroi = GET_PARAM(3); cv::Mat result(coloredCpu); cv::RNG rng; for (int i = 0; i < nroi; ++i) @@ -173,16 +174,15 @@ GPU_TEST_P(SoftCascadeTest, detectInROI, cascade.detectMultiScale(colored, trois, objectBoxes); - /// cv::Mat dt(objectBoxes); - typedef cv::gpu::SoftCascade::Detection detection_t; + typedef cv::gpu::SoftCascade::Detection Detection; - detection_t* dts = (detection_t*)dt.data; + Detection* dts = (Detection*)dt.data; printTotal(std::cout, dt.cols); - for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) + for (int i = 0; i < (int)(dt.cols / sizeof(Detection)); ++i) { - detection_t d = dts[i]; + Detection d = dts[i]; print(std::cout, d); cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); } @@ -190,39 +190,43 @@ GPU_TEST_P(SoftCascadeTest, detectInROI, SHOW(result); } -GPU_TEST_P(SoftCascadeTest, detectInLevel, +typedef ::testing::TestWithParam > SoftCascadeTestLevel; +GPU_TEST_P(SoftCascadeTestLevel, detect, testing::Combine( + ALL_DEVICES, testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), testing::Range(0, 47) )) { - std::string xml = cvtest::TS::ptr()->get_data_path() + GET_PARAM(0); + cv::gpu::setDevice(GET_PARAM(0).deviceID()); + + std::string xml = cvtest::TS::ptr()->get_data_path() + GET_PARAM(1); cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(xml)); - cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1)); + cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2)); ASSERT_FALSE(coloredCpu.empty()); - typedef cv::gpu::SoftCascade::Detection detection_t; - GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + typedef cv::gpu::SoftCascade::Detection Detection; + GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); rois.setTo(1); cv::gpu::GpuMat trois; cv::gpu::transpose(rois, trois); - int level = GET_PARAM(2); + int level = GET_PARAM(3); cascade.detectMultiScale(colored, trois, objectBoxes, 1, level); cv::Mat dt(objectBoxes); - detection_t* dts = (detection_t*)dt.data; + Detection* dts = (Detection*)dt.data; cv::Mat result(coloredCpu); printTotal(std::cout, dt.cols); - for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) + for (int i = 0; i < (int)(dt.cols / sizeof(Detection)); ++i) { - detection_t d = dts[i]; + Detection d = dts[i]; print(std::cout, d); cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); } @@ -238,8 +242,12 @@ TEST(SoftCascadeTest, readCascade) ASSERT_TRUE(cascade.load(xml)); } -TEST(SoftCascadeTest, detect) 
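The conversion pattern applied throughout this file: each former plain TEST()
becomes a fixture parameterized over ALL_DEVICES and selects its device before
doing any GPU work. A sketch of the resulting skeleton (template argument
restored here for readability):

    typedef ::testing::TestWithParam<cv::gpu::DeviceInfo> SoftCascadeTestAll;
    GPU_TEST_P(SoftCascadeTestAll, detect, ALL_DEVICES)
    {
        cv::gpu::setDevice(GetParam().deviceID());
        // ... body unchanged from the old TEST() ...
    }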
+typedef ::testing::TestWithParam SoftCascadeTestAll; +GPU_TEST_P(SoftCascadeTestAll, detect, + ALL_DEVICES + ) { + cv::gpu::setDevice(GetParam().deviceID()); std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(xml)); @@ -263,8 +271,11 @@ TEST(SoftCascadeTest, detect) ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U); } -TEST(SoftCascadeTest, detectOnIntegral) +GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral, + ALL_DEVICES + ) { + cv::gpu::setDevice(GetParam().deviceID()); std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(xml)); @@ -277,7 +288,7 @@ TEST(SoftCascadeTest, detectOnIntegral) for (int i = 0; i < 10; ++i) { cv::Mat channel; - fs[std::string("channel") + SoftCascadeTest::itoa(i)] >> channel; + fs[std::string("channel") + itoa(i)] >> channel; GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121)); gchannel.upload(channel); } From df392cc830df738cdb8f0f2a2398fad684752f3e Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 5 Nov 2012 13:52:24 +0400 Subject: [PATCH 054/155] use fast integral for soft cascade --- modules/gpu/src/cuda/integral_image.cu | 83 ++++++++++++++++++++++++++ modules/gpu/src/cuda/isf-sc.cu | 4 +- modules/gpu/src/softcascade.cpp | 31 +++++----- modules/gpu/test/test_softcascade.cpp | 1 + 4 files changed, 102 insertions(+), 17 deletions(-) diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index 558f9085d..5bd35bdc7 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -383,6 +383,89 @@ namespace cv { namespace gpu { namespace device if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); } + + __global__ void shfl_integral_vertical(PtrStepSz buffer, PtrStepSz integral) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) + __shared__ unsigned int sums[32][9]; + + const int tidx = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = tidx % 8; + + if (tidx >= integral.cols) + return; + + sums[threadIdx.x][threadIdx.y] = 0; + __syncthreads(); + + unsigned int stepSum = 0; + + for (int y = threadIdx.y; y < integral.rows; y += blockDim.y) + { + unsigned int* p = buffer.ptr(y) + tidx; + unsigned int* dst = integral.ptr(y + 1) + tidx + 1; + + unsigned int sum = *p; + + sums[threadIdx.x][threadIdx.y] = sum; + __syncthreads(); + + // place into SMEM + // shfl scan reduce the SMEM, reformating so the column + // sums are computed in a warp + // then read out properly + const int j = threadIdx.x % 8; + const int k = threadIdx.x / 8 + threadIdx.y * 4; + + int partial_sum = sums[k][j]; + + for (int i = 1; i <= 8; i *= 2) + { + int n = __shfl_up(partial_sum, i, 32); + + if (lane_id >= i) + partial_sum += n; + } + + sums[k][j] = partial_sum; + __syncthreads(); + + if (threadIdx.y > 0) + sum += sums[threadIdx.x][threadIdx.y - 1]; + + sum += stepSum; + stepSum += sums[threadIdx.x][blockDim.y - 1]; + + __syncthreads(); + + *dst = sum; + } + #endif + } + + // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed + // ToDo: partial dy + void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz buffer, PtrStepSz integral, + int blockStep, cudaStream_t stream) + { + { + const int block = blockStep; + const int grid = img.rows; + + cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); + + 
+            shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
+            cudaSafeCall( cudaGetLastError() );
+        }
+
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(integral.cols, block.x), 1);
+
+            shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>) buffer, integral);
+            cudaSafeCall( cudaGetLastError() );
+        }
+    }
 }
}}}
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 7aef41abc..3391bb1a0 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -198,14 +198,14 @@ namespace icf {
         Node node = nodes[nId];

         float threshold = rescale(level, node);
-        int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+        int sum = get(x, y + (node.threshold >> 28) * 120, node.rect);

         int next = 1 + (int)(sum >= threshold);
         dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold);

         node = nodes[nId + next];
         threshold = rescale(level, node);
-        sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
+        sum = get(x, y + (node.threshold >> 28) * 120, node.rect);

         const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
         float impact = leaves[(st + threadIdx.x) * 4 + lShift];
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index d9519e873..2d43a5440 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -76,14 +76,20 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale
 }

 namespace cv { namespace gpu { namespace device {
+
 namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
 }
-namespace imgproc
-{
-    void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<uint> integral, cudaStream_t stream);
+
+namespace imgproc {
+    void shfl_integral_gpu_buffered(PtrStepSzb, PtrStepSz<uint4>, PtrStepSz<uint>, int, cudaStream_t);
+
+    template <typename T>
+    void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
+        PtrStepSzb dst, int interpolation, cudaStream_t stream);
 }
+
 }}}

 struct cv::gpu::SoftCascade::Filds
 {
@@ -319,9 +325,13 @@ struct cv::gpu::SoftCascade::Filds
         plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
         fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1);
         luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
+
         shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1);
-        integralBuffer.create(1 , (shrunk.rows + 1) * HOG_LUV_BINS * (shrunk.cols + 1), CV_32SC1);
-        hogluv.create((FRAME_HEIGHT / shr + 1) * HOG_LUV_BINS, FRAME_WIDTH / shr + 64, CV_32SC1);
+        integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1);
+
+        hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1);
+        hogluv.setTo(cv::Scalar::all(0));
+
         detCounter.create(1,1, CV_32SC1);

         octaves.upload(hoctaves);
@@ -432,16 +442,7 @@ private:
         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS));
         cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-
-        fw /= shrinkage;
-        fh /= shrinkage;
-
-        for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
-        {
-            GpuMat channel(shrunk, cv::Rect(0, fh * i, fw, fh ));
-            GpuMat sum(hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
-            cv::gpu::integralBuffered(channel, sum, integralBuffer);
-        }
+        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
     }

 public:
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index fb936be88..1146b062b 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -271,6
+271,7 @@ GPU_TEST_P(SoftCascadeTestAll, detect, ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U); } +//ToDo: fix me GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral, ALL_DEVICES ) From 9b251f81309aad65344e3a345c87e132ff37e8d5 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 5 Nov 2012 14:21:46 +0400 Subject: [PATCH 055/155] remove Sobel normalization --- modules/gpu/src/softcascade.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 2d43a5440..1e0c271b9 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -396,8 +396,8 @@ private: GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); - cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, 3, 0.125f); - cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, 3, 0.125f); + cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0); + cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1); GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); @@ -409,7 +409,7 @@ private: GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); - cv::gpu::multiply(mag, cv::Scalar::all(1.f / ::log(2)), nmag); + cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag); cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); //create uchar magnitude From 27eb2e27db92c47d70c1e9d89187c2a28df93dc7 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 5 Nov 2012 18:52:35 +0400 Subject: [PATCH 056/155] enable fast integral for Kepler --- modules/gpu/src/imgproc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index df02213b1..0bf9c81c2 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -553,7 +553,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S src.locateROI(whole, offset); - if (false && info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048) + if (info.supports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048) { GpuMat srcAlligned; From e6eb1b99e19b5e8e99e1d8aa872363335a6a0a92 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sat, 10 Nov 2012 00:48:37 +0400 Subject: [PATCH 057/155] fix negative confidence bug --- modules/gpu/src/cuda/isf-sc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 3391bb1a0..ee9a9f674 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -229,7 +229,7 @@ namespace icf { if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; } - if(st == stEnd && !threadIdx.x) + if(!threadIdx.x && st == stEnd && ((confidence - FLT_EPSILON) >= 0)) { int idx = atomicInc(ctr, ndetections); // store detection From 40600fa5048ddf43bf8fc7602694ad723234c87d Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sat, 10 Nov 2012 00:49:51 +0400 Subject: [PATCH 058/155] GPU version becomes algorithm --- modules/gpu/include/opencv2/gpu/gpu.hpp | 67 +++++----- modules/gpu/perf/perf_softcascade.cpp | 146 +++++++++++---------- modules/gpu/src/gpu_init.cpp | 60 +++++++++ modules/gpu/src/softcascade.cpp | 161 +++++++++++++----------- modules/gpu/test/test_softcascade.cpp | 147 +++++++++++++--------- 5 files changed, 346 insertions(+), 235 deletions(-) create mode 100644 modules/gpu/src/gpu_init.cpp diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp 
b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 9b59c6004..4fc6179d8 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1534,10 +1534,12 @@ public:
 // ======================== GPU version for soft cascade ===================== //

-class CV_EXPORTS SoftCascade
+// Implementation of a soft (stageless) cascaded detector.
+class CV_EXPORTS SCascade : public Algorithm
 {
 public:
+    // Representation of the detector's result.
     struct CV_EXPORTS Detection
     {
         ushort x;
@@ -1549,47 +1551,44 @@ public:
         enum {PEDESTRIAN = 0};
     };

-    //! An empty cascade will be created.
-    SoftCascade();
-    //! Cascade will be created from file for scales from minScale to maxScale.
-    //! Param filename is a path to xml-serialized cascade.
-    //! Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
-    //! Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
-    SoftCascade( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+    // An empty cascade will be created.
+    // Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
+    // Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
+    // Param scales is a number of scales from minScale to maxScale.
+    // Param rejfactor is used for NMS.
+    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);

-    //! cascade will be loaded from file "filename". The previous cascade will be destroyed.
-    //! Param filename is a path to xml-serialized cascade.
-    //! Param minScale is a minimum scale relative to the original size of the image on which cascade will be applyed.
-    //! Param minScale is a maximum scale relative to the original size of the image on which cascade will be applyed.
-    bool load( const string& filename, const float minScale = 0.4f, const float maxScale = 5.f);
+    virtual ~SCascade();

-    virtual ~SoftCascade();
+    cv::AlgorithmInfo* info() const;

-    //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values
-    //! Param image is input frame for detector. Cascade will be applied to it.
-    //! Param rois is a mask
-    //! Param objects 4-channel matrix thet contain detected rectangles
-    //! Param rejectfactor used for final object box computing
-    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-        int rejectfactor = 1, int specificScale = -1) const;
+    // Load cascade from FileNode.
+    // Param fn is a root node for cascade. Should be .
+    virtual bool load(const FileNode& fn);

-    //! detect specific objects on in the input frame for all scales computed flom minScale and maxscale values.
-    //! asynchronous version.
-    //! Param image is input frame for detector. Cascade will be applied to it.
-    //! Param rois is a mask
-    //! Param objects 4-channel matrix thet contain detected rectangles
-    //! Param rejectfactor used for final object box computing
-    //! Param ndet retrieves number of detections
-    //! Param stream wrapper for CUDA stream
-    virtual void detectMultiScale(const GpuMat& image, const GpuMat& rois, GpuMat& objects,
-        int rejectfactor, GpuMat& ndet, Stream stream) const;
+    // Load cascade config.
+    virtual void read(const FileNode& fn);

-    cv::Size getRoiSize() const;
+    // Return the vector of Detection objects.
+ // Param image is a frame on which detector will be applied. + // Param rois is a vector of regions of interest. Only the objects that fall into one of the regions will be returned. + // Param objects is an output array of Detections + virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; + virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const; + + void genRoi(InputArray roi, OutputArray mask) const; private: - struct Filds; - Filds* filds; + + struct Fields; + Fields* fields; + + double minScale; + double maxScale; + + int scales; + int rejfactor; }; ////////////////////////////////// SURF ////////////////////////////////////////// diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp index 9b53b2e84..1e62af8eb 100644 --- a/modules/gpu/perf/perf_softcascade.cpp +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -25,8 +25,8 @@ void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy"; namespace { struct DetectionLess { - bool operator()(const cv::gpu::SoftCascade::Detection& a, - const cv::gpu::SoftCascade::Detection& b) const + bool operator()(const cv::gpu::SCascade::Detection& a, + const cv::gpu::SCascade::Detection& b) const { if (a.x != b.x) return a.x < b.x; else if (a.y != b.y) return a.y < b.y; @@ -51,7 +51,7 @@ namespace { { cv::Mat detections(objects); - typedef cv::gpu::SoftCascade::Detection Detection; + typedef cv::gpu::SCascade::Detection Detection; Detection* begin = (Detection*)(detections.ptr(0)); Detection* end = (Detection*)(detections.ptr(0) + detections.cols); std::sort(begin, end, DetectionLess()); @@ -62,52 +62,54 @@ namespace { typedef std::tr1::tuple fixture_t; -typedef perf::TestBaseWithParam SoftCascadeTest; +typedef perf::TestBaseWithParam SCascadeTest; -GPU_PERF_TEST_P(SoftCascadeTest, detect, +GPU_PERF_TEST_P(SCascadeTest, detect, testing::Combine( testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) { } -RUN_GPU(SoftCascadeTest, detect) +RUN_GPU(SCascadeTest, detect) { cv::Mat cpu = readImage (GET_PARAM(1)); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + cv::gpu::SCascade cascade; - cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois; rois.setTo(1); - cv::gpu::transpose(rois, trois); + cascade.genRoi(rois, trois); - cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); + cascade.detect(colored, trois, objectBoxes); TEST_CYCLE() { - curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); + cascade.detect(colored, trois, objectBoxes); } - SANITY_CHECK(sortDetections(curr)); + SANITY_CHECK(sortDetections(objectBoxes)); } -NO_CPU(SoftCascadeTest, detect) +NO_CPU(SCascadeTest, detect) -// RUN_CPU(SoftCascadeTest, detect) +// RUN_CPU(SCascadeTest, detect) // { // cv::Mat colored = readImage(GET_PARAM(1)); // 
ASSERT_FALSE(colored.empty()); -// cv::SoftCascade cascade; +// cv::SCascade cascade; // ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0)))); // std::vector rois; -// typedef cv::SoftCascade::Detection Detection; +// typedef cv::SCascade::Detection Detection; // std::vectorobjects; // cascade.detectMultiScale(colored, rois, objects); @@ -124,42 +126,46 @@ static cv::Rect getFromTable(int idx) { static const cv::Rect rois[] = { - cv::Rect( 65, 20, 35, 80), - cv::Rect( 95, 35, 45, 40), - cv::Rect( 45, 35, 45, 40), - cv::Rect( 25, 27, 50, 45), - cv::Rect(100, 50, 45, 40), + cv::Rect( 65 * 4, 20 * 4, 35 * 4, 80 * 4), + cv::Rect( 95 * 4, 35 * 4, 45 * 4, 40 * 4), + cv::Rect( 45 * 4, 35 * 4, 45 * 4, 40 * 4), + cv::Rect( 25 * 4, 27 * 4, 50 * 4, 45 * 4), + cv::Rect(100 * 4, 50 * 4, 45 * 4, 40 * 4), - cv::Rect( 60, 30, 45, 40), - cv::Rect( 40, 55, 50, 40), - cv::Rect( 48, 37, 72, 80), - cv::Rect( 48, 32, 85, 58), - cv::Rect( 48, 0, 32, 27) + cv::Rect( 60 * 4, 30 * 4, 45 * 4, 40 * 4), + cv::Rect( 40 * 4, 55 * 4, 50 * 4, 40 * 4), + cv::Rect( 48 * 4, 37 * 4, 72 * 4, 80 * 4), + cv::Rect( 48 * 4, 32 * 4, 85 * 4, 58 * 4), + cv::Rect( 48 * 4, 0 * 4, 32 * 4, 27 * 4) }; return rois[idx]; } typedef std::tr1::tuple roi_fixture_t; -typedef perf::TestBaseWithParam SoftCascadeTestRoi; +typedef perf::TestBaseWithParam SCascadeTestRoi; -GPU_PERF_TEST_P(SoftCascadeTestRoi, detectInRoi, +GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi, testing::Combine( testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), testing::Range(0, 5))) {} -RUN_GPU(SoftCascadeTestRoi, detectInRoi) +RUN_GPU(SCascadeTestRoi, detectInRoi) { cv::Mat cpu = readImage (GET_PARAM(1)); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + cv::gpu::SCascade cascade; - cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(0); int nroi = GET_PARAM(2); @@ -172,40 +178,42 @@ RUN_GPU(SoftCascadeTestRoi, detectInRoi) } cv::gpu::GpuMat trois; - cv::gpu::transpose(rois, trois); + cascade.genRoi(rois, trois); - cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); + cascade.detect(colored, trois, objectBoxes); TEST_CYCLE() { - curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); + cascade.detect(colored, trois, objectBoxes); } - SANITY_CHECK(sortDetections(curr)); + SANITY_CHECK(sortDetections(objectBoxes)); } -NO_CPU(SoftCascadeTestRoi, detectInRoi) +NO_CPU(SCascadeTestRoi, detectInRoi) -GPU_PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi, +GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi, testing::Combine( testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")), testing::Range(0, 10))) {} -RUN_GPU(SoftCascadeTestRoi, detectEachRoi) +RUN_GPU(SCascadeTestRoi, detectEachRoi) { cv::Mat cpu = readImage (GET_PARAM(1)); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + cv::gpu::SCascade cascade; - 
cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(0); int idx = GET_PARAM(2); @@ -213,24 +221,22 @@ RUN_GPU(SoftCascadeTestRoi, detectEachRoi) cv::gpu::GpuMat sub(rois, r); sub.setTo(1); - cv::gpu::GpuMat curr = objectBoxes; cv::gpu::GpuMat trois; - cv::gpu::transpose(rois, trois); + cascade.genRoi(rois, trois); - cascade.detectMultiScale(colored, trois, curr); + cascade.detect(colored, trois, objectBoxes); TEST_CYCLE() { - curr = objectBoxes; - cascade.detectMultiScale(colored, trois, curr); + cascade.detect(colored, trois, objectBoxes); } - SANITY_CHECK(sortDetections(curr)); + SANITY_CHECK(sortDetections(objectBoxes)); } -NO_CPU(SoftCascadeTestRoi, detectEachRoi) +NO_CPU(SCascadeTestRoi, detectEachRoi) -GPU_PERF_TEST_P(SoftCascadeTest, detectOnIntegral, +GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral, testing::Combine( testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("cv/cascadeandhog/integrals.xml")))) @@ -243,37 +249,39 @@ GPU_PERF_TEST_P(SoftCascadeTest, detectOnIntegral, return std::string(s); } -RUN_GPU(SoftCascadeTest, detectOnIntegral) +RUN_GPU(SCascadeTest, detectOnIntegral) { - cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); - ASSERT_TRUE(fs.isOpened()); + cv::FileStorage fsi(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); + ASSERT_TRUE(fsi.isOpened()); cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1); for (int i = 0; i < 10; ++i) { cv::Mat channel; - fs[std::string("channel") + itoa(i)] >> channel; + fsi[std::string("channel") + itoa(i)] >> channel; cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121)); gchannel.upload(channel); } - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); + cv::gpu::SCascade cascade; - cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SoftCascade::Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(cv::Size(640, 480), CV_8UC1), trois; rois.setTo(1); - cv::gpu::transpose(rois, trois); + cascade.genRoi(rois, trois); - cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(hogluv, trois, curr); + cascade.detect(hogluv, trois, objectBoxes); TEST_CYCLE() { - curr = objectBoxes; - cascade.detectMultiScale(hogluv, trois, curr); + cascade.detect(hogluv, trois, objectBoxes); } - SANITY_CHECK(sortDetections(curr)); + SANITY_CHECK(sortDetections(objectBoxes)); } -NO_CPU(SoftCascadeTest, detectOnIntegral) \ No newline at end of file +NO_CPU(SCascadeTest, detectOnIntegral) \ No newline at end of file diff --git a/modules/gpu/src/gpu_init.cpp b/modules/gpu/src/gpu_init.cpp new file mode 100644 index 000000000..f25bc2ceb --- /dev/null +++ b/modules/gpu/src/gpu_init.cpp @@ -0,0 +1,60 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include + +namespace cv { namespace gpu +{ + +CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade", + obj.info()->addParam(obj, "minScale", obj.minScale); + obj.info()->addParam(obj, "maxScale", obj.maxScale); + obj.info()->addParam(obj, "scales", obj.scales); + obj.info()->addParam(obj, "rejfactor", obj.rejfactor)); + +bool initModule_gpu(void) +{ + Ptr sc = createSCascade(); + return sc->info() != 0; +} + +} } \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 1e0c271b9..02481ed37 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -45,21 +45,18 @@ #if !defined (HAVE_CUDA) -cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); } -cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); } -cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); } -bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; } -void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, int) const -{ - throw_nogpu(); -} +cv::gpu::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); } -void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) const -{ - throw_nogpu(); -} +cv::gpu::SCascade::~SCascade() { throw_nogpu(); } -cv::Size cv::gpu::SoftCascade::getRoiSize() const { throw_nogpu(); return cv::Size();} +bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;} + +void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); } +void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) const { throw_nogpu(); } + +void cv::gpu::SCascade::genRoi(InputArray, OutputArray) const { throw_nogpu(); } + +void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } #else @@ -92,7 +89,7 @@ namespace imgproc { }}} -struct cv::gpu::SoftCascade::Filds +struct cv::gpu::SCascade::Fields { struct CascadeIntrinsics { @@ -126,7 +123,7 @@ struct cv::gpu::SoftCascade::Filds } }; - static Filds* parseCascade(const FileNode &root, const float mins, const float maxs) + static Fields* parseCascade(const FileNode &root, const float mins, const float maxs) { static const char *const SC_STAGE_TYPE = "stageType"; static const char *const SC_BOOST = "BOOST"; @@ -312,13 +309,13 @@ struct cv::gpu::SoftCascade::Filds cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ); CV_Assert(!hlevels.empty()); - Filds* filds = new Filds(mins, maxs, origWidth, origHeight, shrinkage, downscales, + Fields* fields = new Fields(mins, maxs, origWidth, origHeight, shrinkage, downscales, hoctaves, hstages, hnodes, hleaves, hlevels); - return filds; + return fields; } - Filds( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds, + Fields( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds, cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves, cv::Mat hlevels) : minScale(mins), maxScale(maxs), origObjWidth(ow), origObjHeight(oh), shrinkage(shr), downscales(ds) { @@ -332,7 +329,7 @@ struct cv::gpu::SoftCascade::Filds hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1); hogluv.setTo(cv::Scalar::all(0)); - detCounter.create(1,1, CV_32SC1); + detCounter.create(sizeof(Detection) / 
sizeof(int),1, CV_32SC1); octaves.upload(hoctaves); stages.upload(hstages); @@ -344,20 +341,21 @@ struct cv::gpu::SoftCascade::Filds } - void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const + void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, cudaStream_t stream) const { - cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); - invoker(roi, hogluv, objects, detCounter, downscales, scale); + cudaMemset(count.data, 0, sizeof(Detection)); + cudaSafeCall( cudaGetLastError()); + invoker(roi, hogluv, objects, count, downscales, scale); } void preprocess(const cv::gpu::GpuMat& colored) { cudaMemset(plane.data, 0, plane.step * plane.rows); - static const int fw = Filds::FRAME_WIDTH; - static const int fh = Filds::FRAME_HEIGHT; + static const int fw = Fields::FRAME_WIDTH; + static const int fh = Fields::FRAME_HEIGHT; - GpuMat gray(plane, cv::Rect(0, fh * Filds::HOG_LUV_BINS, fw, fh)); + GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh)); cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY); createHogBins(gray); @@ -390,8 +388,8 @@ private: void createHogBins(const cv::gpu::GpuMat& gray) { - static const int fw = Filds::FRAME_WIDTH; - static const int fh = Filds::FRAME_HEIGHT; + static const int fw = Fields::FRAME_WIDTH; + static const int fh = Fields::FRAME_HEIGHT; GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); @@ -413,21 +411,21 @@ private: cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); //create uchar magnitude - GpuMat cmag(plane, cv::Rect(0, fh * Filds::HOG_BINS, fw, fh)); + GpuMat cmag(plane, cv::Rect(0, fh * Fields::HOG_BINS, fw, fh)); nmag.convertTo(cmag, CV_8UC1); - device::icf::fillBins(plane, nang, fw, fh, Filds::HOG_BINS); + device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS); } void createLuvBins(const cv::gpu::GpuMat& colored) { - static const int fw = Filds::FRAME_WIDTH; - static const int fh = Filds::FRAME_HEIGHT; + static const int fw = Fields::FRAME_WIDTH; + static const int fh = Fields::FRAME_HEIGHT; cv::gpu::cvtColor(colored, luv, CV_BGR2Luv); std::vector splited; - for(int i = 0; i < Filds::LUV_BINS; ++i) + for(int i = 0; i < Fields::LUV_BINS; ++i) { splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh))); } @@ -437,10 +435,10 @@ private: void integrate() { - int fw = Filds::FRAME_WIDTH; - int fh = Filds::FRAME_HEIGHT; + int fw = Fields::FRAME_WIDTH; + int fh = Fields::FRAME_HEIGHT; - GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Filds::HOG_LUV_BINS)); + GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS)); cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0); } @@ -500,45 +498,33 @@ public: }; }; -cv::gpu::SoftCascade::SoftCascade() : filds(0) {} +cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int rjf) +: fields(0), minScale(mins), maxScale(maxs), scales(sc), rejfactor(rjf) {} -cv::gpu::SoftCascade::SoftCascade( const string& filename, const float minScale, const float maxScale) : filds(0) +cv::gpu::SCascade::~SCascade() { delete fields; } + +bool cv::gpu::SCascade::load(const FileNode& fn) { - load(filename, minScale, maxScale); + if (fields) delete fields; + fields = Fields::parseCascade(fn, minScale, maxScale); + return fields != 0; } -cv::gpu::SoftCascade::~SoftCascade() -{ - delete filds; -} - -bool 
cv::gpu::SoftCascade::load( const string& filename, const float minScale, const float maxScale)
-{
-    if (filds) delete filds;
-
-    cv::FileStorage fs(filename, FileStorage::READ);
-    if (!fs.isOpened()) return false;
-
-    filds = Filds::parseCascade(fs.getFirstTopLevelNode(), minScale, maxScale);
-    return filds != 0;
-}
-
-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& rois,
-    GpuMat& objects, const int /*rejectfactor*/, int specificScale) const
+void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const
 {
+    const GpuMat colored = image.getGpuMat();
     // only color images are supported
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);

     // we guess user knows about shrinkage
-    CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
+    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
-
-    Filds& flds = *filds;
+    Fields& flds = *fields;

     if (colored.type() == CV_8UC3)
     {
         // only this window size allowed
-        CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT);
+        CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
         flds.preprocess(colored);
     }
     else
@@ -546,25 +532,60 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
     {
         colored.copyTo(flds.hogluv);
     }

-    flds.detect(specificScale, rois, objects, 0);
+    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();

-    cv::Mat out(flds.detCounter);
-    int ndetections = *(out.ptr<int>(0));
+    GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
+    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
+    cudaStream_t stream = StreamAccessor::getStream(s);

-    if (!ndetections)
-        objects = GpuMat();
+    flds.detect(-1, rois, tmp, objects, stream);
+}
+
+void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const
+{
+    const GpuMat colored = image.getGpuMat();
+    // only color images are supported
+    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
+
+    // we guess user knows about shrinkage
+    // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
+
+    Fields& flds = *fields;
+
+    if (colored.type() == CV_8UC3)
+    {
+        // only this window size allowed
+        CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
+        flds.preprocess(colored);
+    }
     else
-        objects = GpuMat(objects, cv::Rect(0, 0, ndetections * sizeof(Detection), 1));
+    {
+        colored.copyTo(flds.hogluv);
+    }
+
+    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
+
+    GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
+    objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    flds.detect(level, rois, tmp, objects, stream);
 }

-void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, int, GpuMat&, Stream) const
+void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask) const
 {
-    // cudaStream_t stream = StreamAccessor::getStream(s);
+    const GpuMat roi = _roi.getGpuMat();
+    _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
+    GpuMat mask = _mask.getGpuMat();
+    cv::gpu::GpuMat tmp;
+
+    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
+    cv::gpu::transpose(tmp, mask);
 }

-cv::Size cv::gpu::SoftCascade::getRoiSize() const
+void cv::gpu::SCascade::read(const FileNode& fn)
 {
-    return cv::Size(Filds::FRAME_WIDTH / (*filds).shrinkage, Filds::FRAME_HEIGHT / (*filds).shrinkage);
+    Algorithm::read(fn);
 }

 #endif
\ No newline at end of file
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index 1146b062b..f26c44f0e 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -70,23 +70,23 @@ using cv::gpu::GpuMat;

 namespace {

-    typedef cv::gpu::SoftCascade::Detection Detection;
+    typedef cv::gpu::SCascade::Detection Detection;

     static cv::Rect getFromTable(int idx)
     {
         static const cv::Rect rois[] =
         {
-            cv::Rect( 65, 20, 35, 80),
-            cv::Rect( 95, 35, 45, 40),
-            cv::Rect( 45, 35, 45, 40),
-            cv::Rect( 25, 27, 50, 45),
-            cv::Rect(100, 50, 45, 40),
+            cv::Rect( 65 * 4, 20 * 4, 35 * 4, 80 * 4),
+            cv::Rect( 95 * 4, 35 * 4, 45 * 4, 40 * 4),
+            cv::Rect( 45 * 4, 35 * 4, 45 * 4, 40 * 4),
+            cv::Rect( 25 * 4, 27 * 4, 50 * 4, 45 * 4),
+            cv::Rect(100 * 4, 50 * 4, 45 * 4, 40 * 4),

-            cv::Rect( 60, 30, 45, 40),
-            cv::Rect( 40, 55, 50, 40),
-            cv::Rect( 48, 37, 72, 80),
-            cv::Rect( 48, 32, 85, 58),
-            cv::Rect( 48,  0, 32, 27)
+            cv::Rect( 60 * 4, 30 * 4, 45 * 4, 40 * 4),
+            cv::Rect( 40 * 4, 55 * 4, 50 * 4, 40 * 4),
+            cv::Rect( 48 * 4, 37 * 4, 72 * 4, 80 * 4),
+            cv::Rect( 48 * 4, 32 * 4, 85 * 4, 58 * 4),
+            cv::Rect( 48 * 4,  0 * 4, 32 * 4, 27 * 4)
         };

         return rois[idx];
@@ -140,11 +140,11 @@ namespace {
     }
 }

-typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SoftCascadeTestRoi;
-GPU_TEST_P(SoftCascadeTestRoi, detect,
+typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestRoi;
+GPU_TEST_P(SCascadeTestRoi, detect,
     testing::Combine(
         ALL_DEVICES,
-        testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
+
testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), testing::Range(0, 5))) { @@ -152,10 +152,14 @@ GPU_TEST_P(SoftCascadeTestRoi, detect, cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2)); ASSERT_FALSE(coloredCpu.empty()); - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(1))); + cv::gpu::SCascade cascade; - GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(colored.size(), CV_8UC1), trois; rois.setTo(0); int nroi = GET_PARAM(3); @@ -166,21 +170,21 @@ GPU_TEST_P(SoftCascadeTestRoi, detect, cv::Rect r = getFromTable(rng(10)); GpuMat sub(rois, r); sub.setTo(1); - r.x *= 4; r.y *= 4; r.width *= 4; r.height *= 4; cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1); } - cv::gpu::transpose(rois, trois); - - cascade.detectMultiScale(colored, trois, objectBoxes); + cascade.genRoi(rois, trois); + cascade.detect(colored, trois, objectBoxes); cv::Mat dt(objectBoxes); - typedef cv::gpu::SoftCascade::Detection Detection; + typedef cv::gpu::SCascade::Detection Detection; - Detection* dts = (Detection*)dt.data; + Detection* dts = ((Detection*)dt.data) + 1; + int* count = dt.ptr(0); - printTotal(std::cout, dt.cols); - for (int i = 0; i < (int)(dt.cols / sizeof(Detection)); ++i) + printTotal(std::cout, *count); + + for (int i = 0; i < *count; ++i) { Detection d = dts[i]; print(std::cout, d); @@ -188,43 +192,49 @@ GPU_TEST_P(SoftCascadeTestRoi, detect, } SHOW(result); + } -typedef ::testing::TestWithParam > SoftCascadeTestLevel; -GPU_TEST_P(SoftCascadeTestLevel, detect, +typedef ::testing::TestWithParam > SCascadeTestLevel; +GPU_TEST_P(SCascadeTestLevel, detect, testing::Combine( ALL_DEVICES, - testing::Values(std::string("../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), testing::Range(0, 47) )) { cv::gpu::setDevice(GET_PARAM(0).deviceID()); - std::string xml = cvtest::TS::ptr()->get_data_path() + GET_PARAM(1); - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(xml)); + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2)); ASSERT_FALSE(coloredCpu.empty()); - typedef cv::gpu::SoftCascade::Detection Detection; - GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + typedef cv::gpu::SCascade::Detection Detection; + GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(1); cv::gpu::GpuMat trois; - cv::gpu::transpose(rois, trois); + cascade.genRoi(rois, trois); int level = GET_PARAM(3); - cascade.detectMultiScale(colored, trois, objectBoxes, 1, level); + cascade.detect(colored, trois, objectBoxes, level); cv::Mat dt(objectBoxes); - Detection* dts = (Detection*)dt.data; + Detection* dts = 
((Detection*)dt.data) + 1; + int* count = dt.ptr(0); + cv::Mat result(coloredCpu); - printTotal(std::cout, dt.cols); - for (int i = 0; i < (int)(dt.cols / sizeof(Detection)); ++i) + printTotal(std::cout, *count); + for (int i = 0; i < *count; ++i) { Detection d = dts[i]; print(std::cout, d); @@ -235,76 +245,89 @@ GPU_TEST_P(SoftCascadeTestLevel, detect, SHOW(result); } -TEST(SoftCascadeTest, readCascade) +TEST(SCascadeTest, readCascade) { std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml"; - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(xml)); + cv::gpu::SCascade cascade; + + cv::FileStorage fs(xml, cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); } -typedef ::testing::TestWithParam SoftCascadeTestAll; -GPU_TEST_P(SoftCascadeTestAll, detect, +typedef ::testing::TestWithParam SCascadeTestAll; +GPU_TEST_P(SCascadeTestAll, detect, ALL_DEVICES ) { cv::gpu::setDevice(GetParam().deviceID()); std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(xml)); + cv::gpu::SCascade cascade; + + cv::FileStorage fs(xml, cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/bahnhof/image_00000000_0.png"); ASSERT_FALSE(coloredCpu.empty()); - GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1); rois.setTo(0); GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); sub.setTo(cv::Scalar::all(1)); cv::gpu::GpuMat trois; - cv::gpu::transpose(rois, trois); + cascade.genRoi(rois, trois); - cascade.detectMultiScale(colored, trois, objectBoxes); + cascade.detect(colored, trois, objectBoxes); - typedef cv::gpu::SoftCascade::Detection Detection; + typedef cv::gpu::SCascade::Detection Detection; cv::Mat detections(objectBoxes); - ASSERT_EQ(detections.cols / sizeof(Detection) ,3670U); + int a = *(detections.ptr(0)); + ASSERT_EQ(a ,2460); } -//ToDo: fix me -GPU_TEST_P(SoftCascadeTestAll, detectOnIntegral, +GPU_TEST_P(SCascadeTestAll, detectOnIntegral, ALL_DEVICES ) { cv::gpu::setDevice(GetParam().deviceID()); std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml"; - cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(xml)); + cv::gpu::SCascade cascade; + + cv::FileStorage fs(xml, cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); std::string intPath = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/integrals.xml"; - cv::FileStorage fs(intPath, cv::FileStorage::READ); - ASSERT_TRUE(fs.isOpened()); + cv::FileStorage fsi(intPath, cv::FileStorage::READ); + ASSERT_TRUE(fsi.isOpened()); GpuMat hogluv(121 * 10, 161, CV_32SC1); for (int i = 0; i < 10; ++i) { cv::Mat channel; - fs[std::string("channel") + itoa(i)] >> channel; + fsi[std::string("channel") + itoa(i)] >> channel; GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121)); gchannel.upload(channel); } - GpuMat objectBoxes(1, 100000, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + GpuMat objectBoxes(1, 100000, CV_8UC1), rois(cv::Size(640, 480), CV_8UC1); rois.setTo(1); cv::gpu::GpuMat trois; - 
cv::gpu::transpose(rois, trois); + cascade.genRoi(rois, trois); - cascade.detectMultiScale(hogluv, trois, objectBoxes); + cascade.detect(hogluv, trois, objectBoxes); - typedef cv::gpu::SoftCascade::Detection Detection; + typedef cv::gpu::SCascade::Detection Detection; cv::Mat detections(objectBoxes); + int a = *(detections.ptr(0)); - ASSERT_EQ(detections.cols / sizeof(Detection) ,2042U); + ASSERT_EQ( a ,1024); } #endif \ No newline at end of file From 0cbf9eb22a264493da9ebf2e1101af60534cc12c Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Sat, 10 Nov 2012 03:59:09 +0400 Subject: [PATCH 059/155] add support for CUDA streams --- modules/gpu/include/opencv2/gpu/gpu.hpp | 2 +- modules/gpu/perf/perf_softcascade.cpp | 42 ++++++++++++++- modules/gpu/src/cuda/integral_image.cu | 1 - modules/gpu/src/cuda/isf-sc.cu | 28 ++++++---- modules/gpu/src/icf.hpp | 2 +- modules/gpu/src/softcascade.cpp | 72 ++++++++++++++----------- modules/gpu/test/test_softcascade.cpp | 39 ++++++++++++++ 7 files changed, 140 insertions(+), 46 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 4fc6179d8..8f327f227 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1577,7 +1577,7 @@ public: virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const; - void genRoi(InputArray roi, OutputArray mask) const; + void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const; private: diff --git a/modules/gpu/perf/perf_softcascade.cpp b/modules/gpu/perf/perf_softcascade.cpp index 1e62af8eb..3e82cc5bb 100644 --- a/modules/gpu/perf/perf_softcascade.cpp +++ b/modules/gpu/perf/perf_softcascade.cpp @@ -284,4 +284,44 @@ RUN_GPU(SCascadeTest, detectOnIntegral) SANITY_CHECK(sortDetections(objectBoxes)); } -NO_CPU(SCascadeTest, detectOnIntegral) \ No newline at end of file +NO_CPU(SCascadeTest, detectOnIntegral) + +GPU_PERF_TEST_P(SCascadeTest, detectStream, + testing::Combine( + testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), + testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")))) +{ } + +RUN_GPU(SCascadeTest, detectStream) +{ + cv::Mat cpu = readImage (GET_PARAM(1)); + ASSERT_FALSE(cpu.empty()); + cv::gpu::GpuMat colored(cpu); + + cv::gpu::SCascade cascade; + + cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ); + ASSERT_TRUE(fs.isOpened()); + + ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); + + cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois; + rois.setTo(1); + + cv::gpu::Stream s; + + cascade.genRoi(rois, trois, s); + + cascade.detect(colored, trois, objectBoxes, s); + + TEST_CYCLE() + { + cascade.detect(colored, trois, objectBoxes, s); + } + + cudaDeviceSynchronize(); + + SANITY_CHECK(sortDetections(objectBoxes)); +} + +NO_CPU(SCascadeTest, detectStream) \ No newline at end of file diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu index 5bd35bdc7..200960b43 100644 --- a/modules/gpu/src/cuda/integral_image.cu +++ b/modules/gpu/src/cuda/integral_image.cu @@ -444,7 +444,6 @@ namespace cv { namespace gpu { namespace device } // used for frame preprocessing before Soft Cascade evaluation: no 
synchronization needed - // ToDo: partial dy void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz buffer, PtrStepSz integral, int blockStep, cudaStream_t stream) { diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index ee9a9f674..0de2d8e37 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -71,7 +71,7 @@ namespace icf { } void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, - const int fw, const int fh, const int bins) + const int fw, const int fh, const int bins, cudaStream_t stream ) { const uchar* mag = (const uchar*)hogluv.ptr(fh * bins); uchar* hog = (uchar*)hogluv.ptr(); @@ -80,9 +80,12 @@ namespace icf { dim3 block(32, 8); dim3 grid(fw / 32, fh / 8); - magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh); - cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); + magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh); + if (!stream) + { + cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaDeviceSynchronize() ); + } } texture thogluv; @@ -305,7 +308,7 @@ namespace icf { template<> void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale) const + PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const { int fw = 160; int fh = 120; @@ -325,22 +328,25 @@ namespace icf { if (scale == -1) { - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0); cudaSafeCall( cudaGetLastError()); grid = dim3(fw, fh / 8, 47 - downscales); - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales); } else { if (scale >= downscales) - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); else - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); + test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); } - cudaSafeCall( cudaGetLastError()); - cudaSafeCall( cudaDeviceSynchronize()); + if (!stream) + { + cudaSafeCall( cudaGetLastError()); + cudaSafeCall( cudaDeviceSynchronize()); + } } } }}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index d829012c8..60df55882 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -139,7 +139,7 @@ struct CascadeInvoker const float* leaves; void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter, const int downscales, const int csale = -1) const; + PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const; }; } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 02481ed37..fdde2618e 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -54,7 +54,7 @@ bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;} void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); } void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) 
const { throw_nogpu(); } -void cv::gpu::SCascade::genRoi(InputArray, OutputArray) const { throw_nogpu(); } +void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_nogpu(); } void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } @@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device { namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, - const int fw, const int fh, const int bins); + const int fw, const int fh, const int bins, cudaStream_t stream); } namespace imgproc { @@ -341,27 +341,30 @@ struct cv::gpu::SCascade::Fields } - void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, cudaStream_t stream) const + void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const { cudaMemset(count.data, 0, sizeof(Detection)); cudaSafeCall( cudaGetLastError()); - invoker(roi, hogluv, objects, count, downscales, scale); + invoker(roi, hogluv, objects, count, downscales, scale, stream); } - void preprocess(const cv::gpu::GpuMat& colored) + void preprocess(const cv::gpu::GpuMat& colored, Stream& s) { - cudaMemset(plane.data, 0, plane.step * plane.rows); + if (s) + s.enqueueMemSet(plane, 0); + else + cudaMemset(plane.data, 0, plane.step * plane.rows); static const int fw = Fields::FRAME_WIDTH; static const int fh = Fields::FRAME_HEIGHT; GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh)); - cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY); - createHogBins(gray); + cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY, s); + createHogBins(gray ,s); - createLuvBins(colored); + createLuvBins(colored, s); - integrate(); + integrate(s); } private: @@ -386,7 +389,7 @@ private: return res; } - void createHogBins(const cv::gpu::GpuMat& gray) + void createHogBins(const cv::gpu::GpuMat& gray, Stream& s) { static const int fw = Fields::FRAME_WIDTH; static const int fh = Fields::FRAME_HEIGHT; @@ -394,35 +397,38 @@ private: GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); - cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0); - cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1); + cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s); + cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s); GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh)); GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh)); - cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true); + cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s); // normolize magnitude to uchar interval and angles to 6 bins - GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh)); GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh)); - cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag); - cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang); + cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag, 1, -1, s); + cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f), nang, 1, -1, s); //create uchar magnitude GpuMat cmag(plane, cv::Rect(0, fh * Fields::HOG_BINS, fw, fh)); - nmag.convertTo(cmag, CV_8UC1); + if (s) + s.enqueueConvert(nmag, cmag, CV_8UC1); + else + nmag.convertTo(cmag, CV_8UC1); - device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS); + cudaStream_t stream = StreamAccessor::getStream(s); + device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS, stream); } - void createLuvBins(const cv::gpu::GpuMat& colored) + void createLuvBins(const 
cv::gpu::GpuMat& colored, Stream& s)
     {
         static const int fw = Fields::FRAME_WIDTH;
         static const int fh = Fields::FRAME_HEIGHT;

-        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv);
+        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);

         std::vector<GpuMat> splited;
         for(int i = 0; i < Fields::LUV_BINS; ++i)
@@ -430,17 +436,18 @@ private:
             splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
         }

-        cv::gpu::split(luv, splited);
+        cv::gpu::split(luv, splited, s);
     }

-    void integrate()
+    void integrate( Stream& s)
     {
         int fw = Fields::FRAME_WIDTH;
         int fh = Fields::FRAME_HEIGHT;

         GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
-        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, 0);
+        cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s);
+        cudaStream_t stream = StreamAccessor::getStream(s);
+        device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream);
     }

 public:
@@ -482,6 +489,8 @@ public:
     GpuMat leaves;
     GpuMat levels;

+    GpuMat sobelBuf;
+
     device::icf::CascadeInvoker invoker;

     enum { BOOST = 0 };
@@ -516,6 +525,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     // only color images are supported
     CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);

+    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
+
     // we guess user knows about shrinkage
     // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1));
@@ -525,14 +536,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     {
         // only this window size allowed
         CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
-        flds.preprocess(colored);
+        flds.preprocess(colored, s);
     }
     else
     {
         colored.copyTo(flds.hogluv);
     }

-    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();

     GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
     objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1));
     cudaStream_t stream = StreamAccessor::getStream(s);
@@ -556,7 +566,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     {
         // only this window size allowed
         CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT);
-        flds.preprocess(colored);
+        flds.preprocess(colored, s);
     }
     else
@@ -572,15 +582,15 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _
     flds.detect(level, rois, tmp, objects, stream);
 }

-void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask) const
+void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
 {
     const GpuMat roi = _roi.getGpuMat();
     _mask.create( roi.cols / 4, roi.rows / 4, roi.type() );
     GpuMat mask = _mask.getGpuMat();
     cv::gpu::GpuMat tmp;

-    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
-    cv::gpu::transpose(tmp, mask);
+    cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA, stream);
+    cv::gpu::transpose(tmp, mask, stream);
 }

 void cv::gpu::SCascade::read(const FileNode& fn)
diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp
index f26c44f0e..cfae940c7 100644
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@@ -330,4 +330,43 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
     ASSERT_EQ( a ,1024);
 }

+
+GPU_TEST_P(SCascadeTestAll, detectStream,
+    ALL_DEVICES
+    )
+{
+    cv::gpu::setDevice(GetParam().deviceID());
+    std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
+    cv::gpu::SCascade cascade;
+
+    cv::FileStorage fs(xml, cv::FileStorage::READ);
+    ASSERT_TRUE(fs.isOpened());
+
+    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
+
+    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
+                                    + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
+    ASSERT_FALSE(coloredCpu.empty());
+
+    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);
+    rois.setTo(0);
+    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
+    sub.setTo(cv::Scalar::all(1));
+
+    cv::gpu::Stream s;
+
+    cv::gpu::GpuMat trois;
+    cascade.genRoi(rois, trois, s);
+
+    cascade.detect(colored, trois, objectBoxes, s);
+
+    cudaDeviceSynchronize();
+
+    typedef cv::gpu::SCascade::Detection Detection;
+    cv::Mat detections(objectBoxes);
+    int a = *(detections.ptr<int>(0));
+    ASSERT_EQ(a ,2460);
+}
+
+
 #endif
\ No newline at end of file
From 916967cac5070d231195713f5a1057bd546c1d4d Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Sat, 10 Nov 2012 04:26:38 +0400
Subject: [PATCH 060/155] add comments to class declaration

---
 modules/gpu/include/opencv2/gpu/gpu.hpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 8f327f227..9a43760f9 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1570,13 +1570,22 @@ public:
     // Load cascade config.
     virtual void read(const FileNode& fn);

-    // Return the vector of Detection objects.
+    // Return the matrix of detected objects.
     // Param image is a frame on which detector will be applied.
-    // Param rois is a vector of regions of interest. Only the objects that fall into one of the regions will be returned.
-    // Param objects is an output array of Detections
+    // Param rois is a regions-of-interest mask generated by genRoi.
+    // Only the objects that fall into one of the regions will be returned.
+    // Param objects is an output array of Detections represented as a GpuMat of detections (SCascade::Detection).
+    // The first element of the matrix is actually a count of detections.
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
+    // Param level is used for executing the cascade on a specific scale pyramid level.
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
     virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;

+    // Convert a ROI matrix into a form suitable for the detect method.
+    // Param roi is an input matrix of the same size as the image.
+    // A non-zero value means that the detector should be executed at this point.
+    // Param mask is the output mask.
+    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
     void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;

 private:

From d3ac282487793bc2a609ac1a58973dfa4e714864 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Sat, 10 Nov 2012 05:48:06 +0400
Subject: [PATCH 061/155] GPU soft cascade documentation

---
 modules/gpu/doc/object_detection.rst | 119 +++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst
index 133660236..64348717c 100644
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@@ -199,6 +199,125 @@ Returns block descriptors computed for the whole image.

 The function is mainly used to learn the classifier.

+Soft Cascade Classifier
+=======================
+
+Soft Cascade Classifier for Object Detection
+----------------------------------------------------------
+
+Cascade detectors have been shown to operate extremely rapidly and with high accuracy, and they have important applications in many domains. The initial goal of this cascade implementation was a fast and accurate pedestrian detector, but it is also useful in general. A soft cascade is trained with AdaBoost, but instead of training a sequence of stages, it is trained as one long stage of T weak classifiers. The soft cascade is formulated as follows:
+
+.. math::
+    \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)}
+
+where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the thresholded weak classifiers selected during AdaBoost training, scaled by their associated weights. Let
+
+.. math::
+    \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)}
+
+be the partial sum of sample responses before the :math:`t`-th weak classifier is applied. The function :math:`\texttt{H}_t(x)` of :math:`t` for a sample :math:`x` is named the *sample trace*.
+After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` is named the *rejection trace*.
+
+A sample is rejected as soon as its trace falls below the rejection threshold, so the stageless cascade can discard non-object samples as early as possible. The sample trace can also be read as the confidence with which the sample is recognized as the desired object; at each :math:`t` that confidence depends on all previous weak classifiers. This property of the soft cascade results in more accurate detection. The original formulation of the soft cascade can be found in [BJ05]_.
+
+.. [BJ05] Lubomir Bourdev and Jonathan Brandt. Robust Object Detection Via a Soft Cascade. IEEE CVPR, 2005.
+.. [BMTG12] Rodrigo Benenson, Markus Mathias, Radu Timofte and Luc Van Gool. Pedestrian detection at 100 frames per second. IEEE CVPR, 2012.
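The early-rejection rule above is easy to state in code. The following minimal sketch is illustrative only: the weak classifiers, weights, and rejection thresholds are placeholder inputs, not the actual ICF-based implementation added by this patch series. It evaluates the sample trace and stops at the first threshold violation: ::

    #include <vector>
    #include <cstddef>

    // One thresholded weak classifier h_t(x) with response in {-1, +1}.
    struct Weak
    {
        int   feature;   // index of the feature this classifier inspects (illustrative)
        float threshold; // decision threshold learned by AdaBoost
        float respond(const float* sample) const { return sample[feature] > threshold ? 1.f : -1.f; }
    };

    // Evaluate the sample trace H_t(x) = sum_i alpha_i * h_i(x), rejecting the
    // sample as soon as the trace falls below the rejection trace r_t.
    float evaluateSoftCascade(const std::vector<Weak>&  weaks,
                              const std::vector<float>& alpha, // AdaBoost weights alpha_t
                              const std::vector<float>& r,     // rejection trace r_t
                              const float* sample)
    {
        float trace = 0.f;
        for (std::size_t t = 0; t < weaks.size(); ++t)
        {
            trace += alpha[t] * weaks[t].respond(sample); // s_t(x) = alpha_t * h_t(x)
            if (trace < r[t])
                return trace;                             // early rejection at stage t
        }
        return trace;                                     // H(x): detection confidence
    }

Note how a non-object sample typically exits after only a few weak classifiers, which is what makes the stageless cascade fast in practice.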
+SCascade
+----------------
+.. ocv:class:: SCascade
+
+Implementation of the soft (stageless) cascaded detector. ::
+
+    class CV_EXPORTS SCascade : public Algorithm
+    {
+        struct CV_EXPORTS Detection
+        {
+            ushort x;
+            ushort y;
+            ushort w;
+            ushort h;
+            float confidence;
+            int kind;
+
+            enum {PEDESTRIAN = 0};
+        };
+
+        SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
+        virtual ~SCascade();
+        virtual bool load(const FileNode& fn);
+        virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
+        virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const;
+        void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
+    };
+
+
+SCascade::SCascade
+--------------------------
+Creates an empty cascade.
+
+.. ocv:function:: SCascade::SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1)
+
+    :param minScale: the minimum scale, relative to the original image size, at which the cascade will be applied.
+
+    :param maxScale: the maximum scale, relative to the original image size, at which the cascade will be applied.
+
+    :param scales: the number of scales from minScale to maxScale.
+
+    :param rejfactor: used for non-maximum suppression.
+
+
+
+SCascade::~SCascade
+---------------------------
+Destructor for SCascade.
+
+.. ocv:function:: SCascade::~SCascade()
+
+
+
+SCascade::load
+--------------------------
+Load the cascade from a FileNode.
+
+.. ocv:function:: bool SCascade::load(const FileNode& fn)
+
+    :param fn: File node from which the soft cascade is read.
+
+
+
+SCascade::detect
+--------------------------
+Apply the cascade to an input frame and return the matrix of detected objects.
+
+.. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
+.. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const
+
+    :param image: a frame on which the detector will be applied.
+
+    :param rois: a regions-of-interest mask generated by genRoi. Only the objects that fall into one of the regions will be returned.
+
+    :param objects: an output array of detections represented as a GpuMat of SCascade::Detection records. The first element of the matrix holds the number of detections.
+
+    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
+
+    :param level: selects the specific scale pyramid level on which the cascade is executed.
+
+
+SCascade::genRoi
+--------------------------
+Convert an ROI matrix into the form suitable for the detect method.
+
+.. ocv:function:: void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const
+
+    :param roi: an input matrix of the same size as the image. A non-zero value means that the detector should be executed at this point.
+
+    :param mask: the output mask.
+
+    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
+
+
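Putting the API together, a minimal usage sketch could look as follows. It is modeled on the detectStream test earlier in this series; the cascade file path is a placeholder and the function name is illustrative: ::

    #include <opencv2/gpu/gpu.hpp>

    void detectPedestrians(const cv::Mat& frame)
    {
        cv::FileStorage fs("soft_cascade.xml", cv::FileStorage::READ); // placeholder path
        cv::gpu::SCascade cascade;
        if (!fs.isOpened() || !cascade.load(fs.getFirstTopLevelNode()))
            return;

        cv::gpu::GpuMat colored(frame);
        cv::gpu::GpuMat objects(1, 100000, CV_8UC1);   // output buffer for detections
        cv::gpu::GpuMat rois(frame.size(), CV_8UC1);
        rois.setTo(cv::Scalar::all(1));                // search the whole frame

        cv::gpu::GpuMat trois;
        cascade.genRoi(rois, trois);                   // shrink the ROI mask for detect
        cascade.detect(colored, trois, objects);       // default stream: synchronous

        cv::Mat detections(objects);                   // download results to the host
        int count = *detections.ptr<int>(0);           // first element holds the count
        // SCascade::Detection records follow, starting at byte offset sizeof(Detection).
        (void)count;
    }

For asynchronous execution, pass the same cv::gpu::Stream to genRoi and detect and synchronize before reading the results, as the detectStream test does.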
+ + gpu::CascadeClassifier_GPU -------------------------- From 29f89e8930c91c1e9fdcccb31d34acb1552b9281 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Tue, 30 Oct 2012 12:14:07 +0400 Subject: [PATCH 062/155] moved block.hpp to include folder --- modules/gpu/{src => include}/opencv2/gpu/device/block.hpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename modules/gpu/{src => include}/opencv2/gpu/device/block.hpp (100%) diff --git a/modules/gpu/src/opencv2/gpu/device/block.hpp b/modules/gpu/include/opencv2/gpu/device/block.hpp similarity index 100% rename from modules/gpu/src/opencv2/gpu/device/block.hpp rename to modules/gpu/include/opencv2/gpu/device/block.hpp From 810829f32ee604433cf99cbc679139e20407f938 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Fri, 9 Nov 2012 13:14:59 +0400 Subject: [PATCH 063/155] speedup compilation of row_filter.cu and column_filter.cu split them into several small files --- modules/gpu/src/cuda/column_filter.0.cu | 53 ++++ modules/gpu/src/cuda/column_filter.1.cu | 53 ++++ modules/gpu/src/cuda/column_filter.2.cu | 53 ++++ modules/gpu/src/cuda/column_filter.3.cu | 53 ++++ modules/gpu/src/cuda/column_filter.4.cu | 53 ++++ modules/gpu/src/cuda/column_filter.5.cu | 53 ++++ modules/gpu/src/cuda/column_filter.6.cu | 53 ++++ modules/gpu/src/cuda/column_filter.7.cu | 53 ++++ modules/gpu/src/cuda/column_filter.cu | 391 ------------------------ modules/gpu/src/cuda/column_filter.h | 378 +++++++++++++++++++++++ modules/gpu/src/cuda/row_filter.0.cu | 53 ++++ modules/gpu/src/cuda/row_filter.1.cu | 53 ++++ modules/gpu/src/cuda/row_filter.2.cu | 53 ++++ modules/gpu/src/cuda/row_filter.3.cu | 53 ++++ modules/gpu/src/cuda/row_filter.4.cu | 53 ++++ modules/gpu/src/cuda/row_filter.5.cu | 53 ++++ modules/gpu/src/cuda/row_filter.6.cu | 53 ++++ modules/gpu/src/cuda/row_filter.7.cu | 53 ++++ modules/gpu/src/cuda/row_filter.cu | 390 ----------------------- modules/gpu/src/cuda/row_filter.h | 377 +++++++++++++++++++++++ modules/gpu/src/filtering.cpp | 54 ++-- 21 files changed, 1625 insertions(+), 813 deletions(-) create mode 100644 modules/gpu/src/cuda/column_filter.0.cu create mode 100644 modules/gpu/src/cuda/column_filter.1.cu create mode 100644 modules/gpu/src/cuda/column_filter.2.cu create mode 100644 modules/gpu/src/cuda/column_filter.3.cu create mode 100644 modules/gpu/src/cuda/column_filter.4.cu create mode 100644 modules/gpu/src/cuda/column_filter.5.cu create mode 100644 modules/gpu/src/cuda/column_filter.6.cu create mode 100644 modules/gpu/src/cuda/column_filter.7.cu delete mode 100644 modules/gpu/src/cuda/column_filter.cu create mode 100644 modules/gpu/src/cuda/column_filter.h create mode 100644 modules/gpu/src/cuda/row_filter.0.cu create mode 100644 modules/gpu/src/cuda/row_filter.1.cu create mode 100644 modules/gpu/src/cuda/row_filter.2.cu create mode 100644 modules/gpu/src/cuda/row_filter.3.cu create mode 100644 modules/gpu/src/cuda/row_filter.4.cu create mode 100644 modules/gpu/src/cuda/row_filter.5.cu create mode 100644 modules/gpu/src/cuda/row_filter.6.cu create mode 100644 modules/gpu/src/cuda/row_filter.7.cu delete mode 100644 modules/gpu/src/cuda/row_filter.cu create mode 100644 modules/gpu/src/cuda/row_filter.h diff --git a/modules/gpu/src/cuda/column_filter.0.cu b/modules/gpu/src/cuda/column_filter.0.cu new file mode 100644 index 000000000..c35c6ee64 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.0.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: 
READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.1.cu b/modules/gpu/src/cuda/column_filter.1.cu new file mode 100644 index 000000000..9a2d6a042 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.2.cu b/modules/gpu/src/cuda/column_filter.2.cu new file mode 100644 index 000000000..05ee01c76 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.3.cu b/modules/gpu/src/cuda/column_filter.3.cu new file mode 100644 index 000000000..1bf49219f --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.4.cu b/modules/gpu/src/cuda/column_filter.4.cu new file mode 100644 index 000000000..bec7a085a --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.5.cu b/modules/gpu/src/cuda/column_filter.5.cu new file mode 100644 index 000000000..8194ee39a --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.6.cu b/modules/gpu/src/cuda/column_filter.6.cu new file mode 100644 index 000000000..d8fc49be6 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.7.cu b/modules/gpu/src/cuda/column_filter.7.cu new file mode 100644 index 000000000..534bd821e --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "column_filter.h" + +namespace filter +{ + template void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu deleted file mode 100644 index af7369ad5..000000000 --- a/modules/gpu/src/cuda/column_filter.cu +++ /dev/null @@ -1,391 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace column_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template - __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 16; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; - #else - const int BLOCK_DIM_X = 16; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 2; - const int HALO_SIZE = 2; - #endif - - typedef typename TypeVec::cn>::vec_type sum_t; - - __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; - - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - - if (x >= src.cols) - return; - - const T* src_col = src.ptr() + x; - - const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; - - if (blockIdx.y > 0) - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); - } - else - { - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); - } - - if (blockIdx.y + 2 < gridDim.y) - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); - } - else - { - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); - - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int y = yStart + j * BLOCK_DIM_Y; - - if (y < src.rows) - { - sum_t sum = VecTraits::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); - } - } - } - - template class B> - void linearColumnFilter_caller(PtrStepSz src, PtrStepSz dst, int anchor, int 
cc, cudaStream_t stream) - { - int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 16; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 16; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 2; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); - - B brd(src.rows); - - linearColumnFilter<<>>(src, dst, anchor, brd); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect101>, - linearColumnFilter_caller< 2, T, D, BrdColReflect101>, - linearColumnFilter_caller< 3, T, D, BrdColReflect101>, - linearColumnFilter_caller< 4, T, D, BrdColReflect101>, - linearColumnFilter_caller< 5, T, D, BrdColReflect101>, - linearColumnFilter_caller< 6, T, D, BrdColReflect101>, - linearColumnFilter_caller< 7, T, D, BrdColReflect101>, - linearColumnFilter_caller< 8, T, D, BrdColReflect101>, - linearColumnFilter_caller< 9, T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101>, - linearColumnFilter_caller<17, T, D, BrdColReflect101>, - linearColumnFilter_caller<18, T, D, BrdColReflect101>, - linearColumnFilter_caller<19, T, D, BrdColReflect101>, - linearColumnFilter_caller<20, T, D, BrdColReflect101>, - linearColumnFilter_caller<21, T, D, BrdColReflect101>, - linearColumnFilter_caller<22, T, D, BrdColReflect101>, - linearColumnFilter_caller<23, T, D, BrdColReflect101>, - linearColumnFilter_caller<24, T, D, BrdColReflect101>, - linearColumnFilter_caller<25, T, D, BrdColReflect101>, - linearColumnFilter_caller<26, T, D, BrdColReflect101>, - linearColumnFilter_caller<27, T, D, BrdColReflect101>, - linearColumnFilter_caller<28, T, D, BrdColReflect101>, - linearColumnFilter_caller<29, T, D, BrdColReflect101>, - linearColumnFilter_caller<30, T, D, BrdColReflect101>, - linearColumnFilter_caller<31, T, D, BrdColReflect101>, - linearColumnFilter_caller<32, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReplicate>, - linearColumnFilter_caller< 2, T, D, BrdColReplicate>, - linearColumnFilter_caller< 3, T, D, BrdColReplicate>, - linearColumnFilter_caller< 4, T, D, BrdColReplicate>, - linearColumnFilter_caller< 5, T, D, BrdColReplicate>, - linearColumnFilter_caller< 6, T, D, BrdColReplicate>, - linearColumnFilter_caller< 7, T, D, BrdColReplicate>, - linearColumnFilter_caller< 8, T, D, BrdColReplicate>, - linearColumnFilter_caller< 9, T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, 
T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate>, - linearColumnFilter_caller<17, T, D, BrdColReplicate>, - linearColumnFilter_caller<18, T, D, BrdColReplicate>, - linearColumnFilter_caller<19, T, D, BrdColReplicate>, - linearColumnFilter_caller<20, T, D, BrdColReplicate>, - linearColumnFilter_caller<21, T, D, BrdColReplicate>, - linearColumnFilter_caller<22, T, D, BrdColReplicate>, - linearColumnFilter_caller<23, T, D, BrdColReplicate>, - linearColumnFilter_caller<24, T, D, BrdColReplicate>, - linearColumnFilter_caller<25, T, D, BrdColReplicate>, - linearColumnFilter_caller<26, T, D, BrdColReplicate>, - linearColumnFilter_caller<27, T, D, BrdColReplicate>, - linearColumnFilter_caller<28, T, D, BrdColReplicate>, - linearColumnFilter_caller<29, T, D, BrdColReplicate>, - linearColumnFilter_caller<30, T, D, BrdColReplicate>, - linearColumnFilter_caller<31, T, D, BrdColReplicate>, - linearColumnFilter_caller<32, T, D, BrdColReplicate> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColConstant>, - linearColumnFilter_caller< 2, T, D, BrdColConstant>, - linearColumnFilter_caller< 3, T, D, BrdColConstant>, - linearColumnFilter_caller< 4, T, D, BrdColConstant>, - linearColumnFilter_caller< 5, T, D, BrdColConstant>, - linearColumnFilter_caller< 6, T, D, BrdColConstant>, - linearColumnFilter_caller< 7, T, D, BrdColConstant>, - linearColumnFilter_caller< 8, T, D, BrdColConstant>, - linearColumnFilter_caller< 9, T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant>, - linearColumnFilter_caller<17, T, D, BrdColConstant>, - linearColumnFilter_caller<18, T, D, BrdColConstant>, - linearColumnFilter_caller<19, T, D, BrdColConstant>, - linearColumnFilter_caller<20, T, D, BrdColConstant>, - linearColumnFilter_caller<21, T, D, BrdColConstant>, - linearColumnFilter_caller<22, T, D, BrdColConstant>, - linearColumnFilter_caller<23, T, D, BrdColConstant>, - linearColumnFilter_caller<24, T, D, BrdColConstant>, - linearColumnFilter_caller<25, T, D, BrdColConstant>, - linearColumnFilter_caller<26, T, D, BrdColConstant>, - linearColumnFilter_caller<27, T, D, BrdColConstant>, - linearColumnFilter_caller<28, T, D, BrdColConstant>, - linearColumnFilter_caller<29, T, D, BrdColConstant>, - linearColumnFilter_caller<30, T, D, BrdColConstant>, - linearColumnFilter_caller<31, T, D, BrdColConstant>, - linearColumnFilter_caller<32, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColReflect>, - linearColumnFilter_caller< 2, T, D, BrdColReflect>, - linearColumnFilter_caller< 3, T, D, BrdColReflect>, - linearColumnFilter_caller< 4, T, D, BrdColReflect>, - linearColumnFilter_caller< 5, T, D, BrdColReflect>, - linearColumnFilter_caller< 6, T, D, BrdColReflect>, - linearColumnFilter_caller< 7, T, D, BrdColReflect>, - linearColumnFilter_caller< 8, T, D, BrdColReflect>, - linearColumnFilter_caller< 9, T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, 
D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect>, - linearColumnFilter_caller<17, T, D, BrdColReflect>, - linearColumnFilter_caller<18, T, D, BrdColReflect>, - linearColumnFilter_caller<19, T, D, BrdColReflect>, - linearColumnFilter_caller<20, T, D, BrdColReflect>, - linearColumnFilter_caller<21, T, D, BrdColReflect>, - linearColumnFilter_caller<22, T, D, BrdColReflect>, - linearColumnFilter_caller<23, T, D, BrdColReflect>, - linearColumnFilter_caller<24, T, D, BrdColReflect>, - linearColumnFilter_caller<25, T, D, BrdColReflect>, - linearColumnFilter_caller<26, T, D, BrdColReflect>, - linearColumnFilter_caller<27, T, D, BrdColReflect>, - linearColumnFilter_caller<28, T, D, BrdColReflect>, - linearColumnFilter_caller<29, T, D, BrdColReflect>, - linearColumnFilter_caller<30, T, D, BrdColReflect>, - linearColumnFilter_caller<31, T, D, BrdColReflect>, - linearColumnFilter_caller<32, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller< 1, T, D, BrdColWrap>, - linearColumnFilter_caller< 2, T, D, BrdColWrap>, - linearColumnFilter_caller< 3, T, D, BrdColWrap>, - linearColumnFilter_caller< 4, T, D, BrdColWrap>, - linearColumnFilter_caller< 5, T, D, BrdColWrap>, - linearColumnFilter_caller< 6, T, D, BrdColWrap>, - linearColumnFilter_caller< 7, T, D, BrdColWrap>, - linearColumnFilter_caller< 8, T, D, BrdColWrap>, - linearColumnFilter_caller< 9, T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, - linearColumnFilter_caller<17, T, D, BrdColWrap>, - linearColumnFilter_caller<18, T, D, BrdColWrap>, - linearColumnFilter_caller<19, T, D, BrdColWrap>, - linearColumnFilter_caller<20, T, D, BrdColWrap>, - linearColumnFilter_caller<21, T, D, BrdColWrap>, - linearColumnFilter_caller<22, T, D, BrdColWrap>, - linearColumnFilter_caller<23, T, D, BrdColWrap>, - linearColumnFilter_caller<24, T, D, BrdColWrap>, - linearColumnFilter_caller<25, T, D, BrdColWrap>, - linearColumnFilter_caller<26, T, D, BrdColWrap>, - linearColumnFilter_caller<27, T, D, BrdColWrap>, - linearColumnFilter_caller<28, T, D, BrdColWrap>, - linearColumnFilter_caller<29, T, D, BrdColWrap>, - linearColumnFilter_caller<30, T, D, BrdColWrap>, - linearColumnFilter_caller<31, T, D, BrdColWrap>, - linearColumnFilter_caller<32, T, D, BrdColWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); - } - - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void 
linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace column_filter -}}} // namespace cv { namespace gpu { namespace device - - -#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/column_filter.h b/modules/gpu/src/cuda/column_filter.h new file mode 100644 index 000000000..dbcd09fa3 --- /dev/null +++ b/modules/gpu/src/cuda/column_filter.h @@ -0,0 +1,378 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/border_interpolate.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace +{ + #define MAX_KERNEL_SIZE 32 + + __constant__ float c_kernel[MAX_KERNEL_SIZE]; + + void loadKernel(const float* kernel, int ksize, cudaStream_t stream) + { + if (stream == 0) + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); + } + + template + __global__ void linearColumnFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) + { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 16; + const int PATCH_PER_BLOCK = 4; + const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; + #else + const int BLOCK_DIM_X = 16; + const int BLOCK_DIM_Y = 8; + const int PATCH_PER_BLOCK = 2; + const int HALO_SIZE = 2; + #endif + + typedef typename TypeVec::cn>::vec_type sum_t; + + __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; + + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + + if (x >= src.cols) + return; + + const T* src_col = src.ptr() + x; + + const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; + + if (blockIdx.y > 0) + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); + } + else + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); + } + + if (blockIdx.y + 2 < gridDim.y) + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); + } + else + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + { + const int y = yStart + j * BLOCK_DIM_Y; + + if (y < src.rows) + { + sum_t sum = VecTraits::all(0); + + #pragma unroll + for (int k = 0; k < KSIZE; ++k) + sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; + + dst(y, x) = saturate_cast(sum); + } + } + } + + template class B> + void caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) + { + int BLOCK_DIM_X; + int BLOCK_DIM_Y; + int PATCH_PER_BLOCK; + + if (cc >= 20) + { + BLOCK_DIM_X = 16; + 
BLOCK_DIM_Y = 16; + PATCH_PER_BLOCK = 4; + } + else + { + BLOCK_DIM_X = 16; + BLOCK_DIM_Y = 8; + PATCH_PER_BLOCK = 2; + } + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); + + B brd(src.rows); + + linearColumnFilter<<>>(src, dst, anchor, brd); + + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +namespace filter +{ + template + void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) + { + typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); + + static const caller_t callers[5][33] = + { + { + 0, + ::caller< 1, T, D, BrdColReflect101>, + ::caller< 2, T, D, BrdColReflect101>, + ::caller< 3, T, D, BrdColReflect101>, + ::caller< 4, T, D, BrdColReflect101>, + ::caller< 5, T, D, BrdColReflect101>, + ::caller< 6, T, D, BrdColReflect101>, + ::caller< 7, T, D, BrdColReflect101>, + ::caller< 8, T, D, BrdColReflect101>, + ::caller< 9, T, D, BrdColReflect101>, + ::caller<10, T, D, BrdColReflect101>, + ::caller<11, T, D, BrdColReflect101>, + ::caller<12, T, D, BrdColReflect101>, + ::caller<13, T, D, BrdColReflect101>, + ::caller<14, T, D, BrdColReflect101>, + ::caller<15, T, D, BrdColReflect101>, + ::caller<16, T, D, BrdColReflect101>, + ::caller<17, T, D, BrdColReflect101>, + ::caller<18, T, D, BrdColReflect101>, + ::caller<19, T, D, BrdColReflect101>, + ::caller<20, T, D, BrdColReflect101>, + ::caller<21, T, D, BrdColReflect101>, + ::caller<22, T, D, BrdColReflect101>, + ::caller<23, T, D, BrdColReflect101>, + ::caller<24, T, D, BrdColReflect101>, + ::caller<25, T, D, BrdColReflect101>, + ::caller<26, T, D, BrdColReflect101>, + ::caller<27, T, D, BrdColReflect101>, + ::caller<28, T, D, BrdColReflect101>, + ::caller<29, T, D, BrdColReflect101>, + ::caller<30, T, D, BrdColReflect101>, + ::caller<31, T, D, BrdColReflect101>, + ::caller<32, T, D, BrdColReflect101> + }, + { + 0, + ::caller< 1, T, D, BrdColReplicate>, + ::caller< 2, T, D, BrdColReplicate>, + ::caller< 3, T, D, BrdColReplicate>, + ::caller< 4, T, D, BrdColReplicate>, + ::caller< 5, T, D, BrdColReplicate>, + ::caller< 6, T, D, BrdColReplicate>, + ::caller< 7, T, D, BrdColReplicate>, + ::caller< 8, T, D, BrdColReplicate>, + ::caller< 9, T, D, BrdColReplicate>, + ::caller<10, T, D, BrdColReplicate>, + ::caller<11, T, D, BrdColReplicate>, + ::caller<12, T, D, BrdColReplicate>, + ::caller<13, T, D, BrdColReplicate>, + ::caller<14, T, D, BrdColReplicate>, + ::caller<15, T, D, BrdColReplicate>, + ::caller<16, T, D, BrdColReplicate>, + ::caller<17, T, D, BrdColReplicate>, + ::caller<18, T, D, BrdColReplicate>, + ::caller<19, T, D, BrdColReplicate>, + ::caller<20, T, D, BrdColReplicate>, + ::caller<21, T, D, BrdColReplicate>, + ::caller<22, T, D, BrdColReplicate>, + ::caller<23, T, D, BrdColReplicate>, + ::caller<24, T, D, BrdColReplicate>, + ::caller<25, T, D, BrdColReplicate>, + ::caller<26, T, D, BrdColReplicate>, + ::caller<27, T, D, BrdColReplicate>, + ::caller<28, T, D, BrdColReplicate>, + ::caller<29, T, D, BrdColReplicate>, + ::caller<30, T, D, BrdColReplicate>, + ::caller<31, T, D, BrdColReplicate>, + ::caller<32, T, D, BrdColReplicate> + }, + { + 0, + ::caller< 1, T, D, BrdColConstant>, + ::caller< 2, T, D, BrdColConstant>, + ::caller< 3, T, D, BrdColConstant>, + ::caller< 4, T, D, BrdColConstant>, + ::caller< 5, T, D, BrdColConstant>, + ::caller< 6, T, D, 
+                ::caller< 7, T, D, BrdColConstant>,
+                ::caller< 8, T, D, BrdColConstant>,
+                ::caller< 9, T, D, BrdColConstant>,
+                ::caller<10, T, D, BrdColConstant>,
+                ::caller<11, T, D, BrdColConstant>,
+                ::caller<12, T, D, BrdColConstant>,
+                ::caller<13, T, D, BrdColConstant>,
+                ::caller<14, T, D, BrdColConstant>,
+                ::caller<15, T, D, BrdColConstant>,
+                ::caller<16, T, D, BrdColConstant>,
+                ::caller<17, T, D, BrdColConstant>,
+                ::caller<18, T, D, BrdColConstant>,
+                ::caller<19, T, D, BrdColConstant>,
+                ::caller<20, T, D, BrdColConstant>,
+                ::caller<21, T, D, BrdColConstant>,
+                ::caller<22, T, D, BrdColConstant>,
+                ::caller<23, T, D, BrdColConstant>,
+                ::caller<24, T, D, BrdColConstant>,
+                ::caller<25, T, D, BrdColConstant>,
+                ::caller<26, T, D, BrdColConstant>,
+                ::caller<27, T, D, BrdColConstant>,
+                ::caller<28, T, D, BrdColConstant>,
+                ::caller<29, T, D, BrdColConstant>,
+                ::caller<30, T, D, BrdColConstant>,
+                ::caller<31, T, D, BrdColConstant>,
+                ::caller<32, T, D, BrdColConstant>
+            },
+            {
+                0,
+                ::caller< 1, T, D, BrdColReflect>,
+                ::caller< 2, T, D, BrdColReflect>,
+                ::caller< 3, T, D, BrdColReflect>,
+                ::caller< 4, T, D, BrdColReflect>,
+                ::caller< 5, T, D, BrdColReflect>,
+                ::caller< 6, T, D, BrdColReflect>,
+                ::caller< 7, T, D, BrdColReflect>,
+                ::caller< 8, T, D, BrdColReflect>,
+                ::caller< 9, T, D, BrdColReflect>,
+                ::caller<10, T, D, BrdColReflect>,
+                ::caller<11, T, D, BrdColReflect>,
+                ::caller<12, T, D, BrdColReflect>,
+                ::caller<13, T, D, BrdColReflect>,
+                ::caller<14, T, D, BrdColReflect>,
+                ::caller<15, T, D, BrdColReflect>,
+                ::caller<16, T, D, BrdColReflect>,
+                ::caller<17, T, D, BrdColReflect>,
+                ::caller<18, T, D, BrdColReflect>,
+                ::caller<19, T, D, BrdColReflect>,
+                ::caller<20, T, D, BrdColReflect>,
+                ::caller<21, T, D, BrdColReflect>,
+                ::caller<22, T, D, BrdColReflect>,
+                ::caller<23, T, D, BrdColReflect>,
+                ::caller<24, T, D, BrdColReflect>,
+                ::caller<25, T, D, BrdColReflect>,
+                ::caller<26, T, D, BrdColReflect>,
+                ::caller<27, T, D, BrdColReflect>,
+                ::caller<28, T, D, BrdColReflect>,
+                ::caller<29, T, D, BrdColReflect>,
+                ::caller<30, T, D, BrdColReflect>,
+                ::caller<31, T, D, BrdColReflect>,
+                ::caller<32, T, D, BrdColReflect>
+            },
+            {
+                0,
+                ::caller< 1, T, D, BrdColWrap>,
+                ::caller< 2, T, D, BrdColWrap>,
+                ::caller< 3, T, D, BrdColWrap>,
+                ::caller< 4, T, D, BrdColWrap>,
+                ::caller< 5, T, D, BrdColWrap>,
+                ::caller< 6, T, D, BrdColWrap>,
+                ::caller< 7, T, D, BrdColWrap>,
+                ::caller< 8, T, D, BrdColWrap>,
+                ::caller< 9, T, D, BrdColWrap>,
+                ::caller<10, T, D, BrdColWrap>,
+                ::caller<11, T, D, BrdColWrap>,
+                ::caller<12, T, D, BrdColWrap>,
+                ::caller<13, T, D, BrdColWrap>,
+                ::caller<14, T, D, BrdColWrap>,
+                ::caller<15, T, D, BrdColWrap>,
+                ::caller<16, T, D, BrdColWrap>,
+                ::caller<17, T, D, BrdColWrap>,
+                ::caller<18, T, D, BrdColWrap>,
+                ::caller<19, T, D, BrdColWrap>,
+                ::caller<20, T, D, BrdColWrap>,
+                ::caller<21, T, D, BrdColWrap>,
+                ::caller<22, T, D, BrdColWrap>,
+                ::caller<23, T, D, BrdColWrap>,
+                ::caller<24, T, D, BrdColWrap>,
+                ::caller<25, T, D, BrdColWrap>,
+                ::caller<26, T, D, BrdColWrap>,
+                ::caller<27, T, D, BrdColWrap>,
+                ::caller<28, T, D, BrdColWrap>,
+                ::caller<29, T, D, BrdColWrap>,
+                ::caller<30, T, D, BrdColWrap>,
+                ::caller<31, T, D, BrdColWrap>,
+                ::caller<32, T, D, BrdColWrap>
+            }
+        };
+
+        ::loadKernel(kernel, ksize, stream);
+
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+    }
+}
diff --git a/modules/gpu/src/cuda/row_filter.0.cu b/modules/gpu/src/cuda/row_filter.0.cu
new file mode 100644
index 000000000..a1a8f36ca --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.0.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.1.cu b/modules/gpu/src/cuda/row_filter.1.cu new file mode 100644 index 000000000..ab2248e1b --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.1.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.2.cu b/modules/gpu/src/cuda/row_filter.2.cu new file mode 100644 index 000000000..5aa2e2b80 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.2.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.3.cu b/modules/gpu/src/cuda/row_filter.3.cu new file mode 100644 index 000000000..9d131a959 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.3.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.4.cu b/modules/gpu/src/cuda/row_filter.4.cu new file mode 100644 index 000000000..0aae534ce --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.4.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.5.cu b/modules/gpu/src/cuda/row_filter.5.cu new file mode 100644 index 000000000..dd1f2be13 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.5.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.6.cu b/modules/gpu/src/cuda/row_filter.6.cu new file mode 100644 index 000000000..548069d36 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.6.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.7.cu b/modules/gpu/src/cuda/row_filter.7.cu new file mode 100644 index 000000000..8c5c09ed9 --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.7.cu @@ -0,0 +1,53 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER + +#include "row_filter.h" + +namespace filter +{ + template void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu deleted file mode 100644 index 39fc53fdc..000000000 --- a/modules/gpu/src/cuda/row_filter.cu +++ /dev/null @@ -1,390 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#if !defined CUDA_DISABLER - -#include "internal_shared.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" -#include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/limits.hpp" -#include "opencv2/gpu/device/border_interpolate.hpp" -#include "opencv2/gpu/device/static_check.hpp" - -namespace cv { namespace gpu { namespace device -{ - namespace row_filter - { - #define MAX_KERNEL_SIZE 32 - - __constant__ float c_kernel[MAX_KERNEL_SIZE]; - - void loadKernel(const float* kernel, int ksize, cudaStream_t stream) - { - if (stream == 0) - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); - else - cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); - } - - template - __global__ void linearRowFilter(const PtrStepSz src, PtrStep dst, const int anchor, const B brd) - { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) - const int BLOCK_DIM_X = 32; - const int BLOCK_DIM_Y = 8; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = 1; - #else - const int BLOCK_DIM_X = 32; - const int BLOCK_DIM_Y = 4; - const int PATCH_PER_BLOCK = 4; - const int HALO_SIZE = 1; - #endif - - typedef typename TypeVec::cn>::vec_type sum_t; - - __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; - - const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; - - if (y >= src.rows) - return; - - const T* src_row = src.ptr(y); - - const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; - - if (blockIdx.x > 0) - { - //Load left halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); - } - else - { - //Load left halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); - } - - if (blockIdx.x + 2 < gridDim.x) - { - //Load main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + j * BLOCK_DIM_X]); - - //Load right halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); - } - else - { - //Load main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); - - //Load right halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); - } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - { - const int x = xStart + j * BLOCK_DIM_X; - - if (x < src.cols) - { - sum_t sum = VecTraits::all(0); - - #pragma unroll - for (int k = 0; k < KSIZE; ++k) - sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; - - dst(y, x) = saturate_cast(sum); - } - } - } - - template class B> - void linearRowFilter_caller(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream) - { - 
int BLOCK_DIM_X; - int BLOCK_DIM_Y; - int PATCH_PER_BLOCK; - - if (cc >= 20) - { - BLOCK_DIM_X = 32; - BLOCK_DIM_Y = 8; - PATCH_PER_BLOCK = 4; - } - else - { - BLOCK_DIM_X = 32; - BLOCK_DIM_Y = 4; - PATCH_PER_BLOCK = 4; - } - - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); - - B brd(src.cols); - - linearRowFilter<<>>(src, dst, anchor, brd); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template - void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) - { - typedef void (*caller_t)(PtrStepSz src, PtrStepSz dst, int anchor, int cc, cudaStream_t stream); - - static const caller_t callers[5][33] = - { - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReflect101>, - linearRowFilter_caller< 2, T, D, BrdRowReflect101>, - linearRowFilter_caller< 3, T, D, BrdRowReflect101>, - linearRowFilter_caller< 4, T, D, BrdRowReflect101>, - linearRowFilter_caller< 5, T, D, BrdRowReflect101>, - linearRowFilter_caller< 6, T, D, BrdRowReflect101>, - linearRowFilter_caller< 7, T, D, BrdRowReflect101>, - linearRowFilter_caller< 8, T, D, BrdRowReflect101>, - linearRowFilter_caller< 9, T, D, BrdRowReflect101>, - linearRowFilter_caller<10, T, D, BrdRowReflect101>, - linearRowFilter_caller<11, T, D, BrdRowReflect101>, - linearRowFilter_caller<12, T, D, BrdRowReflect101>, - linearRowFilter_caller<13, T, D, BrdRowReflect101>, - linearRowFilter_caller<14, T, D, BrdRowReflect101>, - linearRowFilter_caller<15, T, D, BrdRowReflect101>, - linearRowFilter_caller<16, T, D, BrdRowReflect101>, - linearRowFilter_caller<17, T, D, BrdRowReflect101>, - linearRowFilter_caller<18, T, D, BrdRowReflect101>, - linearRowFilter_caller<19, T, D, BrdRowReflect101>, - linearRowFilter_caller<20, T, D, BrdRowReflect101>, - linearRowFilter_caller<21, T, D, BrdRowReflect101>, - linearRowFilter_caller<22, T, D, BrdRowReflect101>, - linearRowFilter_caller<23, T, D, BrdRowReflect101>, - linearRowFilter_caller<24, T, D, BrdRowReflect101>, - linearRowFilter_caller<25, T, D, BrdRowReflect101>, - linearRowFilter_caller<26, T, D, BrdRowReflect101>, - linearRowFilter_caller<27, T, D, BrdRowReflect101>, - linearRowFilter_caller<28, T, D, BrdRowReflect101>, - linearRowFilter_caller<29, T, D, BrdRowReflect101>, - linearRowFilter_caller<30, T, D, BrdRowReflect101>, - linearRowFilter_caller<31, T, D, BrdRowReflect101>, - linearRowFilter_caller<32, T, D, BrdRowReflect101> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReplicate>, - linearRowFilter_caller< 2, T, D, BrdRowReplicate>, - linearRowFilter_caller< 3, T, D, BrdRowReplicate>, - linearRowFilter_caller< 4, T, D, BrdRowReplicate>, - linearRowFilter_caller< 5, T, D, BrdRowReplicate>, - linearRowFilter_caller< 6, T, D, BrdRowReplicate>, - linearRowFilter_caller< 7, T, D, BrdRowReplicate>, - linearRowFilter_caller< 8, T, D, BrdRowReplicate>, - linearRowFilter_caller< 9, T, D, BrdRowReplicate>, - linearRowFilter_caller<10, T, D, BrdRowReplicate>, - linearRowFilter_caller<11, T, D, BrdRowReplicate>, - linearRowFilter_caller<12, T, D, BrdRowReplicate>, - linearRowFilter_caller<13, T, D, BrdRowReplicate>, - linearRowFilter_caller<14, T, D, BrdRowReplicate>, - linearRowFilter_caller<15, T, D, BrdRowReplicate>, - linearRowFilter_caller<16, T, D, BrdRowReplicate>, - linearRowFilter_caller<17, T, D, BrdRowReplicate>, - linearRowFilter_caller<18, T, D, 
BrdRowReplicate>, - linearRowFilter_caller<19, T, D, BrdRowReplicate>, - linearRowFilter_caller<20, T, D, BrdRowReplicate>, - linearRowFilter_caller<21, T, D, BrdRowReplicate>, - linearRowFilter_caller<22, T, D, BrdRowReplicate>, - linearRowFilter_caller<23, T, D, BrdRowReplicate>, - linearRowFilter_caller<24, T, D, BrdRowReplicate>, - linearRowFilter_caller<25, T, D, BrdRowReplicate>, - linearRowFilter_caller<26, T, D, BrdRowReplicate>, - linearRowFilter_caller<27, T, D, BrdRowReplicate>, - linearRowFilter_caller<28, T, D, BrdRowReplicate>, - linearRowFilter_caller<29, T, D, BrdRowReplicate>, - linearRowFilter_caller<30, T, D, BrdRowReplicate>, - linearRowFilter_caller<31, T, D, BrdRowReplicate>, - linearRowFilter_caller<32, T, D, BrdRowReplicate> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowConstant>, - linearRowFilter_caller< 2, T, D, BrdRowConstant>, - linearRowFilter_caller< 3, T, D, BrdRowConstant>, - linearRowFilter_caller< 4, T, D, BrdRowConstant>, - linearRowFilter_caller< 5, T, D, BrdRowConstant>, - linearRowFilter_caller< 6, T, D, BrdRowConstant>, - linearRowFilter_caller< 7, T, D, BrdRowConstant>, - linearRowFilter_caller< 8, T, D, BrdRowConstant>, - linearRowFilter_caller< 9, T, D, BrdRowConstant>, - linearRowFilter_caller<10, T, D, BrdRowConstant>, - linearRowFilter_caller<11, T, D, BrdRowConstant>, - linearRowFilter_caller<12, T, D, BrdRowConstant>, - linearRowFilter_caller<13, T, D, BrdRowConstant>, - linearRowFilter_caller<14, T, D, BrdRowConstant>, - linearRowFilter_caller<15, T, D, BrdRowConstant>, - linearRowFilter_caller<16, T, D, BrdRowConstant>, - linearRowFilter_caller<17, T, D, BrdRowConstant>, - linearRowFilter_caller<18, T, D, BrdRowConstant>, - linearRowFilter_caller<19, T, D, BrdRowConstant>, - linearRowFilter_caller<20, T, D, BrdRowConstant>, - linearRowFilter_caller<21, T, D, BrdRowConstant>, - linearRowFilter_caller<22, T, D, BrdRowConstant>, - linearRowFilter_caller<23, T, D, BrdRowConstant>, - linearRowFilter_caller<24, T, D, BrdRowConstant>, - linearRowFilter_caller<25, T, D, BrdRowConstant>, - linearRowFilter_caller<26, T, D, BrdRowConstant>, - linearRowFilter_caller<27, T, D, BrdRowConstant>, - linearRowFilter_caller<28, T, D, BrdRowConstant>, - linearRowFilter_caller<29, T, D, BrdRowConstant>, - linearRowFilter_caller<30, T, D, BrdRowConstant>, - linearRowFilter_caller<31, T, D, BrdRowConstant>, - linearRowFilter_caller<32, T, D, BrdRowConstant> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowReflect>, - linearRowFilter_caller< 2, T, D, BrdRowReflect>, - linearRowFilter_caller< 3, T, D, BrdRowReflect>, - linearRowFilter_caller< 4, T, D, BrdRowReflect>, - linearRowFilter_caller< 5, T, D, BrdRowReflect>, - linearRowFilter_caller< 6, T, D, BrdRowReflect>, - linearRowFilter_caller< 7, T, D, BrdRowReflect>, - linearRowFilter_caller< 8, T, D, BrdRowReflect>, - linearRowFilter_caller< 9, T, D, BrdRowReflect>, - linearRowFilter_caller<10, T, D, BrdRowReflect>, - linearRowFilter_caller<11, T, D, BrdRowReflect>, - linearRowFilter_caller<12, T, D, BrdRowReflect>, - linearRowFilter_caller<13, T, D, BrdRowReflect>, - linearRowFilter_caller<14, T, D, BrdRowReflect>, - linearRowFilter_caller<15, T, D, BrdRowReflect>, - linearRowFilter_caller<16, T, D, BrdRowReflect>, - linearRowFilter_caller<17, T, D, BrdRowReflect>, - linearRowFilter_caller<18, T, D, BrdRowReflect>, - linearRowFilter_caller<19, T, D, BrdRowReflect>, - linearRowFilter_caller<20, T, D, BrdRowReflect>, - linearRowFilter_caller<21, T, D, BrdRowReflect>, - linearRowFilter_caller<22, T, 
D, BrdRowReflect>, - linearRowFilter_caller<23, T, D, BrdRowReflect>, - linearRowFilter_caller<24, T, D, BrdRowReflect>, - linearRowFilter_caller<25, T, D, BrdRowReflect>, - linearRowFilter_caller<26, T, D, BrdRowReflect>, - linearRowFilter_caller<27, T, D, BrdRowReflect>, - linearRowFilter_caller<28, T, D, BrdRowReflect>, - linearRowFilter_caller<29, T, D, BrdRowReflect>, - linearRowFilter_caller<30, T, D, BrdRowReflect>, - linearRowFilter_caller<31, T, D, BrdRowReflect>, - linearRowFilter_caller<32, T, D, BrdRowReflect> - }, - { - 0, - linearRowFilter_caller< 1, T, D, BrdRowWrap>, - linearRowFilter_caller< 2, T, D, BrdRowWrap>, - linearRowFilter_caller< 3, T, D, BrdRowWrap>, - linearRowFilter_caller< 4, T, D, BrdRowWrap>, - linearRowFilter_caller< 5, T, D, BrdRowWrap>, - linearRowFilter_caller< 6, T, D, BrdRowWrap>, - linearRowFilter_caller< 7, T, D, BrdRowWrap>, - linearRowFilter_caller< 8, T, D, BrdRowWrap>, - linearRowFilter_caller< 9, T, D, BrdRowWrap>, - linearRowFilter_caller<10, T, D, BrdRowWrap>, - linearRowFilter_caller<11, T, D, BrdRowWrap>, - linearRowFilter_caller<12, T, D, BrdRowWrap>, - linearRowFilter_caller<13, T, D, BrdRowWrap>, - linearRowFilter_caller<14, T, D, BrdRowWrap>, - linearRowFilter_caller<15, T, D, BrdRowWrap>, - linearRowFilter_caller<16, T, D, BrdRowWrap>, - linearRowFilter_caller<17, T, D, BrdRowWrap>, - linearRowFilter_caller<18, T, D, BrdRowWrap>, - linearRowFilter_caller<19, T, D, BrdRowWrap>, - linearRowFilter_caller<20, T, D, BrdRowWrap>, - linearRowFilter_caller<21, T, D, BrdRowWrap>, - linearRowFilter_caller<22, T, D, BrdRowWrap>, - linearRowFilter_caller<23, T, D, BrdRowWrap>, - linearRowFilter_caller<24, T, D, BrdRowWrap>, - linearRowFilter_caller<25, T, D, BrdRowWrap>, - linearRowFilter_caller<26, T, D, BrdRowWrap>, - linearRowFilter_caller<27, T, D, BrdRowWrap>, - linearRowFilter_caller<28, T, D, BrdRowWrap>, - linearRowFilter_caller<29, T, D, BrdRowWrap>, - linearRowFilter_caller<30, T, D, BrdRowWrap>, - linearRowFilter_caller<31, T, D, BrdRowWrap>, - linearRowFilter_caller<32, T, D, BrdRowWrap> - } - }; - - loadKernel(kernel, ksize, stream); - - callers[brd_type][ksize]((PtrStepSz)src, (PtrStepSz)dst, anchor, cc, stream); - } - - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - template void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream); - } // namespace row_filter -}}} // namespace cv { namespace gpu { namespace device - - 
-#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/row_filter.h b/modules/gpu/src/cuda/row_filter.h new file mode 100644 index 000000000..0da2dfe0c --- /dev/null +++ b/modules/gpu/src/cuda/row_filter.h @@ -0,0 +1,377 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+//
+//M*/
+
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/border_interpolate.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
+{
+    #define MAX_KERNEL_SIZE 32
+
+    __constant__ float c_kernel[MAX_KERNEL_SIZE];
+
+    void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
+    {
+        if (stream == 0)
+            cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
+        else
+            cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
+    }
+
+    template <int KSIZE, typename T, typename D, typename B>
+    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    {
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
+            const int BLOCK_DIM_X = 32;
+            const int BLOCK_DIM_Y = 8;
+            const int PATCH_PER_BLOCK = 4;
+            const int HALO_SIZE = 1;
+        #else
+            const int BLOCK_DIM_X = 32;
+            const int BLOCK_DIM_Y = 4;
+            const int PATCH_PER_BLOCK = 4;
+            const int HALO_SIZE = 1;
+        #endif
+
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
+
+        __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
+
+        const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
+
+        if (y >= src.rows)
+            return;
+
+        const T* src_row = src.ptr(y);
+
+        const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
+
+        if (blockIdx.x > 0)
+        {
+            //Load left halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
+        }
+        else
+        {
+            //Load left halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
+        }
+
+        if (blockIdx.x + 2 < gridDim.x)
+        {
+            //Load main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+                smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
+
+            //Load right halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
+        }
+        else
+        {
+            //Load main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+                smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
+
+            //Load right halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
+        }
+
+        __syncthreads();
+
+        #pragma unroll
+        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+        {
+            const int x = xStart + j * BLOCK_DIM_X;
+
+            if (x < src.cols)
+            {
+                sum_t sum = VecTraits<sum_t>::all(0);
+
+                #pragma unroll
+                for (int k = 0; k < KSIZE; ++k)
+                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
+
+                dst(y, x) = saturate_cast<D>(sum);
+            }
+        }
+    }
+
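+    // Host-side launcher for linearRowFilter: picks the launch geometry that
+    // matches the kernel's compiled-in constants for the given compute
+    // capability (cc) and enqueues the kernel on the caller's stream.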
+    template <int KSIZE, typename T, typename D, template<typename> class B>
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    {
+        int BLOCK_DIM_X;
+        int BLOCK_DIM_Y;
+        int PATCH_PER_BLOCK;
+
+        if (cc >= 20)
+        {
+            BLOCK_DIM_X = 32;
+            BLOCK_DIM_Y = 8;
+            PATCH_PER_BLOCK = 4;
+        }
+        else
+        {
+            BLOCK_DIM_X = 32;
+            BLOCK_DIM_Y = 4;
+            PATCH_PER_BLOCK = 4;
+        }
+
+        const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
+        const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
+
+        B<T> brd(src.cols);
+
+        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+namespace filter
+{
+    template <typename T, typename D>
+    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+
+        static const caller_t callers[5][33] =
+        {
+            {
+                0,
+                ::caller< 1, T, D, BrdRowReflect101>,
+                ::caller< 2, T, D, BrdRowReflect101>,
+                ::caller< 3, T, D, BrdRowReflect101>,
+                ::caller< 4, T, D, BrdRowReflect101>,
+                ::caller< 5, T, D, BrdRowReflect101>,
+                ::caller< 6, T, D, BrdRowReflect101>,
+                ::caller< 7, T, D, BrdRowReflect101>,
+                ::caller< 8, T, D, BrdRowReflect101>,
+                ::caller< 9, T, D, BrdRowReflect101>,
+                ::caller<10, T, D, BrdRowReflect101>,
+                ::caller<11, T, D, BrdRowReflect101>,
+                ::caller<12, T, D, BrdRowReflect101>,
+                ::caller<13, T, D, BrdRowReflect101>,
+                ::caller<14, T, D, BrdRowReflect101>,
+                ::caller<15, T, D, BrdRowReflect101>,
+                ::caller<16, T, D, BrdRowReflect101>,
+                ::caller<17, T, D, BrdRowReflect101>,
+                ::caller<18, T, D, BrdRowReflect101>,
+                ::caller<19, T, D, BrdRowReflect101>,
+                ::caller<20, T, D, BrdRowReflect101>,
+                ::caller<21, T, D, BrdRowReflect101>,
+                ::caller<22, T, D, BrdRowReflect101>,
+                ::caller<23, T, D, BrdRowReflect101>,
+                ::caller<24, T, D, BrdRowReflect101>,
+                ::caller<25, T, D, BrdRowReflect101>,
+                ::caller<26, T, D, BrdRowReflect101>,
+                ::caller<27, T, D, BrdRowReflect101>,
+                ::caller<28, T, D, BrdRowReflect101>,
+                ::caller<29, T, D, BrdRowReflect101>,
+                ::caller<30, T, D, BrdRowReflect101>,
+                ::caller<31, T, D, BrdRowReflect101>,
+                ::caller<32, T, D, BrdRowReflect101>
+            },
+            {
+                0,
+                ::caller< 1, T, D, BrdRowReplicate>,
+                ::caller< 2, T, D, BrdRowReplicate>,
+                ::caller< 3, T, D, BrdRowReplicate>,
+                ::caller< 4, T, D, BrdRowReplicate>,
+                ::caller< 5, T, D, BrdRowReplicate>,
+                ::caller< 6, T, D, BrdRowReplicate>,
+                ::caller< 7, T, D, BrdRowReplicate>,
+                ::caller< 8, T, D, BrdRowReplicate>,
+                ::caller< 9, T, D, BrdRowReplicate>,
+                ::caller<10, T, D, BrdRowReplicate>,
+                ::caller<11, T, D, BrdRowReplicate>,
+                ::caller<12, T, D, BrdRowReplicate>,
+                ::caller<13, T, D, BrdRowReplicate>,
+                ::caller<14, T, D, BrdRowReplicate>,
+                ::caller<15, T, D, BrdRowReplicate>,
+                ::caller<16, T, D, BrdRowReplicate>,
+                ::caller<17, T, D, BrdRowReplicate>,
+                ::caller<18, T, D, BrdRowReplicate>,
+                ::caller<19, T, D, BrdRowReplicate>,
+                ::caller<20, T, D, BrdRowReplicate>,
+                ::caller<21, T, D, BrdRowReplicate>,
+                ::caller<22, T, D, BrdRowReplicate>,
+                ::caller<23, T, D, BrdRowReplicate>,
+                ::caller<24, T, D, BrdRowReplicate>,
+                ::caller<25, T, D, BrdRowReplicate>,
+                ::caller<26, T, D, BrdRowReplicate>,
+                ::caller<27, T, D, BrdRowReplicate>,
+                ::caller<28, T, D, BrdRowReplicate>,
+                ::caller<29, T, D, BrdRowReplicate>,
+                ::caller<30, T, D, BrdRowReplicate>,
+                ::caller<31, T, D, BrdRowReplicate>,
+                ::caller<32, T, D, BrdRowReplicate>
+            },
+            {
+                0,
+                ::caller< 1, T, D, BrdRowConstant>,
+                ::caller< 2, T, D, BrdRowConstant>,
+                ::caller< 3, T, D, BrdRowConstant>,
+                ::caller< 4, T, D, BrdRowConstant>,
+                ::caller< 5, T, D, BrdRowConstant>,
+                ::caller< 6, T, D, BrdRowConstant>,
+                ::caller< 7, T, D, BrdRowConstant>,
+                ::caller< 8, T, D, BrdRowConstant>,
+                ::caller< 9, T, D, BrdRowConstant>,
+                ::caller<10, T, D, BrdRowConstant>,
+                ::caller<11, T, D, BrdRowConstant>,
+                ::caller<12, T, D, BrdRowConstant>,
+                ::caller<13, T, D, BrdRowConstant>,
+                ::caller<14, T, D, BrdRowConstant>,
+                ::caller<15, T, D, BrdRowConstant>,
+                ::caller<16, T, D, BrdRowConstant>,
+                ::caller<17, T, D, BrdRowConstant>,
+                ::caller<18, T, D, BrdRowConstant>,
+                ::caller<19, T, D, BrdRowConstant>,
+                ::caller<20, T, D, BrdRowConstant>,
+                ::caller<21, T, D, BrdRowConstant>,
+                ::caller<22, T, D, BrdRowConstant>,
+                ::caller<23, T, D, BrdRowConstant>,
+                ::caller<24, T, D, BrdRowConstant>,
+                ::caller<25, T, D, BrdRowConstant>,
+                ::caller<26, T, D, BrdRowConstant>,
+                ::caller<27, T, D, BrdRowConstant>,
+                ::caller<28, T, D, BrdRowConstant>,
+                ::caller<29, T, D, BrdRowConstant>,
+                ::caller<30, T, D, BrdRowConstant>,
+                ::caller<31, T, D, BrdRowConstant>,
+                ::caller<32, T, D, BrdRowConstant>
+            },
+            {
+                0,
+                ::caller< 1, T, D, BrdRowReflect>,
+                ::caller< 2, T, D, BrdRowReflect>,
+                ::caller< 3, T, D, BrdRowReflect>,
+                ::caller< 4, T, D, BrdRowReflect>,
+                ::caller< 5, T, D, BrdRowReflect>,
+                ::caller< 6, T, D, BrdRowReflect>,
+                ::caller< 7, T, D, BrdRowReflect>,
+                ::caller< 8, T, D, BrdRowReflect>,
+                ::caller< 9, T, D, BrdRowReflect>,
+                ::caller<10, T, D, BrdRowReflect>,
+                ::caller<11, T, D, BrdRowReflect>,
+                ::caller<12, T, D, BrdRowReflect>,
+                ::caller<13, T, D, BrdRowReflect>,
+                ::caller<14, T, D, BrdRowReflect>,
+                ::caller<15, T, D, BrdRowReflect>,
+                ::caller<16, T, D, BrdRowReflect>,
+                ::caller<17, T, D, BrdRowReflect>,
+                ::caller<18, T, D, BrdRowReflect>,
+                ::caller<19, T, D, BrdRowReflect>,
+                ::caller<20, T, D, BrdRowReflect>,
+                ::caller<21, T, D, BrdRowReflect>,
+                ::caller<22, T, D, BrdRowReflect>,
+                ::caller<23, T, D, BrdRowReflect>,
+                ::caller<24, T, D, BrdRowReflect>,
+                ::caller<25, T, D, BrdRowReflect>,
+                ::caller<26, T, D, BrdRowReflect>,
+                ::caller<27, T, D, BrdRowReflect>,
+                ::caller<28, T, D, BrdRowReflect>,
+                ::caller<29, T, D, BrdRowReflect>,
+                ::caller<30, T, D, BrdRowReflect>,
+                ::caller<31, T, D, BrdRowReflect>,
+                ::caller<32, T, D, BrdRowReflect>
+            },
+            {
+                0,
+                ::caller< 1, T, D, BrdRowWrap>,
+                ::caller< 2, T, D, BrdRowWrap>,
+                ::caller< 3, T, D, BrdRowWrap>,
+                ::caller< 4, T, D, BrdRowWrap>,
+                ::caller< 5, T, D, BrdRowWrap>,
+                ::caller< 6, T, D, BrdRowWrap>,
+                ::caller< 7, T, D, BrdRowWrap>,
+                ::caller< 8, T, D, BrdRowWrap>,
+                ::caller< 9, T, D, BrdRowWrap>,
+                ::caller<10, T, D, BrdRowWrap>,
+                ::caller<11, T, D, BrdRowWrap>,
+                ::caller<12, T, D, BrdRowWrap>,
+                ::caller<13, T, D, BrdRowWrap>,
+                ::caller<14, T, D, BrdRowWrap>,
+                ::caller<15, T, D, BrdRowWrap>,
+                ::caller<16, T, D, BrdRowWrap>,
+                ::caller<17, T, D, BrdRowWrap>,
+                ::caller<18, T, D, BrdRowWrap>,
+                ::caller<19, T, D, BrdRowWrap>,
+                ::caller<20, T, D, BrdRowWrap>,
+                ::caller<21, T, D, BrdRowWrap>,
+                ::caller<22, T, D, BrdRowWrap>,
+                ::caller<23, T, D, BrdRowWrap>,
+                ::caller<24, T, D, BrdRowWrap>,
+                ::caller<25, T, D, BrdRowWrap>,
+                ::caller<26, T, D, BrdRowWrap>,
+                ::caller<27, T, D, BrdRowWrap>,
+                ::caller<28, T, D, BrdRowWrap>,
+                ::caller<29, T, D, BrdRowWrap>,
+                ::caller<30, T, D, BrdRowWrap>,
+                ::caller<31, T, D, BrdRowWrap>,
+                ::caller<32, T, D, BrdRowWrap>
+            }
+        };
+
+        loadKernel(kernel, ksize, stream);
+
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+    }
+}
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index 77ed46e15..6b7135ab6 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -830,20 +830,14 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter

-namespace cv { namespace gpu { namespace device
+namespace filter
 {
-    namespace row_filter
-    {
-        template <typename T, typename D>
-        void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-    }
+    template <typename T, typename D>
+    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);

-    namespace column_filter
-    {
-        template <typename T, typename D>
-        void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-    }
-}}}
+    template <typename T, typename D>
+    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+}

 namespace
 {
@@ -899,8 +893,6 @@ namespace

 Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
 {
-    using namespace ::cv::gpu::device::row_filter;
-
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};

     if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
@@ -940,28 +932,28 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
     switch (srcType)
     {
     case CV_8UC1:
-        func = linearRowFilter_gpu<uchar, float>;
+        func = filter::linearRow<uchar, float>;
         break;
     case CV_8UC3:
-        func = linearRowFilter_gpu<uchar3, float3>;
+        func = filter::linearRow<uchar3, float3>;
         break;
     case CV_8UC4:
-        func = linearRowFilter_gpu<uchar4, float4>;
+        func = filter::linearRow<uchar4, float4>;
         break;
     case CV_16SC3:
-        func = linearRowFilter_gpu<short3, float3>;
+        func = filter::linearRow<short3, float3>;
         break;
     case CV_32SC1:
-        func = linearRowFilter_gpu<int, float>;
+        func = filter::linearRow<int, float>;
         break;
     case CV_32FC1:
-        func = linearRowFilter_gpu<float, float>;
+        func = filter::linearRow<float, float>;
         break;
     case CV_32FC3:
-        func = linearRowFilter_gpu<float3, float3>;
+        func = filter::linearRow<float3, float3>;
         break;
     case CV_32FC4:
-        func = linearRowFilter_gpu<float4, float4>;
+        func = filter::linearRow<float4, float4>;
         break;
     }
@@ -1020,8 +1012,6 @@ namespace

 Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
 {
-    using namespace ::cv::gpu::device::column_filter;
-
     static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};

     if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
@@ -1061,28 +1051,28 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
     switch (dstType)
     {
     case CV_8UC1:
-        func = linearColumnFilter_gpu<float, uchar>;
+        func = filter::linearColumn<float, uchar>;
         break;
     case CV_8UC3:
-        func = linearColumnFilter_gpu<float3, uchar3>;
+        func = filter::linearColumn<float3, uchar3>;
        break;
     case CV_8UC4:
-        func = linearColumnFilter_gpu<float4, uchar4>;
+        func = filter::linearColumn<float4, uchar4>;
         break;
     case CV_16SC3:
-        func = linearColumnFilter_gpu<float3, short3>;
+        func = filter::linearColumn<float3, short3>;
         break;
     case CV_32SC1:
-        func = linearColumnFilter_gpu<float, int>;
+        func = filter::linearColumn<float, int>;
         break;
     case CV_32FC1:
-        func = linearColumnFilter_gpu<float, float>;
+        func = filter::linearColumn<float, float>;
         break;
     case CV_32FC3:
-        func = linearColumnFilter_gpu<float3, float3>;
+        func = filter::linearColumn<float3, float3>;
         break;
     case CV_32FC4:
-        func = linearColumnFilter_gpu<float4, float4>;
+        func = filter::linearColumn<float4, float4>;
         break;
     }
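The two getLinear*Filter_GPU switches above feed a table-dispatch idiom that recurs throughout these files: a runtime (border type, kernel size) pair indexes a static array of pointers to template instantiations, so the kernel-size loop can be unrolled at compile time. A minimal standalone C++ sketch of that idiom (hypothetical names, not the OpenCV API):

    #include <cstdio>

    // process<KSIZE> stands in for the templated caller<KSIZE, T, D, B>
    // instantiations above: KSIZE is a compile-time constant, so the inner
    // convolution loop can be fully unrolled by the compiler.
    template <int KSIZE>
    void process(const float* kernel)
    {
        float sum = 0.f;
        for (int k = 0; k < KSIZE; ++k)
            sum += kernel[k];
        std::printf("ksize=%d, kernel sum=%g\n", KSIZE, sum);
    }

    typedef void (*caller_t)(const float* kernel);

    int main()
    {
        // Slot 0 is a null placeholder so the table can be indexed by ksize directly.
        static const caller_t callers[4] = { 0, process<1>, process<2>, process<3> };

        const float kernel[3] = { 0.25f, 0.5f, 0.25f };
        const int ksize = 3;    // runtime value
        callers[ksize](kernel); // dispatches to a compile-time specialization
        return 0;
    }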
doesn't reallocate memory, if buffer is small submat of big matrix fixed createContinous according new changes --- modules/core/include/opencv2/core/gpumat.hpp | 16 --------- modules/core/src/gpumat.cpp | 37 ++++++++++++++++++++ modules/gpu/test/test_gpumat.cpp | 34 ++++++++++++++++++ 3 files changed, 71 insertions(+), 16 deletions(-) diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index 2830a9e94..73da7e7a8 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -545,22 +545,6 @@ namespace cv { namespace gpu ensureSizeIsEnough(size.height, size.width, type, m); } - inline void createContinuous(int rows, int cols, int type, GpuMat& m) - { - int area = rows * cols; - if (!m.isContinuous() || m.type() != type || m.size().area() != area) - ensureSizeIsEnough(1, area, type, m); - m = m.reshape(0, rows); - } - - inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) - { - if (m.type() == type && m.rows >= rows && m.cols >= cols) - m = m(Rect(0, 0, cols, rows)); - else - m.create(rows, cols, type); - } - inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) { if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols) diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index fc291a862..afd7115b0 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -704,6 +704,43 @@ cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), re m.download(*this); } +void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m) +{ + int area = rows * cols; + if (m.empty() || m.type() != type || !m.isContinuous() || m.size().area() < area) + m.create(1, area, type); + + m.cols = cols; + m.rows = rows; + m.step = m.elemSize() * cols; + m.flags |= Mat::CONTINUOUS_FLAG; +} + +void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) +{ + if (m.empty() || m.type() != type || m.data != m.datastart) + m.create(rows, cols, type); + else + { + const size_t esz = m.elemSize(); + const ptrdiff_t delta2 = m.dataend - m.datastart; + + const size_t minstep = m.cols * esz; + + Size wholeSize; + wholeSize.height = std::max(static_cast((delta2 - minstep) / m.step + 1), m.rows); + wholeSize.width = std::max(static_cast((delta2 - m.step * (wholeSize.height - 1)) / esz), m.cols); + + if (wholeSize.height < rows || wholeSize.width < cols) + m.create(rows, cols, type); + else + { + m.cols = cols; + m.rows = rows; + } + } +} + namespace { class GpuFuncTable diff --git a/modules/gpu/test/test_gpumat.cpp b/modules/gpu/test/test_gpumat.cpp index 7a4a61623..9d7e545de 100644 --- a/modules/gpu/test/test_gpumat.cpp +++ b/modules/gpu/test/test_gpumat.cpp @@ -324,6 +324,40 @@ INSTANTIATE_TEST_CASE_P(GPU_GpuMat, ConvertTo, testing::Combine( ALL_DEPTH, WHOLE_SUBMAT)); +//////////////////////////////////////////////////////////////////////////////// +// ensureSizeIsEnough + +struct EnsureSizeIsEnough : testing::TestWithParam +{ + virtual void SetUp() + { + cv::gpu::DeviceInfo devInfo = GetParam(); + cv::gpu::setDevice(devInfo.deviceID()); + } +}; + +TEST_P(EnsureSizeIsEnough, BufferReuse) +{ + cv::gpu::GpuMat buffer(100, 100, CV_8U); + cv::gpu::GpuMat old = buffer; + + // don't reallocate memory + cv::gpu::ensureSizeIsEnough(10, 20, CV_8U, buffer); + EXPECT_EQ(10, buffer.rows); + EXPECT_EQ(20, buffer.cols); + EXPECT_EQ(CV_8UC1, buffer.type()); + EXPECT_EQ(reinterpret_cast(old.data), 
reinterpret_cast(buffer.data)); + + // don't reallocate memory + cv::gpu::ensureSizeIsEnough(20, 30, CV_8U, buffer); + EXPECT_EQ(20, buffer.rows); + EXPECT_EQ(30, buffer.cols); + EXPECT_EQ(CV_8UC1, buffer.type()); + EXPECT_EQ(reinterpret_cast(old.data), reinterpret_cast(buffer.data)); +} + +INSTANTIATE_TEST_CASE_P(GPU_GpuMat, EnsureSizeIsEnough, ALL_DEVICES); + } // namespace #endif // HAVE_CUDA From 580d8173e5acc12cc0a5a6997fe8af73efffe619 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 12 Nov 2012 11:54:40 +0400 Subject: [PATCH 065/155] refactor computing of scaling factor --- modules/gpu/src/softcascade.cpp | 53 ++++++--------------------------- 1 file changed, 9 insertions(+), 44 deletions(-) diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index fdde2618e..a69be9239 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -70,6 +70,15 @@ cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale objSize.x = cv::saturate_cast(oct.size.x * relScale); objSize.y = cv::saturate_cast(oct.size.y * relScale); + + // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers + if (fabs(relScale - 1.f) < FLT_EPSILON) + scaling[0] = scaling[1] = 1.f; + else + { + scaling[0] = (relScale < 1.f) ? 0.89f * ::pow(relScale, 1.099f / ::log(2)) : 1.f; + scaling[1] = relScale * relScale; + } } namespace cv { namespace gpu { namespace device { @@ -91,38 +100,6 @@ namespace imgproc { struct cv::gpu::SCascade::Fields { - struct CascadeIntrinsics - { - static const float lambda = 1.099f, a = 0.89f; - - static float getFor(int channel, float scaling) - { - CV_Assert(channel < 10); - - if (fabs(scaling - 1.f) < FLT_EPSILON) - return 1.f; - - // according to R. Benenson, M. Mathias, R. Timofte and L. Van Gool's and Dallal's papers - static const float A[2][2] = - { //channel <= 6, otherwise - { 0.89f, 1.f}, // down - { 1.00f, 1.f} // up - }; - - static const float B[2][2] = - { //channel <= 6, otherwise - { 1.099f / ::log(2), 2.f}, // down - { 0.f, 2.f} // up - }; - - float a = A[(int)(scaling >= 1)][(int)(channel > 6)]; - float b = B[(int)(scaling >= 1)][(int)(channel > 6)]; - - // printf("!!! 
scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b)); - return a * ::pow(scaling, b); - } - }; - static Fields* parseCascade(const FileNode &root, const float mins, const float maxs) { static const char *const SC_STAGE_TYPE = "stageType"; @@ -281,8 +258,6 @@ struct cv::gpu::SCascade::Fields int fit = fitOctave(voctaves, logScale); Level level(fit, voctaves[fit], scale, width, height); - level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale); - level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale); if (!width || !height) break; @@ -294,16 +269,6 @@ struct cv::gpu::SCascade::Fields if (::fabs(scale - maxs) < FLT_EPSILON) break; scale = ::std::min(maxs, ::expf(::log(scale) + logFactor)); - - // std::cout << "level " << sc - // << " octeve " - // << vlevels[sc].octave - // << " relScale " - // << vlevels[sc].relScale - // << " " << vlevels[sc].shrScale - // << " [" << (int)vlevels[sc].objSize.x - // << " " << (int)vlevels[sc].objSize.y << "] [" - // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl; } cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ); From aa92be34d603a154b6e734d3a52a43507ab150e2 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Mon, 12 Nov 2012 16:37:18 +0400 Subject: [PATCH 066/155] GK107 Policy --- modules/gpu/src/cuda/isf-sc.cu | 284 ++++++++++---------------- modules/gpu/src/icf.hpp | 26 ++- modules/gpu/src/softcascade.cpp | 41 ++-- modules/gpu/test/test_softcascade.cpp | 10 +- 4 files changed, 149 insertions(+), 212 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 0de2d8e37..ac4b8f0e8 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -161,192 +161,128 @@ namespace icf { } texture troi; -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 - template - __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, - const int downscales) + +template +template +__device void CascadeInvoker::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const +{ + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x; + + // load Lavel + __shared__ Level level; + + // check POI + __shared__ volatile char roiCache[Policy::STA_Y]; + + if (!threadIdx.y && !threadIdx.x) + ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x); + + __syncthreads(); + + if (!roiCache[threadIdx.y]) return; + + if (!threadIdx.x) + level = levels[downscales + blockIdx.z]; + + if(x >= level.workRect.x || y >= level.workRect.y) return; + + int st = level.octave * level.step; + const int stEnd = st + level.step; + + float confidence = 0.f; + for(; st < stEnd; st += Policy::WARP) { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x; + const int nId = (st + threadIdx.x) * 3; - // load Lavel - __shared__ Level level; + Node node = nodes[nId]; - // check POI - __shared__ volatile char roiCache[8]; - if (!threadIdx.y && !threadIdx.x) - ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x); + float threshold = rescale(level, node); + int sum = get(x, y + (node.threshold >> 28) * 120, node.rect); - __syncthreads(); + int next = 1 + (int)(sum >= threshold); - if (!roiCache[threadIdx.y]) return; + node = nodes[nId + next]; + threshold = rescale(level, node); + sum = get(x, y + (node.threshold >> 28) * 120, 
node.rect); - if (!threadIdx.x) - level = levels[downscales + blockIdx.z]; + const int lShift = (next - 1) * 2 + (int)(sum >= threshold); + float impact = leaves[(st + threadIdx.x) * 4 + lShift]; - if(x >= level.workRect.x || y >= level.workRect.y) return; - - Octave octave = octaves[level.octave]; - int st = octave.index * octave.stages; - const int stEnd = st + 1024; - - float confidence = 0.f; - - for(; st < stEnd; st += 32) - { - - const int nId = (st + threadIdx.x) * 3; - dprintf("\n\n%d: stage: %d %d\n",threadIdx.x, st, nId); - Node node = nodes[nId]; - - float threshold = rescale(level, node); - int sum = get(x, y + (node.threshold >> 28) * 120, node.rect); - - int next = 1 + (int)(sum >= threshold); - dprintf("%d: go: %d (%d >= %f)\n\n" ,threadIdx.x, next, sum, threshold); - - node = nodes[nId + next]; - threshold = rescale(level, node); - sum = get(x, y + (node.threshold >> 28) * 120, node.rect); - - const int lShift = (next - 1) * 2 + (int)(sum >= threshold); - float impact = leaves[(st + threadIdx.x) * 4 + lShift]; - - dprintf("%d: decided: %d (%d >= %f) %d %f\n\n" ,threadIdx.x, next, sum, threshold, lShift, impact); - dprintf("%d: extracted stage: %f\n",threadIdx.x, stages[(st + threadIdx.x)]); - dprintf("%d: computed score: %f\n",threadIdx.x, impact); #pragma unroll - // scan on shuffl functions - for (int i = 1; i < 32; i *= 2) - { - const float n = __shfl_up(impact, i, 32); - - if (threadIdx.x >= i) - impact += n; - } - - dprintf("%d: impact scaned %f\n" ,threadIdx.x, impact); - - confidence += impact; - if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; - } - - if(!threadIdx.x && st == stEnd && ((confidence - FLT_EPSILON) >= 0)) + // scan on shuffl functions + for (int i = 1; i < Policy::WARP; i *= 2) { - int idx = atomicInc(ctr, ndetections); - // store detection - objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), - __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); + const float n = __shfl_up(impact, i, Policy::WARP); + + if (threadIdx.x >= i) + impact += n; } + + confidence += impact; + if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; } -#else - template - __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, Detection* objects, const uint ndetections, uint* ctr, - const int downscales) + + if(!threadIdx.x && st == stEnd && ((confidence - FLT_EPSILON) >= 0)) { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = blockIdx.x * blockDim.x + threadIdx.x; - Level level = levels[blockIdx.z]; - - // if (blockIdx.z != 31) return; - if(x >= level.workRect.x || y >= level.workRect.y) return; - - // int roi = tex2D(troi, x, y); - // printf("%d\n", roi); - // if (!roi) return; - - Octave octave = octaves[level.octave]; - - int st = octave.index * octave.stages; - const int stEnd = st + 1000;//octave.stages; - - float confidence = 0.f; - - for(; st < stEnd; ++st) - { - dprintf("\n\nstage: %d\n", st); - const int nId = st * 3; - Node node = nodes[nId]; - - dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, - node.threshold >> 28, node.threshold & 0x0FFFFFFFU); - - float threshold = rescale(level, node); - int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); - - dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, - node.rect.w, threshold); - - int next = 1 + (int)(sum >= threshold); - dprintf("go: %d (%d >= %f)\n\n" ,next, sum, 
threshold); - - node = nodes[nId + next]; - threshold = rescale(level, node); - sum = get(x, y + (node.threshold >> 28) * 121, node.rect); - - const int lShift = (next - 1) * 2 + (int)(sum >= threshold); - float impact = leaves[st * 4 + lShift]; - confidence += impact; - - if (confidence <= stages[st]) st = stEnd + 10; - dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); - dprintf("extracted stage: %f\n", stages[st]); - dprintf("computed score: %f\n\n", confidence); - } - - if(st == stEnd) - { - int idx = atomicInc(ctr, ndetections); - // store detection - objects[idx] = Detection(__float2int_rn(x * octave.shrinkage), - __float2int_rn(y * octave.shrinkage), level.objSize.x, level.objSize.y, confidence); - } - } -#endif - - template<> - void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const - { - int fw = 160; - int fh = 120; - - dim3 block(32, 8); - dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1); - - uint* ctr = (uint*)(counter.ptr(0)); - Detection* det = (Detection*)objects.ptr(); - uint max_det = objects.cols / sizeof(Detection); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); - - if (scale == -1) - { - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0); - cudaSafeCall( cudaGetLastError()); - - grid = dim3(fw, fh / 8, 47 - downscales); - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales); - } - else - { - if (scale >= downscales) - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); - else - test_kernel_warp<<>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale); - } - - if (!stream) - { - cudaSafeCall( cudaGetLastError()); - cudaSafeCall( cudaDeviceSynchronize()); - } + int idx = atomicInc(ctr, ndetections); + objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE), + __float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence); } } + +template +__global__ void soft_cascade(const CascadeInvoker invoker, Detection* objects, const uint n, uint* ctr, const int downs) +{ + invoker.template detect(objects, n, ctr, downs); +} + +template +void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, + PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const +{ + int fw = 160; + int fh = 120; + + dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? 
downscales : 1); + + uint* ctr = (uint*)(counter.ptr(0)); + Detection* det = (Detection*)objects.ptr(); + uint max_det = objects.cols / sizeof(Detection); + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); + + cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); + + const CascadeInvoker inv = *this; + + if (scale == -1) + { + soft_cascade<<>>(inv, det, max_det, ctr, 0); + cudaSafeCall( cudaGetLastError()); + + grid = dim3(fw, fh / Policy::STA_Y, scales - downscales); + soft_cascade<<>>(inv, det, max_det, ctr, downscales); + } + else + { + if (scale >= downscales) + soft_cascade<<>>(inv, det, max_det, ctr, scale); + else + soft_cascade<<>>(inv, det, max_det, ctr, scale); + } + + if (!stream) + { + cudaSafeCall( cudaGetLastError()); + cudaSafeCall( cudaDeviceSynchronize()); + } +} + +template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, + PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const; + +} }}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 60df55882..8eb080e23 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -72,9 +72,9 @@ struct __align__(16) Octave struct __align__(8) Level //is actually 24 bytes { int octave; + int step; float relScale; - float shrScale; // used for marking detection float scaling[2]; // calculated according to Dollal paper // for 640x480 we can not get overflow @@ -115,31 +115,41 @@ struct __align__(16) Detection : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {}; }; -struct CascadePolicy +struct GK107PolicyX4 { - enum {STA_X = 32, STA_Y = 8}; + enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4}; + static const dim3 block() + { + return dim3(GK107PolicyX4::STA_X, GK107PolicyX4::STA_Y); + } }; template struct CascadeInvoker { - CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {} + CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {} + CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages, const PtrStepSzb& _nodes, const PtrStepSzf& _leaves) - : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()), - nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()) + : levels((const Level*)_levels.ptr()), + stages((const float*)_stages.ptr()), + nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()), + scales(_levels.cols / sizeof(Level)) {} const Level* levels; - const Octave* octaves; - const float* stages; const Node* nodes; const float* leaves; + int scales; + void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const; + + template + __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const; }; } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index a69be9239..6133bd1cb 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -63,7 +63,7 @@ void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); } #include cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, 
const float scale, const int w, const int h) -: octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) +: octave(idx), step(oct.stages), relScale(scale / oct.scale) { workRect.x = round(w / (float)oct.shrinkage); workRect.y = round(h / (float)oct.shrinkage); @@ -100,7 +100,7 @@ namespace imgproc { struct cv::gpu::SCascade::Fields { - static Fields* parseCascade(const FileNode &root, const float mins, const float maxs) + static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals) { static const char *const SC_STAGE_TYPE = "stageType"; static const char *const SC_BOOST = "BOOST"; @@ -119,11 +119,8 @@ struct cv::gpu::SCascade::Fields static const char *const SC_ORIG_W = "width"; static const char *const SC_ORIG_H = "height"; - int origWidth = (int)root[SC_ORIG_W]; - CV_Assert(origWidth == ORIG_OBJECT_WIDTH); - + int origWidth = (int)root[SC_ORIG_W]; int origHeight = (int)root[SC_ORIG_H]; - CV_Assert(origHeight == ORIG_OBJECT_HEIGHT); static const char *const SC_OCTAVES = "octaves"; static const char *const SC_STAGES = "stages"; @@ -142,7 +139,6 @@ struct cv::gpu::SCascade::Fields static const char * const SC_F_CHANNEL = "channel"; static const char * const SC_F_RECT = "rect"; - FileNode fn = root[SC_OCTAVES]; if (fn.empty()) return false; @@ -167,8 +163,8 @@ struct cv::gpu::SCascade::Fields ushort nstages = saturate_cast((int)fns[SC_OCT_STAGES]); ushort2 size; - size.x = cvRound(ORIG_OBJECT_WIDTH * scale); - size.y = cvRound(ORIG_OBJECT_HEIGHT * scale); + size.x = cvRound(origWidth * scale); + size.y = cvRound(origHeight * scale); shrinkage = saturate_cast((int)fns[SC_OCT_SHRINKAGE]); Octave octave(octIndex, nstages, shrinkage, size, scale); @@ -245,11 +241,11 @@ struct cv::gpu::SCascade::Fields CV_Assert(!hleaves.empty()); std::vector vlevels; - float logFactor = (::log(maxs) - ::log(mins)) / (TOTAL_SCALES -1); + float logFactor = (::log(maxs) - ::log(mins)) / (totals -1); float scale = mins; int downscales = 0; - for (int sc = 0; sc < TOTAL_SCALES; ++sc) + for (int sc = 0; sc < totals; ++sc) { int width = ::std::max(0.0f, FRAME_WIDTH - (origWidth * scale)); int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale)); @@ -302,7 +298,7 @@ struct cv::gpu::SCascade::Fields leaves.upload(hleaves); levels.upload(hlevels); - invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); + invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); } @@ -456,16 +452,13 @@ public: GpuMat sobelBuf; - device::icf::CascadeInvoker invoker; + device::icf::CascadeInvoker invoker; enum { BOOST = 0 }; enum { FRAME_WIDTH = 640, FRAME_HEIGHT = 480, - TOTAL_SCALES = 55, - ORIG_OBJECT_WIDTH = 64, - ORIG_OBJECT_HEIGHT = 128, HOG_BINS = 6, LUV_BINS = 3, HOG_LUV_BINS = 10 @@ -480,21 +473,19 @@ cv::gpu::SCascade::~SCascade() { delete fields; } bool cv::gpu::SCascade::load(const FileNode& fn) { if (fields) delete fields; - fields = Fields::parseCascade(fn, minScale, maxScale); + fields = Fields::parseCascade(fn, minScale, maxScale, scales); return fields != 0; } void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const { + CV_Assert(fields); + const GpuMat colored = image.getGpuMat(); // only color images are supperted CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1); GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); - - // we guess user knows about shrincage - // CV_Assert((rois.size().width == getRoiSize().height) && 
(rois.type() == CV_8UC1)); - Fields& flds = *fields; if (colored.type() == CV_8UC3) @@ -518,15 +509,13 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const { + CV_Assert(fields); + const GpuMat colored = image.getGpuMat(); // only color images are supperted CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1); - // we guess user knows about shrincage - // CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1)); - Fields& flds = *fields; - if (colored.type() == CV_8UC3) { // only this window size allowed @@ -549,6 +538,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const { + CV_Assert(fields); + const GpuMat roi = _roi.getGpuMat(); _mask.create( roi.cols / 4, roi.rows / 4, roi.type() ); GpuMat mask = _mask.getGpuMat(); diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index cfae940c7..7034b33b9 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -172,7 +172,7 @@ GPU_TEST_P(SCascadeTestRoi, detect, sub.setTo(1); cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1); } - + objectBoxes.setTo(0); cascade.genRoi(rois, trois); cascade.detect(colored, trois, objectBoxes); @@ -222,7 +222,7 @@ GPU_TEST_P(SCascadeTestLevel, detect, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois); - + objectBoxes.setTo(0); int level = GET_PARAM(3); cascade.detect(colored, trois, objectBoxes, level); @@ -281,7 +281,7 @@ GPU_TEST_P(SCascadeTestAll, detect, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois); - + objectBoxes.setTo(0); cascade.detect(colored, trois, objectBoxes); typedef cv::gpu::SCascade::Detection Detection; @@ -321,7 +321,7 @@ GPU_TEST_P(SCascadeTestAll, detectOnIntegral, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois); - + objectBoxes.setTo(0); cascade.detect(hogluv, trois, objectBoxes); typedef cv::gpu::SCascade::Detection Detection; @@ -357,7 +357,7 @@ GPU_TEST_P(SCascadeTestAll, detectStream, cv::gpu::GpuMat trois; cascade.genRoi(rois, trois, s); - + objectBoxes.setTo(0); cascade.detect(colored, trois, objectBoxes, s); cudaDeviceSynchronize(); From 08910e81af95dd2004930845e5f206c2b9368aac Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 14 Nov 2012 12:40:44 +0400 Subject: [PATCH 067/155] integrate pre-Kepler architectures --- modules/gpu/src/cuda/isf-sc.cu | 15 +++++++++++++++ modules/gpu/src/softcascade.cpp | 16 +++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index ac4b8f0e8..b6c87e17b 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -209,6 +209,7 @@ __device void CascadeInvoker::detect(Detection* objects, const uint ndet const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[(st + threadIdx.x) * 4 + lShift]; +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 #pragma unroll // scan on shuffl functions for (int i = 1; i < Policy::WARP; i *= 2) @@ -218,7 +219,21 @@ __device void CascadeInvoker::detect(Detection* objects, const uint ndet if (threadIdx.x >= i) impact += n; } +#else + __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y]; + const int idx = threadIdx.y * Policy::STA_X + threadIdx.x; + + ptr[idx] = impact; + + 
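+        // The five guarded adds below form a Hillis-Steele inclusive scan
+        // across the 32 lanes of a warp row: after the step with offset k,
+        // lane i holds the sum of (up to) the 2*k inputs ending at lane i,
+        // so after offsets 1, 2, 4, 8 and 16 every lane i holds the sum of
+        // lanes 0..i. Correctness relies on warp-synchronous execution (no
+        // __syncthreads between steps) and on ptr being volatile shared
+        // memory; this is the usual pre-Kepler idiom for the __shfl_up scan
+        // used on 3.x devices above.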
if ( threadIdx.x >= 1) ptr [idx ] = (ptr [idx - 1] + ptr [idx]); + if ( threadIdx.x >= 2) ptr [idx ] = (ptr [idx - 2] + ptr [idx]); + if ( threadIdx.x >= 4) ptr [idx ] = (ptr [idx - 4] + ptr [idx]); + if ( threadIdx.x >= 8) ptr [idx ] = (ptr [idx - 8] + ptr [idx]); + if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]); + + impact = ptr[idx]; +#endif confidence += impact; if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048; } diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 6133bd1cb..c5bcbedb5 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -298,14 +298,14 @@ struct cv::gpu::SCascade::Fields leaves.upload(hleaves); levels.upload(hlevels); - invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); - } void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const { cudaMemset(count.data, 0, sizeof(Detection)); cudaSafeCall( cudaGetLastError()); + device::icf::CascadeInvoker invoker + = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); invoker(roi, hogluv, objects, count, downscales, scale, stream); } @@ -407,8 +407,14 @@ private: GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS)); cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s); - cudaStream_t stream = StreamAccessor::getStream(s); - device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream); + + if (info.majorVersion() < 3) + cv::gpu::integralBuffered(shrunk, hogluv, integralBuffer, s); + else + { + cudaStream_t stream = StreamAccessor::getStream(s); + device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream); + } } public: @@ -452,7 +458,7 @@ public: GpuMat sobelBuf; - device::icf::CascadeInvoker invoker; + DeviceInfo info; enum { BOOST = 0 }; enum From 72e2b8b370c10024adb05a46230fbac782fbeee4 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 14 Nov 2012 14:00:02 +0400 Subject: [PATCH 068/155] remove size constraints of input frame --- modules/gpu/src/cuda/isf-sc.cu | 4 +- modules/gpu/src/softcascade.cpp | 97 ++++++++++++++++++--------------- 2 files changed, 56 insertions(+), 45 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index b6c87e17b..7f7a10e92 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -256,8 +256,8 @@ template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const { - int fw = 160; - int fh = 120; + int fw = roi.rows; + int fh = roi.cols; dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? 
downscales : 1); diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index c5bcbedb5..ad6e00027 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -240,15 +240,26 @@ struct cv::gpu::SCascade::Fields cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1)); CV_Assert(!hleaves.empty()); - std::vector vlevels; - float logFactor = (::log(maxs) - ::log(mins)) / (totals -1); + Fields* fields = new Fields(mins, maxs, totals, origWidth, origHeight, shrinkage, 0, + hoctaves, hstages, hnodes, hleaves); + fields->voctaves = voctaves; + fields->createLevels(FRAME_HEIGHT, FRAME_WIDTH); - float scale = mins; - int downscales = 0; + return fields; + } + + int createLevels(const int fh, const int fw) + { + using namespace device::icf; + std::vector vlevels; + float logFactor = (::log(maxScale) - ::log(minScale)) / (totals -1); + + float scale = minScale; + int dcs = 0; for (int sc = 0; sc < totals; ++sc) { - int width = ::std::max(0.0f, FRAME_WIDTH - (origWidth * scale)); - int height = ::std::max(0.0f, FRAME_HEIGHT - (origHeight * scale)); + int width = ::std::max(0.0f, fw - (origObjWidth * scale)); + int height = ::std::max(0.0f, fh - (origObjHeight * scale)); float logScale = ::log(scale); int fit = fitOctave(voctaves, logScale); @@ -260,44 +271,44 @@ struct cv::gpu::SCascade::Fields else { vlevels.push_back(level); - if (voctaves[fit].scale < 1) ++downscales; + if (voctaves[fit].scale < 1) ++dcs; } - if (::fabs(scale - maxs) < FLT_EPSILON) break; - scale = ::std::min(maxs, ::expf(::log(scale) + logFactor)); + if (::fabs(scale - maxScale) < FLT_EPSILON) break; + scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor)); } - cv::Mat hlevels(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ); + cv::Mat hlevels = cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) ); CV_Assert(!hlevels.empty()); - - Fields* fields = new Fields(mins, maxs, origWidth, origHeight, shrinkage, downscales, - hoctaves, hstages, hnodes, hleaves, hlevels); - - return fields; + levels.upload(hlevels); + downscales = dcs; + return dcs; } - Fields( const float mins, const float maxs, const int ow, const int oh, const int shr, const int ds, - cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves, cv::Mat hlevels) - : minScale(mins), maxScale(maxs), origObjWidth(ow), origObjHeight(oh), shrinkage(shr), downscales(ds) + bool update(int fh, int fw, int shr) { - plane.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); - fplane.create(FRAME_HEIGHT * 6, FRAME_WIDTH, CV_32FC1); - luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); + if (fh == luv.rows && fh == luv.cols) return false; + plane.create(fh * (HOG_LUV_BINS + 1), fw, CV_8UC1); + fplane.create(fh * HOG_BINS, fw, CV_32FC1); + luv.create(fh, fw, CV_8UC3); - shrunk.create(FRAME_HEIGHT / shr * HOG_LUV_BINS, FRAME_WIDTH / shr, CV_8UC1); + shrunk.create(fh / shr * HOG_LUV_BINS, fw / shr, CV_8UC1); integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1); - hogluv.create((FRAME_HEIGHT / shr) * HOG_LUV_BINS + 1, FRAME_WIDTH / shr + 1, CV_32SC1); + hogluv.create((fh / shr) * HOG_LUV_BINS + 1, fw / shr + 1, CV_32SC1); hogluv.setTo(cv::Scalar::all(0)); + return true; + } - detCounter.create(sizeof(Detection) / sizeof(int),1, CV_32SC1); - + Fields( const float mins, const float maxs, const int tts, const int ow, const int oh, const int shr, const int ds, + cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves) + : minScale(mins), maxScale(maxs), totals(tts), 
origObjWidth(ow), origObjHeight(oh), shrinkage(shr), downscales(ds) + { + update(FRAME_HEIGHT, FRAME_WIDTH, shr); octaves.upload(hoctaves); stages.upload(hstages); nodes.upload(hnodes); leaves.upload(hleaves); - levels.upload(hlevels); - } void detect(int scale, const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const @@ -316,8 +327,8 @@ struct cv::gpu::SCascade::Fields else cudaMemset(plane.data, 0, plane.step * plane.rows); - static const int fw = Fields::FRAME_WIDTH; - static const int fh = Fields::FRAME_HEIGHT; + const int fw = colored.cols; + const int fh = colored.rows; GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh)); cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY, s); @@ -325,7 +336,7 @@ struct cv::gpu::SCascade::Fields createLuvBins(colored, s); - integrate(s); + integrate(fh, fw, s); } private: @@ -352,8 +363,8 @@ private: void createHogBins(const cv::gpu::GpuMat& gray, Stream& s) { - static const int fw = Fields::FRAME_WIDTH; - static const int fh = Fields::FRAME_HEIGHT; + static const int fw = gray.cols; + static const int fh = gray.rows; GpuMat dfdx(fplane, cv::Rect(0, 0, fw, fh)); GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh)); @@ -386,8 +397,8 @@ private: void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s) { - static const int fw = Fields::FRAME_WIDTH; - static const int fh = Fields::FRAME_HEIGHT; + static const int fw = colored.cols; + static const int fh = colored.rows; cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s); @@ -400,11 +411,8 @@ private: cv::gpu::split(luv, splited, s); } - void integrate( Stream& s) + void integrate(const int fh, const int fw, Stream& s) { - int fw = Fields::FRAME_WIDTH; - int fh = Fields::FRAME_HEIGHT; - GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS)); cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s); @@ -423,6 +431,8 @@ public: float minScale; float maxScale; + int totals; + int origObjWidth; int origObjHeight; @@ -447,8 +457,6 @@ public: // 161x121x10 GpuMat hogluv; - GpuMat detCounter; - // Cascade from xml GpuMat octaves; GpuMat stages; @@ -458,6 +466,8 @@ public: GpuMat sobelBuf; + std::vector voctaves; + DeviceInfo info; enum { BOOST = 0 }; @@ -488,6 +498,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ CV_Assert(fields); const GpuMat colored = image.getGpuMat(); + // only color images are supperted CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1); @@ -496,8 +507,8 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ if (colored.type() == CV_8UC3) { - // only this window size allowed - CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT); + if (!flds.update(colored.rows, colored.cols, flds.shrinkage)) + flds.createLevels(colored.rows, colored.cols); flds.preprocess(colored, s); } else @@ -525,7 +536,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ if (colored.type() == CV_8UC3) { // only this window size allowed - CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT); + // CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT); flds.preprocess(colored, s); } else From 8acfbde68e5b9c4bd6640a21c384c3fff8fd28aa Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 14 Nov 2012 14:21:22 +0400 Subject: [PATCH 069/155] remove debug detect at scale method --- 
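(Aside, not part of the patch series: the scale ladder built by Fields::createLevels in the patch above is a plain geometric progression between minScale and maxScale, stepped uniformly in log-space. A minimal host-side sketch of the same stepping, using the default values later chosen by the sample application in this series:)

    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        // Defaults taken from the sample application added later in this series.
        const float minScale = 0.4f, maxScale = 5.0f;
        const int totals = 55; // number of pyramid levels

        // Same stepping as Fields::createLevels:
        // scale_k = minScale * exp(k * logFactor).
        const float logFactor = (std::log(maxScale) - std::log(minScale)) / (totals - 1);

        float scale = minScale;
        for (int sc = 0; sc < totals; ++sc)
        {
            std::printf("level %2d: scale %f\n", sc, scale);
            if (std::fabs(scale - maxScale) < FLT_EPSILON) break;
            scale = std::min(maxScale, std::exp(std::log(scale) + logFactor));
        }
        return 0;
    }

(Each consecutive level differs by the constant factor exp(logFactor), so all levels span minScale to maxScale with uniform relative spacing.)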
modules/gpu/doc/object_detection.rst | 4 -- modules/gpu/include/opencv2/gpu/gpu.hpp | 2 - modules/gpu/src/cuda/isf-sc.cu | 37 +++--------- modules/gpu/src/icf.hpp | 5 +- modules/gpu/src/softcascade.cpp | 35 +----------- modules/gpu/test/test_softcascade.cpp | 76 ++++++++++++------------- 6 files changed, 53 insertions(+), 106 deletions(-) diff --git a/modules/gpu/doc/object_detection.rst b/modules/gpu/doc/object_detection.rst index 64348717c..c503d93fe 100644 --- a/modules/gpu/doc/object_detection.rst +++ b/modules/gpu/doc/object_detection.rst @@ -248,7 +248,6 @@ Implementation of soft (stageless) cascaded detector. :: virtual ~SCascade(); virtual bool load(const FileNode& fn); virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; - virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const; void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const; }; @@ -292,7 +291,6 @@ SCascade::detect Apply cascade to an input frame and return the vector of Decection objcts. .. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const -.. ocv:function:: void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const :param image: a frame on which detector will be applied. @@ -302,8 +300,6 @@ Apply cascade to an input frame and return the vector of Decection objcts. :param stream: a high-level CUDA stream abstraction used for asynchronous execution. - :param level: used for execution cascade on specific scales pyramid level. - SCascade::genRoi -------------------------- diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 9a43760f9..db228a69b 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1577,9 +1577,7 @@ public: // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection) // The first element of the matrix is actually a count of detections. // Param stream is stream is a high-level CUDA stream abstraction used for asynchronous execution - // Param level used for execution cascade on specific scales pyramid level. virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const; - virtual void detect(InputArray image, InputArray rois, OutputArray objects, const int level, Stream& stream = Stream::Null()) const; // Convert ROI matrix into the suitable for detect method. // Param roi is an input matrix of the same size as the image. diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 7f7a10e92..3d3536683 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -45,15 +45,6 @@ #include #include -// #define LOG_CUDA_CASCADE - -#if defined LOG_CUDA_CASCADE -# define dprintf(format, ...) \ - do { printf(format, __VA_ARGS__); } while (0) -#else -# define dprintf(format, ...) 
-#endif - namespace cv { namespace gpu { namespace device { namespace icf { @@ -254,12 +245,12 @@ __global__ void soft_cascade(const CascadeInvoker invoker, Detection* ob template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const + PtrStepSz objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const { int fw = roi.rows; int fh = roi.cols; - dim3 grid(fw, fh / Policy::STA_Y, (scale == -1) ? downscales : 1); + dim3 grid(fw, fh / Policy::STA_Y, downscales); uint* ctr = (uint*)(counter.ptr(0)); Detection* det = (Detection*)objects.ptr(); @@ -268,26 +259,16 @@ void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); + cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / Policy::STA_Y, roi.rows, roi.step)); const CascadeInvoker inv = *this; - if (scale == -1) - { - soft_cascade<<>>(inv, det, max_det, ctr, 0); - cudaSafeCall( cudaGetLastError()); + soft_cascade<<>>(inv, det, max_det, ctr, 0); + cudaSafeCall( cudaGetLastError()); - grid = dim3(fw, fh / Policy::STA_Y, scales - downscales); - soft_cascade<<>>(inv, det, max_det, ctr, downscales); - } - else - { - if (scale >= downscales) - soft_cascade<<>>(inv, det, max_det, ctr, scale); - else - soft_cascade<<>>(inv, det, max_det, ctr, scale); - } + grid = dim3(fw, fh / Policy::STA_Y, scales - downscales); + soft_cascade<<>>(inv, det, max_det, ctr, downscales); if (!stream) { @@ -297,7 +278,7 @@ void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& } template void CascadeInvoker::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, - PtrStepSz objects, PtrStepSzi counter, const int downscales, const int scale, const cudaStream_t& stream) const; + PtrStepSz objects, PtrStepSzi counter, const int downscales, const cudaStream_t& stream) const; } }}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 8eb080e23..2bbbb64d2 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -118,9 +118,10 @@ struct __align__(16) Detection struct GK107PolicyX4 { enum {WARP = 32, STA_X = WARP, STA_Y = 8, SHRINKAGE = 4}; + typedef float2 roi_type; static const dim3 block() { - return dim3(GK107PolicyX4::STA_X, GK107PolicyX4::STA_Y); + return dim3(STA_X, STA_Y); } }; @@ -146,7 +147,7 @@ struct CascadeInvoker int scales; void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz objects, - PtrStepSzi counter, const int downscales, const int csale = -1, const cudaStream_t& stream = 0) const; + PtrStepSzi counter, const int downscales, const cudaStream_t& stream = 0) const; template __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const; diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index ad6e00027..5da3abf53 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -311,13 +311,13 @@ struct cv::gpu::SCascade::Fields leaves.upload(hleaves); } - void detect(int scale, const cv::gpu::GpuMat& roi, 
const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const + void detect(const cv::gpu::GpuMat& roi, const cv::gpu::GpuMat& count, cv::gpu::GpuMat& objects, const cudaStream_t& stream) const { cudaMemset(count.data, 0, sizeof(Detection)); cudaSafeCall( cudaGetLastError()); device::icf::CascadeInvoker invoker = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); - invoker(roi, hogluv, objects, count, downscales, scale, stream); + invoker(roi, hogluv, objects, count, downscales, stream); } void preprocess(const cv::gpu::GpuMat& colored, Stream& s) @@ -521,36 +521,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1)); cudaStream_t stream = StreamAccessor::getStream(s); - flds.detect(-1, rois, tmp, objects, stream); -} - -void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, const int level, Stream& s) const -{ - CV_Assert(fields); - - const GpuMat colored = image.getGpuMat(); - // only color images are supperted - CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1); - - Fields& flds = *fields; - if (colored.type() == CV_8UC3) - { - // only this window size allowed - // CV_Assert(colored.cols == Fields::FRAME_WIDTH && colored.rows == Fields::FRAME_HEIGHT); - flds.preprocess(colored, s); - } - else - { - colored.copyTo(flds.hogluv); - } - - GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat(); - - GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); - objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1)); - cudaStream_t stream = StreamAccessor::getStream(s); - - flds.detect(level, rois, tmp, objects, stream); + flds.detect(rois, tmp, objects, stream); } void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 7034b33b9..e36c28904 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -195,55 +195,55 @@ GPU_TEST_P(SCascadeTestRoi, detect, } -typedef ::testing::TestWithParam > SCascadeTestLevel; -GPU_TEST_P(SCascadeTestLevel, detect, - testing::Combine( - ALL_DEVICES, - testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), - testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), - testing::Range(0, 47) - )) -{ - cv::gpu::setDevice(GET_PARAM(0).deviceID()); +// typedef ::testing::TestWithParam > SCascadeTestLevel; +// GPU_TEST_P(SCascadeTestLevel, detect, +// testing::Combine( +// ALL_DEVICES, +// testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")), +// testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")), +// testing::Range(0, 47) +// )) +// { +// cv::gpu::setDevice(GET_PARAM(0).deviceID()); - cv::gpu::SCascade cascade; +// cv::gpu::SCascade cascade; - cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); - ASSERT_TRUE(fs.isOpened()); +// cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ); +// ASSERT_TRUE(fs.isOpened()); - ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); +// ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode())); - cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2)); - ASSERT_FALSE(coloredCpu.empty()); +// cv::Mat coloredCpu 
= cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2)); +// ASSERT_FALSE(coloredCpu.empty()); - typedef cv::gpu::SCascade::Detection Detection; - GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(colored.size(), CV_8UC1); - rois.setTo(1); +// typedef cv::gpu::SCascade::Detection Detection; +// GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(Detection), CV_8UC1), rois(colored.size(), CV_8UC1); +// rois.setTo(1); - cv::gpu::GpuMat trois; - cascade.genRoi(rois, trois); - objectBoxes.setTo(0); - int level = GET_PARAM(3); - cascade.detect(colored, trois, objectBoxes, level); +// cv::gpu::GpuMat trois; +// cascade.genRoi(rois, trois); +// objectBoxes.setTo(0); +// int level = GET_PARAM(3); +// cascade.detect(colored, trois, objectBoxes, level); - cv::Mat dt(objectBoxes); +// cv::Mat dt(objectBoxes); - Detection* dts = ((Detection*)dt.data) + 1; - int* count = dt.ptr(0); +// Detection* dts = ((Detection*)dt.data) + 1; +// int* count = dt.ptr(0); - cv::Mat result(coloredCpu); +// cv::Mat result(coloredCpu); - printTotal(std::cout, *count); - for (int i = 0; i < *count; ++i) - { - Detection d = dts[i]; - print(std::cout, d); - cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); - } +// printTotal(std::cout, *count); +// for (int i = 0; i < *count; ++i) +// { +// Detection d = dts[i]; +// print(std::cout, d); +// cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1); +// } - writeResult(result, level); - SHOW(result); -} +// writeResult(result, level); +// SHOW(result); +// } TEST(SCascadeTest, readCascade) { From a30bbda3bddfa55c1e6261e1d872c262f69deb43 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 14 Nov 2012 14:36:59 +0400 Subject: [PATCH 070/155] remove hardcoded values --- modules/gpu/src/cuda/isf-sc.cu | 5 +++-- modules/gpu/src/icf.hpp | 2 +- modules/gpu/src/softcascade.cpp | 11 ++++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 3d3536683..a4496bf67 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -181,6 +181,7 @@ __device void CascadeInvoker::detect(Detection* objects, const uint ndet int st = level.octave * level.step; const int stEnd = st + level.step; + const int hogluvStep = gridDim.y * Policy::STA_Y; float confidence = 0.f; for(; st < stEnd; st += Policy::WARP) { @@ -189,13 +190,13 @@ __device void CascadeInvoker::detect(Detection* objects, const uint ndet Node node = nodes[nId]; float threshold = rescale(level, node); - int sum = get(x, y + (node.threshold >> 28) * 120, node.rect); + int sum = get(x, y + (node.threshold >> 28) * hogluvStep, node.rect); int next = 1 + (int)(sum >= threshold); node = nodes[nId + next]; threshold = rescale(level, node); - sum = get(x, y + (node.threshold >> 28) * 120, node.rect); + sum = get(x, y + (node.threshold >> 28) * hogluvStep, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[(st + threadIdx.x) * 4 + lShift]; diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 2bbbb64d2..454ac30da 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -130,7 +130,7 @@ struct CascadeInvoker { CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {} - CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages, + CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages, const 
PtrStepSzb& _nodes, const PtrStepSzf& _leaves) : levels((const Level*)_levels.ptr()), stages((const float*)_stages.ptr()), diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 5da3abf53..038654225 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -316,7 +316,7 @@ struct cv::gpu::SCascade::Fields cudaMemset(count.data, 0, sizeof(Detection)); cudaSafeCall( cudaGetLastError()); device::icf::CascadeInvoker invoker - = device::icf::CascadeInvoker(levels, octaves, stages, nodes, leaves); + = device::icf::CascadeInvoker(levels, stages, nodes, leaves); invoker(roi, hogluv, objects, count, downscales, stream); } @@ -414,7 +414,7 @@ private: void integrate(const int fh, const int fw, Stream& s) { GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS)); - cv::gpu::resize(channels, shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA, s); + cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s); if (info.majorVersion() < 3) cv::gpu::integralBuffered(shrunk, hogluv, integralBuffer, s); @@ -518,7 +518,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); - objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1)); + objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1)); cudaStream_t stream = StreamAccessor::getStream(s); flds.detect(rois, tmp, objects, stream); @@ -527,13 +527,14 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const { CV_Assert(fields); + int shr = (*fields).shrinkage; const GpuMat roi = _roi.getGpuMat(); - _mask.create( roi.cols / 4, roi.rows / 4, roi.type() ); + _mask.create( roi.cols / shr, roi.rows / shr, roi.type() ); GpuMat mask = _mask.getGpuMat(); cv::gpu::GpuMat tmp; - cv::gpu::resize(roi, tmp, cv::Size(), 0.25, 0.25, CV_INTER_AREA, stream); + cv::gpu::resize(roi, tmp, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, stream); cv::gpu::transpose(tmp, mask, stream); } From 781c04324eab9537dc3ddb0b01f75975990b8e14 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Wed, 14 Nov 2012 14:47:00 +0400 Subject: [PATCH 071/155] refactor: PrefixSum --- modules/gpu/src/cuda/isf-sc.cu | 60 ++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index a4496bf67..27d60e637 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -79,6 +79,39 @@ namespace icf { } } + template + struct PrefixSum + { + __device static void apply(float& impact) + { + #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 + #pragma unroll + // scan on shuffl functions + for (int i = 1; i < Policy::WARP; i *= 2) + { + const float n = __shfl_up(impact, i, Policy::WARP); + + if (threadIdx.x >= i) + impact += n; + } + #else + __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y]; + + const int idx = threadIdx.y * Policy::STA_X + threadIdx.x; + + ptr[idx] = impact; + + if ( threadIdx.x >= 1) ptr [idx ] = (ptr [idx - 1] + ptr [idx]); + if ( threadIdx.x >= 2) ptr [idx ] = (ptr [idx - 2] + ptr [idx]); + if ( threadIdx.x >= 4) ptr [idx ] = (ptr [idx - 4] + ptr [idx]); + if ( threadIdx.x >= 8) ptr [idx ] = (ptr [idx - 8] + ptr [idx]); + if ( threadIdx.x >= 16) ptr [idx ] = (ptr 
[idx - 16] + ptr [idx]);
+
+            impact = ptr[idx];
+        #endif
+        }
+    };
+
 texture<int, cudaTextureType2D, cudaReadModeElementType> thogluv;

 template<typename Policy>
@@ -201,32 +234,9 @@ __device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndet
         const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
         float impact = leaves[(st + threadIdx.x) * 4 + lShift];

-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
-#pragma unroll
-        // scan on shuffle functions
-        for (int i = 1; i < Policy::WARP; i *= 2)
-        {
-            const float n = __shfl_up(impact, i, Policy::WARP);
-
-            if (threadIdx.x >= i)
-                impact += n;
-        }
-#else
-        __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y];
-
-        const int idx = threadIdx.y * Policy::STA_X + threadIdx.x;
-
-        ptr[idx] = impact;
-
-        if ( threadIdx.x >= 1) ptr [idx ] = (ptr [idx - 1] + ptr [idx]);
-        if ( threadIdx.x >= 2) ptr [idx ] = (ptr [idx - 2] + ptr [idx]);
-        if ( threadIdx.x >= 4) ptr [idx ] = (ptr [idx - 4] + ptr [idx]);
-        if ( threadIdx.x >= 8) ptr [idx ] = (ptr [idx - 8] + ptr [idx]);
-        if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]);
-
-        impact = ptr[idx];
-#endif
+        PrefixSum::apply(impact);

         confidence += impact;
+
         if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
     }

From ee4f003e725cea21690d361f03c2596304d28720 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Wed, 14 Nov 2012 14:49:38 +0400
Subject: [PATCH 072/155] fixed typo

---
 modules/gpu/src/cuda/{isf-sc.cu => icf-sc.cu} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename modules/gpu/src/cuda/{isf-sc.cu => icf-sc.cu} (100%)

diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu
similarity index 100%
rename from modules/gpu/src/cuda/isf-sc.cu
rename to modules/gpu/src/cuda/icf-sc.cu

From c3e4a52fbe44ae5bf2f754e922a2e932e3d20a28 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Wed, 14 Nov 2012 20:11:18 +0400
Subject: [PATCH 073/155] soft cascade sample

---
 modules/gpu/src/softcascade.cpp |   4 +-
 samples/gpu/softcascade.cpp     | 106 ++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 samples/gpu/softcascade.cpp

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 038654225..7f5221f37 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -287,7 +287,8 @@ struct cv::gpu::SCascade::Fields

     bool update(int fh, int fw, int shr)
     {
-        if (fh == luv.rows && fh == luv.cols) return false;
+        if ((fh == luv.rows) && (fw == luv.cols)) return false;
+
         plane.create(fh * (HOG_LUV_BINS + 1), fw, CV_8UC1);
         fplane.create(fh * HOG_BINS, fw, CV_32FC1);
         luv.create(fh, fw, CV_8UC3);
@@ -297,6 +298,7 @@ struct cv::gpu::SCascade::Fields
         hogluv.create((fh / shr) * HOG_LUV_BINS + 1, fw / shr + 1, CV_32SC1);
         hogluv.setTo(cv::Scalar::all(0));

+        return true;
     }

diff --git a/samples/gpu/softcascade.cpp b/samples/gpu/softcascade.cpp
new file mode 100644
index 000000000..3c08fdb1c
--- /dev/null
+++ b/samples/gpu/softcascade.cpp
@@ -0,0 +1,106 @@
+#include <iostream>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/gpu/gpu.hpp>
+
+int main(int argc, char** argv)
+{
+    const std::string keys =
+        "{help h usage ? |     | print this message }"
+        "{cascade c      |     | path to configuration xml }"
+        "{frames f       |     | path to the frame source (video or image sequence) }"
+        "{min_scale      |0.4f | minimum scale of the detection pyramid }"
+        "{max_scale      |5.0f | maximum scale of the detection pyramid }"
+        "{total_scales   |55   | number of scales in the detection pyramid }"
+        "{device d       |0    | CUDA device to use }"
+        ;
+
+    cv::CommandLineParser parser(argc, argv, keys);
+    parser.about("Soft cascade GPU detection sample.");
+
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 1;
+    }
+
+    cv::gpu::setDevice(parser.get<int>("device"));
+
+    std::string cascadePath = parser.get<std::string>("cascade");
+
+    cv::FileStorage fs(cascadePath, cv::FileStorage::READ);
+    if(!fs.isOpened())
+    {
+        std::cout << "Soft Cascade file " << cascadePath << " can't be opened." << std::endl << std::flush;
+        return 1;
+    }
+
+    std::cout << "Read cascade from file " << cascadePath << std::endl;
+
+    float minScale = parser.get<float>("min_scale");
+    float maxScale = parser.get<float>("max_scale");
+    int scales = parser.get<int>("total_scales");
+
+    using cv::gpu::SCascade;
+    SCascade cascade(minScale, maxScale, scales);
+
+    if (!cascade.load(fs.getFirstTopLevelNode()))
+    {
+        std::cout << "Soft Cascade can't be parsed." << std::endl << std::flush;
+        return 1;
+    }
+
+    std::string frames = parser.get<std::string>("frames");
+    cv::VideoCapture capture(frames);
+    if(!capture.isOpened())
+    {
+        std::cout << "Frame source " << frames << " can't be opened." << std::endl << std::flush;
+        return 1;
+    }
+
+    cv::gpu::GpuMat objects(1, sizeof(SCascade::Detection) * 10000, CV_8UC1);
+    cv::gpu::printShortCudaDeviceInfo(parser.get<int>("device"));
+
+    for (;;)
+    {
+        cv::Mat frame;
+        if (!capture.read(frame))
+        {
+            std::cout << "Nothing to read. " << std::endl << std::flush;
+            return 0;
+        }
+
+        cv::gpu::GpuMat dframe(frame), roi(frame.rows, frame.cols, CV_8UC1), trois;
+        roi.setTo(cv::Scalar::all(1));
+        cascade.genRoi(roi, trois);
+        cascade.detect(dframe, trois, objects);
+
+        cv::Mat dt(objects);
+        typedef cv::gpu::SCascade::Detection Detection;
+
+        Detection* dts = ((Detection*)dt.data) + 1;
+        int* count = dt.ptr<int>(0);
+
+        std::cout << *count << std::endl;
+
+        cv::Mat result;
+        frame.copyTo(result);
+
+        for (int i = 0; i < *count; ++i)
+        {
+            Detection d = dts[i];
+            cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
+        }
+
+        std::cout << "working..." << std::endl;
+        cv::imshow("Soft Cascade demo", result);
+        cv::waitKey(10);
+    }
+
+    return 0;
+}
\ No newline at end of file

From 0865227049b465bc61dc104627dbbf50611f3436 Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Wed, 14 Nov 2012 20:28:08 +0400
Subject: [PATCH 074/155] check if scaling values changed

---
 modules/gpu/src/softcascade.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index 7f5221f37..bf543150b 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -248,6 +248,17 @@ struct cv::gpu::SCascade::Fields
         return fields;
     }

+    bool check(float mins, float maxs, int scales)
+    {
+        bool updated = (minScale == mins) || (maxScale == maxs) || (totals == scales);
+
+        minScale = mins;
+        maxScale = maxs;
+        totals = scales;
+
+        return updated;
+    }
+
     int createLevels(const int fh, const int fw)
     {
         using namespace device::icf;
@@ -509,7 +520,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _

         if (colored.type() == CV_8UC3)
         {
-            if (!flds.update(colored.rows, colored.cols, flds.shrinkage))
+            if (!flds.update(colored.rows, colored.cols, flds.shrinkage) || flds.check(minScale, maxScale, scales))
                 flds.createLevels(colored.rows, colored.cols);
             flds.preprocess(colored, s);
         }

From a9f10e5cadeb64bf956e5bef624a494bf65142ce Mon Sep 17 00:00:00 2001
From: "marina.kolpakova"
Date: Wed, 14 Nov 2012 20:34:17 +0400
Subject: [PATCH 075/155] fixed compile without cuda

---
 modules/gpu/src/softcascade.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index bf543150b..d5a8e8481 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -52,7 +52,6 @@ cv::gpu::SCascade::~SCascade() { throw_nogpu(); }
 bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
 void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
-void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, const int, Stream&) const { throw_nogpu(); }
 void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_nogpu(); }

From d47c112434064c90816af2cf993e1ede3b303a7a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 12 Nov 2012 12:18:31 +0400
Subject: [PATCH 076/155] fix abs_func and minimum/maximum functors

---
 .../include/opencv2/gpu/device/functional.hpp | 89 +++++++++++++++++--
 .../include/opencv2/gpu/device/vec_math.hpp   |  4 +-
 modules/gpu/src/cuda/pyrlk.cu                 |  4 +-
 modules/gpu/src/cuda/surf.cu                  |  4 +-
 4 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp
index c601cf527..6e0471e9a 100644
--- a/modules/gpu/include/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp
@@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
     template <> struct name<type> : binary_function<type, type, type> \
     { \
         __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
-        __device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
-        __device__ __forceinline__ name():binary_function<type, type, type>(){}\
+        __device__ __forceinline__ name() {}\
+        __device__ __forceinline__ name(const name&) {}\
     };

     template <typename T> struct maximum : binary_function<T, T, T>
     {
         __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs,
typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? rhs : lhs; + return max(lhs, rhs); } - __device__ __forceinline__ maximum(const maximum& other):binary_function(){} - __device__ __forceinline__ maximum():binary_function(){} + __device__ __forceinline__ maximum() {} + __device__ __forceinline__ maximum(const maximum&) {} }; OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max) @@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device { __device__ __forceinline__ T operator()(typename TypeTraits::ParameterType lhs, typename TypeTraits::ParameterType rhs) const { - return lhs < rhs ? lhs : rhs; + return min(lhs, rhs); } - __device__ __forceinline__ minimum(const minimum& other):binary_function(){} - __device__ __forceinline__ minimum():binary_function(){} + __device__ __forceinline__ minimum() {} + __device__ __forceinline__ minimum(const minimum&) {} }; OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min) @@ -350,6 +350,78 @@ namespace cv { namespace gpu { namespace device // Math functions ///bound========================================= + + template struct abs_func : unary_function + { + __device__ __forceinline__ T operator ()(typename TypeTraits::ParameterType x) const + { + return abs(x); + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned char operator ()(unsigned char x) const + { + return x; + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ signed char operator ()(signed char x) const + { + return ::abs(x); + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ char operator ()(char x) const + { + return ::abs(x); + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned short operator ()(unsigned short x) const + { + return x; + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ short operator ()(short x) const + { + return ::abs(x); + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ unsigned int operator ()(unsigned int x) const + { + return x; + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ int operator ()(int x) const + { + return ::abs(x); + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ float operator ()(float x) const + { + return ::fabsf(x); + } + }; + template <> struct abs_func : unary_function + { + __device__ __forceinline__ double operator ()(double x) const + { + return ::fabs(x); + } + }; + #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \ template struct name ## _func : unary_function \ { \ @@ -382,7 +454,6 @@ namespace cv { namespace gpu { namespace device } \ }; - OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2) diff --git a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp index 0ec790c0b..1c46dc0c3 100644 --- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp @@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! 
, logical_not) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \ - OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \ + OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \ @@ -327,4 +327,4 @@ namespace cv { namespace gpu { namespace device #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP }}} // namespace cv { namespace gpu { namespace device -#endif // __OPENCV_GPU_VECMATH_HPP__ \ No newline at end of file +#endif // __OPENCV_GPU_VECMATH_HPP__ diff --git a/modules/gpu/src/cuda/pyrlk.cu b/modules/gpu/src/cuda/pyrlk.cu index d1a65c210..811c3b90b 100644 --- a/modules/gpu/src/cuda/pyrlk.cu +++ b/modules/gpu/src/cuda/pyrlk.cu @@ -267,7 +267,7 @@ namespace cv { namespace gpu { namespace device } __device__ __forceinline__ float4 abs_(const float4& a) { - return fabs(a); + return abs(a); } template @@ -681,4 +681,4 @@ namespace cv { namespace gpu { namespace device } }}} -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index 8c80559c5..aebda0ea9 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -638,7 +638,7 @@ namespace cv { namespace gpu { namespace device kp_dir *= 180.0f / CV_PI_F; kp_dir = 360.0f - kp_dir; - if (abs(kp_dir - 360.f) < FLT_EPSILON) + if (::fabsf(kp_dir - 360.f) < FLT_EPSILON) kp_dir = 0.f; featureDir[blockIdx.x] = kp_dir; @@ -1003,4 +1003,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ From 7a1874b2ccf9bafd12a343baa2f133194439f659 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 10:55:32 +0400 Subject: [PATCH 077/155] new reduce and reduceKeyVal implementation --- .../opencv2/gpu/device/detail/reduce.hpp | 352 +++++++++++++ .../gpu/device/detail/reduce_key_val.hpp | 489 ++++++++++++++++++ .../gpu/include/opencv2/gpu/device/reduce.hpp | 197 +++++++ .../include/opencv2/gpu/device/utility.hpp | 2 +- .../opencv2/gpu/device/vec_distance.hpp | 8 +- .../opencv2/gpu/device/warp_shuffle.hpp | 97 ++++ modules/gpu/src/cuda/orb.cu | 14 +- modules/gpu/src/cuda/surf.cu | 4 +- 8 files changed, 1149 insertions(+), 14 deletions(-) create mode 100644 modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp create mode 100644 modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp create mode 100644 modules/gpu/include/opencv2/gpu/device/reduce.hpp create mode 100644 modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp new file mode 100644 index 000000000..628129ea3 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp @@ -0,0 +1,352 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(val); + + For::loadToSmem(smem, val, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(val) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, val, tid); + } + + template + static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op) + { + typename GetType::type>::type reg = thrust::get(smem)[tid + delta]; + thrust::get(smem)[tid] = thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::merge(smem, val, tid, delta, op); + } + template + static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op) + { + typename GetType::type>::type reg = shfl_down(thrust::get(val), delta, width); + thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::mergeShfl(val, delta, width, op); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int) + { 
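// [editor's note, not part of the patch] This empty-bodied For<N, N>
// specialization terminates the compile-time recursion over the tuple
// elements: all of its members are deliberately no-ops.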
+ } + + template + static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + template + static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + }; + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid) + { + smem[tid] = val; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid) + { + val = smem[tid]; + } + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + template + __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + unsigned int delta, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::merge(smem, val, tid, delta, op); + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& val, + unsigned int delta, + unsigned int width, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::mergeShfl(val, delta, width, op); + } + + template struct Generic + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + loadToSmem(smem, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(smem, val, tid, 1024, op); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(smem, val, tid, 512, op); + + __syncthreads(); + } + if (N >= 512) + { + if (tid < 256) + merge(smem, val, tid, 256, op); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(smem, val, tid, 128, op); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(smem, val, tid, 64, op); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(smem, val, tid, 32, op); + } + + if (tid < 16) + { + merge(smem, val, tid, 16, op); + merge(smem, val, tid, 8, op); + merge(smem, val, tid, 4, op); + merge(smem, val, tid, 2, op); + merge(smem, val, tid, 1, op); + } + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + #if __CUDA_ARCH >= 300 + (void) smem; + (void) tid; + + #pragma unroll + for (unsigned int i = N / 2; i >= 1; i /= 2) + mergeShfl(val, i, N, op); + #else + loadToSmem(smem, val, tid); + + if (tid < N / 2) + { + #pragma unroll + for (unsigned int i = N / 2; i >= 1; i /= 2) + merge(smem, val, tid, i, op); + } + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + const unsigned int laneId = Warp::laneId(); + + #if __CUDA_ARCH >= 300 + #pragma unroll + for (int i = 16; i >= 1; i /= 2) + mergeShfl(val, i, warpSize, 
op); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #else + loadToSmem(smem, val, tid); + + if (laneId < 16) + { + #pragma unroll + for (int i = 16; i >= 1; i /= 2) + merge(smem, val, tid, i, op); + } + + __syncthreads(); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #endif + + __syncthreads(); + + loadFromSmem(smem, val, tid); + + if (tid < 32) + { + #if __CUDA_ARCH >= 300 + #pragma unroll + for (int i = M / 2; i >= 1; i /= 2) + mergeShfl(val, i, M, op); + #else + #pragma unroll + for (int i = M / 2; i >= 1; i /= 2) + merge(smem, val, tid, i, op); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp new file mode 100644 index 000000000..f7531da24 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp @@ -0,0 +1,489 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_key_val_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(data); + + For::loadToSmem(smem, data, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(data) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, data, tid); + } + + template + static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width) + { + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + + For::copyShfl(val, delta, width); + } + template + static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta) + { + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + + For::copy(svals, val, tid, delta); + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width) + { + typename GetType::type>::type reg = shfl_down(thrust::get(key), delta, width); + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(key) = reg; + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + } + + For::mergeShfl(key, val, cmp, delta, width); + } + template + static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key, + const ValPointerTuple& svals, const ValReferenceTuple& val, + const CmpTuple& cmp, + unsigned int tid, unsigned int delta) + { + typename GetType::type>::type reg = thrust::get(skeys)[tid + delta]; + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(skeys)[tid] = thrust::get(key) = reg; + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + } + + For::merge(skeys, key, svals, val, cmp, tid, delta); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + + template + static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int) + { + } + template + static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int) + { + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int) + { + } + template + static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&, + const ValPointerTuple&, const ValReferenceTuple&, + const CmpTuple&, + unsigned int, unsigned int) + { + } + }; + + ////////////////////////////////////////////////////// + // loadToSmem + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid) + { + smem[tid] = data; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid) + { + data = smem[tid]; + } 
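// [editor's note, not part of the patch] A minimal usage sketch of the
// key/value reduction this file implements, mirroring how the matcher
// kernels later in this series call it (array sizes and names here are
// illustrative, not from the commit):
//
//     __shared__ float s_dist[32];   // keys: candidate distances
//     __shared__ int   s_idx[32];    // values: matching train indices
//     float dist = ...;              // this thread's best distance
//     int   idx  = ...;              // and its index
//     reduceKeyVal<32>(s_dist, dist, s_idx, idx, threadIdx.x, less<float>());
//     // afterwards, thread 0 holds the smallest distance and its index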
+ template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, data, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + ////////////////////////////////////////////////////// + // copyVals + + template + __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) + { + val = shfl_down(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta) + { + svals[tid] = val = svals[tid + delta]; + } + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, + unsigned int delta, + int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + + ////////////////////////////////////////////////////// + // merge + + template + __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(K& key, + const thrust::tuple& val, + const Cmp& cmp, + unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& key, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::mergeShfl(key, val, cmp, delta, width); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } + + ////////////////////////////////////////////////////// + // Generic + + template struct Generic + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + loadToSmem(skeys, key, tid); + loadValsToSmem(svals, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(skeys, key, svals, val, cmp, tid, 1024); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(skeys, key, svals, val, cmp, tid, 512); + + __syncthreads(); + } + 
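// [editor's note, not part of the patch] Each guarded step in this method
// halves the number of active threads, merging the upper half of the shared
// key/value arrays into the lower half; once only a warp remains (tid < 16
// and below), the merges run without __syncthreads(), relying on
// warp-synchronous execution over the volatile shared pointers.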
if (N >= 512) + { + if (tid < 256) + merge(skeys, key, svals, val, cmp, tid, 256); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(skeys, key, svals, val, cmp, tid, 128); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(skeys, key, svals, val, cmp, tid, 64); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(skeys, key, svals, val, cmp, tid, 32); + } + + if (tid < 16) + { + merge(skeys, key, svals, val, cmp, tid, 16); + merge(skeys, key, svals, val, cmp, tid, 8); + merge(skeys, key, svals, val, cmp, tid, 4); + merge(skeys, key, svals, val, cmp, tid, 2); + merge(skeys, key, svals, val, cmp, tid, 1); + } + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + #if __CUDA_ARCH >= 300 + (void) skeys; + (void) svals; + (void) tid; + + #pragma unroll + for (unsigned int i = N / 2; i >= 1; i /= 2) + mergeShfl(key, val, cml, i, N); + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (tid < N / 2) + { + #pragma unroll + for (unsigned int i = N / 2; i >= 1; i /= 2) + merge(skeys, key, svals, val, cmp, tid, i); + } + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + const unsigned int laneId = Warp::laneId(); + + #if __CUDA_ARCH >= 300 + #pragma unroll + for (unsigned int i = 16; i >= 1; i /= 2) + mergeShfl(key, val, cml, i, warpSize); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (laneId < 16) + { + #pragma unroll + for (int i = 16; i >= 1; i /= 2) + merge(skeys, key, svals, val, cmp, tid, i); + } + + __syncthreads(); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #endif + + __syncthreads(); + + loadFromSmem(skeys, key, tid); + + if (tid < 32) + { + #if __CUDA_ARCH >= 300 + loadFromSmem(svals, val, tid); + + #pragma unroll + for (unsigned int i = M / 2; i >= 1; i /= 2) + mergeShfl(key, val, cml, i, M); + #else + #pragma unroll + for (unsigned int i = M / 2; i >= 1; i /= 2) + merge(skeys, key, svals, val, cmp, tid, i); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/reduce.hpp new file mode 100644 index 000000000..2161b0649 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp @@ -0,0 +1,197 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_HPP__ +#define __OPENCV_GPU_REDUCE_HPP__ + +#include +#include "detail/reduce.hpp" +#include "detail/reduce_key_val.hpp" + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op) + { + reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); + } + template + __device__ __forceinline__ void reduce(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& op) + { + reduce_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&>(smem, val, tid, op); + } + + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, + const thrust::tuple&, + const Cmp&>(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple& + >(skeys, key, svals, val, tid, cmp); + } + + // smem_tuple + + template + 
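// [editor's note, not part of the patch] The smem_tuple() overloads below
// package several shared-memory arrays into one thrust::tuple of volatile
// pointers, so a single reduce()/reduceKeyVal() call can fold several values
// in lockstep. A hypothetical two-value sum using the functions declared
// above:
//
//     __shared__ float s_x[64];
//     __shared__ float s_y[64];
//     float x = ..., y = ...;
//     reduce<64>(smem_tuple(s_x, s_y), thrust::tie(x, y), threadIdx.x,
//                thrust::make_tuple(plus<float>(), plus<float>()));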
__device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0) + { + return thrust::make_tuple((volatile T0*) t0); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9); + } +}}} + +#endif // __OPENCV_GPU_UTILITY_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 4489a20b1..e44d51a6a 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // Reduction - template __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) + template __device__ __forceinline__ void reduce_old(volatile T* data, T& partial_reduction, int tid, const Op& op) { StaticAssert= 8 && n <= 512>::check(); utility_detail::ReductionDispatcher::reduce(data, partial_reduction, tid, op); diff --git a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp index b7861bca7..f65af3aa5 100644 --- 
a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce_old(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce_old(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce_old(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce_old(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const diff --git a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp new file mode 100644 index 000000000..39b7e852a --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp @@ -0,0 +1,97 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__ +#define __OPENCV_GPU_WARP_SHUFFLE_HPP__ + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl(val, srcLane, width); + #else + return T(); + #endif + } + __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl(lo, srcLane, width); + hi = __shfl(hi, srcLane, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_down(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_down(lo, delta, width); + hi = __shfl_down(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } +}}} + +#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__ diff --git a/modules/gpu/src/cuda/orb.cu b/modules/gpu/src/cuda/orb.cu index 2d441a472..91c570957 100644 --- a/modules/gpu/src/cuda/orb.cu +++ b/modules/gpu/src/cuda/orb.cu @@ -109,9 +109,9 @@ namespace cv { namespace gpu { namespace device c += Ix * Iy; } - reduce<32>(srow, a, threadIdx.x, plus()); - reduce<32>(srow, b, threadIdx.x, plus()); - reduce<32>(srow, c, threadIdx.x, plus()); + reduce_old<32>(srow, a, threadIdx.x, plus()); + reduce_old<32>(srow, b, threadIdx.x, plus()); + reduce_old<32>(srow, c, threadIdx.x, plus()); if (threadIdx.x == 0) { @@ -167,7 +167,7 @@ namespace cv { namespace gpu { namespace device for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x) m_10 += u * image(loc.y, loc.x + u); - reduce<32>(srow, m_10, threadIdx.x, plus()); + reduce_old<32>(srow, m_10, threadIdx.x, plus()); for (int v = 1; v <= half_k; ++v) { @@ -185,8 +185,8 @@ namespace cv { namespace gpu { namespace device m_sum += u * (val_plus + val_minus); } - reduce<32>(srow, v_sum, threadIdx.x, plus()); - reduce<32>(srow, m_sum, threadIdx.x, plus()); + reduce_old<32>(srow, v_sum, threadIdx.x, plus()); + reduce_old<32>(srow, m_sum, threadIdx.x, plus()); m_10 += m_sum; m_01 += v * v_sum; @@ -419,4 +419,4 @@ namespace cv { namespace gpu { namespace device } }}} -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index aebda0ea9..451fb425d 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -599,8 +599,8 @@ namespace cv { namespace gpu { namespace device sumy += s_Y[threadIdx.x + 96]; } - device::reduce<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus()); - 
device::reduce<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus()); + device::reduce_old<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus()); + device::reduce_old<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus()); const float temp_mod = sumx * sumx + sumy * sumy; if (temp_mod > best_mod) From 05db02fbc8af694c89c4740ecb067ce71eefd999 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 12:46:49 +0400 Subject: [PATCH 078/155] BruteForceMatcher --- .../opencv2/gpu/device/vec_distance.hpp | 10 +- modules/gpu/src/cuda/bf_knnmatch.cu | 97 ++++++++++++++++++- modules/gpu/src/cuda/bf_match.cu | 21 ++-- modules/gpu/src/cuda/bf_radius_match.cu | 13 +-- 4 files changed, 108 insertions(+), 33 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp index f65af3aa5..d5b4bb202 100644 --- a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp @@ -43,7 +43,7 @@ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__ -#include "utility.hpp" +#include "reduce.hpp" #include "functional.hpp" #include "detail/vec_distance_detail.hpp" @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce_old(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce_old(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce_old(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce_old(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index 6a778735b..44e567b75 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -42,10 +42,13 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" +#include "opencv2/gpu/device/warp_shuffle.hpp" namespace cv { namespace gpu { namespace device { @@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device int& bestTrainIdx1, int& bestTrainIdx2, float* s_distance, int* s_trainIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + + float d1, d2; + int i1, i2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + 
bestTrainIdx2 = i1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx2 = myBestTrainIdx2; + #endif } template @@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device int& bestImgIdx1, int& bestImgIdx2, float* s_distance, int* s_trainIdx, int* s_imgIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + (void) s_imgIdx; + + float d1, d2; + int i1, i2; + int j1, j2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE); + j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + bestImgIdx2 = j1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + bestImgIdx2 = bestImgIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + bestImgIdx1 = j1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + bestImgIdx2 = j2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device bestImgIdx1 = myBestImgIdx1; bestImgIdx2 = myBestImgIdx2; + #endif } /////////////////////////////////////////////////////////////////////////////// @@ -1005,7 +1096,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx[threadIdx.x] = bestIdx; __syncthreads(); - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); + reduceKeyVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); if (threadIdx.x == 0) { @@ -1164,4 +1255,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index f50089ed9..9745dee82 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -42,7 +42,9 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device s_distance += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - - __syncthreads(); - - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); } template @@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE; - 
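// [editor's note, not part of the patch] The hand-written staging stores and
// the explicit __syncthreads() are dropped here because the new
// reduceKeyVal() performs its own loadToSmem()/barrier sequencing internally
// (see detail/reduce_key_val.hpp earlier in this series).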
s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; - - __syncthreads(); - - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less()); } /////////////////////////////////////////////////////////////////////////////// @@ -782,4 +773,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 934b8fe84..bb828829f 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -42,7 +42,8 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device __global__ void matchUnrolled(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device __global__ void match(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -469,4 +462,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ From e2995956670488282b4743bc2841279f6481a2ee Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 12:50:00 +0400 Subject: [PATCH 079/155] computeHypothesisScoresKernel --- modules/gpu/src/cuda/calib3d.cu | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 40c847547..0fd482c41 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -42,9 +42,10 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/reduce.hpp" namespace cv { namespace gpu { namespace device { @@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); } + 
__device__ __forceinline__ TransformOp() {} + __device__ __forceinline__ TransformOp(const TransformOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); } + __device__ __forceinline__ ProjectOp() {} + __device__ __forceinline__ ProjectOp(const ProjectOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device return x * x; } + template __global__ void computeHypothesisScoresKernel( const int num_points, const float3* object, const float2* image, const float dist_threshold, int* g_num_inliers) @@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device ++num_inliers; } - extern __shared__ float s_num_inliers[]; - s_num_inliers[threadIdx.x] = num_inliers; - __syncthreads(); - - for (int step = blockDim.x / 2; step > 0; step >>= 1) - { - if (threadIdx.x < step) - s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step]; - __syncthreads(); - } + __shared__ int s_num_inliers[BLOCK_SIZE]; + reduce(s_num_inliers, num_inliers, threadIdx.x, plus()); if (threadIdx.x == 0) - g_num_inliers[blockIdx.x] = s_num_inliers[0]; + g_num_inliers[blockIdx.x] = num_inliers; } void computeHypothesisScores( @@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device dim3 threads(256); dim3 grid(num_hypotheses); - int smem_size = threads.x * sizeof(float); - computeHypothesisScoresKernel<<>>( + computeHypothesisScoresKernel<256><<>>( num_points, object, image, dist_threshold, hypothesis_scores); cudaSafeCall( cudaGetLastError() ); @@ -193,4 +190,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ From 28716d7f306cfc89d7e5507259e98a05bd9b7b8b Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 13:02:17 +0400 Subject: [PATCH 080/155] Canny --- modules/gpu/include/opencv2/gpu/gpu.hpp | 28 +- modules/gpu/src/cuda/canny.cu | 790 ++++++++++++------------ modules/gpu/src/imgproc.cpp | 107 ++-- modules/gpu/test/test_imgproc.cpp | 2 +- 4 files changed, 450 insertions(+), 477 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 2cbd45085..4396a0a10 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -792,31 +792,23 @@ private: GpuMat lab, l, ab; }; +struct CV_EXPORTS CannyBuf +{ + void create(const Size& image_size, int apperture_size = 3); + void release(); -struct CV_EXPORTS CannyBuf; + GpuMat dx, dy; + GpuMat mag; + GpuMat map; + GpuMat st1, st2; + Ptr filterDX, filterDY; +}; CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); -struct CV_EXPORTS CannyBuf -{ - CannyBuf() {} - explicit CannyBuf(const Size& image_size, int apperture_size = 3) 
{create(image_size, apperture_size);} - CannyBuf(const GpuMat& dx_, const GpuMat& dy_); - - void create(const Size& image_size, int apperture_size = 3); - - void release(); - - GpuMat dx, dy; - GpuMat dx_buf, dy_buf; - GpuMat edgeBuf; - GpuMat trackBuf1, trackBuf2; - Ptr filterDX, filterDY; -}; - class CV_EXPORTS ImagePyramid { public: diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index 3dc048678..b08a61c83 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -43,459 +43,463 @@ #if !defined CUDA_DISABLER #include -#include -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/emulation.hpp" +#include "opencv2/gpu/device/transform.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/utility.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace +{ + struct L1 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const + { + return ::abs(x) + ::abs(y); + } + + __device__ __forceinline__ L1() {} + __device__ __forceinline__ L1(const L1&) {} + }; + struct L2 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const + { + return ::sqrtf(x * x + y * y); + } + + __device__ __forceinline__ L2() {} + __device__ __forceinline__ L2(const L2&) {} + }; +} namespace cv { namespace gpu { namespace device { - namespace canny + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits { - __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} + +namespace +{ + texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); + struct SrcTex + { + const int xoff; + const int yoff; + __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} + + __device__ __forceinline__ int operator ()(int y, int x) const { - __shared__ int smem[16][18]; + return tex2D(tex_src, x + xoff, y + yoff); + } + }; - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + template __global__ + void calcMagnitude(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (i < rows) - { - smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; - if (threadIdx.x == 0) - { - smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)]; - smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)]; - } - __syncthreads(); + if (y >= mag.rows || x >= mag.cols) + return; - if (j < cols) - { - dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; - dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; - } - } + int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); + int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); + + dx(y, x) = dxVal; + dy(y, x) = dyVal; + + mag(y, x) = norm(dxVal, dyVal); + } +} + +namespace canny +{ + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + const 
dim3 block(16, 16); + const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); + + bindTexture(&tex_src, srcWhole); + SrcTex src(xoff, yoff); + + if (L2Grad) + { + L2 norm; + ::calcMagnitude<<>>(src, dx, dy, mag, norm); + } + else + { + L1 norm; + ::calcMagnitude<<>>(src, dx, dy, mag, norm); } - void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall(cudaThreadSynchronize()); + } + + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + if (L2Grad) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - calcSobelRowPass<<>>(src, dx_buf, dy_buf, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); + L2 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } - - struct L1 + else { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::abs(x) + ::abs(y); - } - }; - struct L2 - { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::sqrtf(x * x + y * y); - } - }; - - template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, - PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - __shared__ int sdx[18][16]; - __shared__ int sdy[18][16]; - - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; - - if (j < cols) - { - sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; - sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; - if (threadIdx.y == 0) - { - sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j]; - sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j]; - - sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j]; - sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j]; - } - __syncthreads(); - - if (i < rows) - { - int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; - int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; - - dx.ptr(i)[j] = x; - dy.ptr(i)[j] = y; - - mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); - } - } + L1 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } + } +} - void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) +////////////////////////////////////////////////////////////////////////////////////////// + +namespace +{ + texture tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); + + __global__ void calcMap(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) + { + const int CANNY_SHIFT = 15; + const int TG22 = (int)(0.4142135623730950488016887242097*(1<= dx.cols || y >= dx.rows) + return; + + int dxVal = dx(y, x); + int dyVal = dy(y, x); + + const int s = (dxVal ^ dyVal) < 0 ? 
-1 : 1; + const float m = tex2D(tex_mag, x, y); + + dxVal = ::abs(dxVal); + dyVal = ::abs(dyVal); + + // 0 - the pixel can not belong to an edge + // 1 - the pixel might belong to an edge + // 2 - the pixel does belong to an edge + int edge_type = 0; + + if (m > low_thresh) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + const int tg22x = dxVal * TG22; + const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); - if (L2Grad) - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); + dyVal <<= CANNY_SHIFT; + + if (dyVal < tg22x) + { + if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) + edge_type = 1 + (int)(m > high_thresh); + } + else if(dyVal > tg67x) + { + if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } else - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); - - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall(cudaThreadSynchronize()); + { + if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } } - template __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + map(y, x) = edge_type; + } +} - if (i < rows && j < cols) - mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); +namespace canny +{ + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + { + const dim3 block(16, 16); + const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); + + bindTexture(&tex_mag, mag); + + ::calcMap<<>>(dx, dy, map, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// + +namespace +{ + __device__ int counter = 0; + + __global__ void edgesHysteresisLocal(PtrStepSzi map, ushort2* st) + { + __shared__ volatile int smem[18][18]; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0; + if (threadIdx.y == 0) + smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0; + if (threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0; + if (threadIdx.x == 0) + smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1) + smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == 0) + smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) + smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? 
map(y + 1, x + 1) : 0; + + __syncthreads(); + + if (x >= map.cols || y >= map.rows) + return; + + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; + + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } + + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; } - void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; + + map(y, x) = e; + + n = 0; + + if (e == 2) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - if (L2Grad) - calcMagnitude<<>>(dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx, dy, mag, rows, cols); + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; } - ////////////////////////////////////////////////////////////////////////////////////////// - - #define CANNY_SHIFT 15 - #define TG22 (int)(0.4142135623730950488016887242097*(1< 0) { - __shared__ float smem[18][18]; + const int ind = ::atomicAdd(&counter, 1); + st[ind] = make_ushort2(x, y); + } + } +} - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; +namespace canny +{ + void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); - if (ly < 14) - smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + const dim3 block(16, 16); + const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + ::edgesHysteresisLocal<<>>(map, st1); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +////////////////////////////////////////////////////////////////////////////////////////// + +namespace +{ + __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + + __global__ void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count) + { + const int stack_size = 512; + + __shared__ int s_counter; + __shared__ int s_ind; + __shared__ ushort2 s_st[stack_size]; + + if (threadIdx.x == 0) + s_counter = 0; + + __syncthreads(); + + int ind = blockIdx.y * gridDim.x + blockIdx.x; + + if (ind >= count) + return; + + ushort2 pos = st1[ind]; + + if (threadIdx.x < 8) + { + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; + + if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 
&& pos.y <= map.rows && map(pos.y, pos.x) == 1) + { + map(pos.y, pos.x) = 2; + + ind = Emulation::smem::atomicAdd(&s_counter, 1); + + s_st[ind] = pos; + } + } + + __syncthreads(); + + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) + { + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); + + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; __syncthreads(); - if (i < rows && j < cols) - { - int x = dx.ptr(i)[j]; - int y = dy.ptr(i)[j]; - const int s = (x ^ y) < 0 ? -1 : 1; - const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; - - x = ::abs(x); - y = ::abs(y); - - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; - - if (m > low_thresh) - { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - - y <<= CANNY_SHIFT; - - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } - } - - map.ptr(i + 1)[j + 1] = edge_type; - } - } - - #undef CANNY_SHIFT - #undef TG22 - - void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - - ////////////////////////////////////////////////////////////////////////////////////////// - - __device__ unsigned int counter = 0; - - __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) - { - #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120) - - __shared__ int smem[18][18]; - - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; - - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; - - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; - - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; - - __syncthreads(); - - if (i < rows && j < cols) - { - int n; - - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; - - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } - - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } - - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - - map.ptr(i + 1)[j + 1] = e; - - n = 0; - - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y 
][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; - } - - if (n > 0) - { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } - } - - #endif - } - - void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) - { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - } - - __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; - __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - - __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120 - - const int stack_size = 512; - - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; - if (threadIdx.x == 0) - s_counter = 0; + s_counter -= portion; + __syncthreads(); - int ind = blockIdx.y * gridDim.x + blockIdx.x; - - if (ind < count) + if (subTaskIdx < portion) { - ushort2 pos = st1[ind]; + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + if (pos.x > 0 && pos.x <= map.cols && pos.y > 0 && pos.y <= map.rows && map(pos.y, pos.x) == 1) { - if (threadIdx.x < 8) - { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; + map(pos.y, pos.x) = 2; - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; + ind = Emulation::smem::atomicAdd(&s_counter, 1); - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) - { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = ::min(s_counter, blockDim.x >> 3); - - pos.x = pos.y = 0; - - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if (threadIdx.x == 0) - s_counter -= portion; - __syncthreads(); - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - } - - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); - - ind = s_ind; - - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; - } - } + s_st[ind] = pos; } } - #endif + __syncthreads(); } - void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) + if (s_counter > 0) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - - while (count > 0) + if (threadIdx.x == 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(128, 1, 1); 
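
Note on the hysteresis pass: both the driver being removed here and its replacement below follow the same ping-pong worklist scheme. The local kernel seeds a frontier of confirmed edge pixels into st1; each global pass then consumes one frontier and emits the next, swapping the two buffers until no new pixels are produced. A minimal self-contained sketch of that driver — followEdges, d_counter and hysteresisDriver are illustrative stand-ins for the kernel, device counter and host function in this file, not OpenCV names:

    #include <cuda_runtime.h>
    #include <algorithm>

    __device__ int d_counter = 0;

    __global__ void followEdges(const ushort2* inFrontier, ushort2* outFrontier, int count)
    {
        // Body elided in this sketch: pop queued pixels, probe the eight
        // neighbours, promote "maybe" pixels (1) to edges (2), push each
        // discovery to outFrontier and atomicAdd(&d_counter, 1) per push.
    }

    void hysteresisDriver(ushort2* st1, ushort2* st2)
    {
        void* counter_ptr = 0;
        cudaGetSymbolAddress(&counter_ptr, d_counter);

        int count = 0;
        cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost);

        while (count > 0)
        {
            cudaMemset(counter_ptr, 0, sizeof(int));      // count only this pass's pushes
            followEdges<<<count, 128>>>(st1, st2, count); // one block per queued pixel
            cudaDeviceSynchronize();                      // (the real code splits the grid
                                                          //  in 2D once count exceeds 65535)
            cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost);
            std::swap(st1, st2);                          // new frontier feeds the next pass
        }
    }

The per-block shared stack in the kernel cannot overflow: the expansion loop only keeps running while s_counter <= stack_size - blockDim.x, i.e. while there is room for every thread in the block to push one pixel, and whatever remains on the stack at loop exit is flushed to st2 through the global counter.
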
- dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - - std::swap(st1, st2); + ind = ::atomicAdd(&counter, s_counter); + s_ind = ind - s_counter; } + + __syncthreads(); + + ind = s_ind; + + for (int i = threadIdx.x; i < s_counter; i += blockDim.x) + st2[ind + i] = s_st[i]; } + } +} - __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) +namespace canny +{ + void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, ::counter) ); + + int count; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + + while (count > 0) { - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); - if (i < rows && j < cols) - dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); - } + const dim3 block(128); + const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); - void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - getEdges<<>>(map, dst, rows, cols); + ::edgesHysteresisGlobal<<>>(map, st1, st2, count); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); + + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + + std::swap(st1, st2); } - } // namespace canny -}}} // namespace cv { namespace gpu { namespace device + } +} +////////////////////////////////////////////////////////////////////////////////////////// -#endif /* CUDA_DISABLER */ \ No newline at end of file +namespace +{ + struct GetEdges : unary_function + { + __device__ __forceinline__ uchar operator ()(int e) const + { + return (uchar)(-(e >> 1)); + } + + __device__ __forceinline__ GetEdges() {} + __device__ __forceinline__ GetEdges(const GetEdges&) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} + +namespace canny +{ + void getEdges(PtrStepSzi map, PtrStepSzb dst) + { + transform(map, dst, GetEdges(), WithOutMask(), 0); + } +} + +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 0bf9c81c2..b733faf5d 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -91,7 +91,6 @@ void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_n void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); } void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); } void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, double, bool) { throw_nogpu(); } -cv::gpu::CannyBuf::CannyBuf(const GpuMat&, const GpuMat&) { throw_nogpu(); } void cv::gpu::CannyBuf::create(const Size&, int) { throw_nogpu(); } void cv::gpu::CannyBuf::release() { throw_nogpu(); } @@ -1466,92 +1465,76 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, ////////////////////////////////////////////////////////////////////////////// // Canny -cv::gpu::CannyBuf::CannyBuf(const GpuMat& dx_, 
const GpuMat& dy_) : dx(dx_), dy(dy_) -{ - CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size()); - - create(dx_.size(), -1); -} - void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size) { - ensureSizeIsEnough(image_size, CV_32SC1, dx); - ensureSizeIsEnough(image_size, CV_32SC1, dy); + if (apperture_size > 0) + { + ensureSizeIsEnough(image_size, CV_32SC1, dx); + ensureSizeIsEnough(image_size, CV_32SC1, dy); - if (apperture_size == 3) - { - ensureSizeIsEnough(image_size, CV_32SC1, dx_buf); - ensureSizeIsEnough(image_size, CV_32SC1, dy_buf); - } - else if(apperture_size > 0) - { - if (!filterDX) + if (apperture_size != 3) + { filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE); - if (!filterDY) filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE); + } } - ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, edgeBuf); + ensureSizeIsEnough(image_size, CV_32FC1, mag); + ensureSizeIsEnough(image_size, CV_32SC1, map); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2); } void cv::gpu::CannyBuf::release() { dx.release(); dy.release(); - dx_buf.release(); - dy_buf.release(); - edgeBuf.release(); - trackBuf1.release(); - trackBuf2.release(); + mag.release(); + map.release(); + st1.release(); + st2.release(); } -namespace cv { namespace gpu { namespace device +namespace canny { - namespace canny - { - void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols); + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad); - void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad); - void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad); + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh); - void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh); + void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1); - void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols); + void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2); - void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols); - - void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols); - } -}}} + void getEdges(PtrStepSzi map, PtrStepSzb dst); +} namespace { - void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh) + void CannyCaller(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh) { - using namespace ::cv::gpu::device::canny; + using namespace canny; - calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh); + calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh); - edgesHysteresisLocal_gpu(buf.edgeBuf, buf.trackBuf1.ptr(), dst.rows, dst.cols); + edgesHysteresisLocal(buf.map, buf.st1.ptr()); - edgesHysteresisGlobal_gpu(buf.edgeBuf, 
buf.trackBuf1.ptr(), buf.trackBuf2.ptr(), dst.rows, dst.cols); + edgesHysteresisGlobal(buf.map, buf.st1.ptr(), buf.st2.ptr()); - getEdges_gpu(buf.edgeBuf, dst, dst.rows, dst.cols); + getEdges(buf.map, dst); } } void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient) { - CannyBuf buf(src.size(), apperture_size); + CannyBuf buf; Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient); } void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient) { - using namespace ::cv::gpu::device::canny; + using namespace canny; CV_Assert(src.type() == CV_8UC1); @@ -1562,37 +1545,37 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th std::swap( low_thresh, high_thresh); dst.create(src.size(), CV_8U); - dst.setTo(Scalar::all(0)); - buf.create(src.size(), apperture_size); - buf.edgeBuf.setTo(Scalar::all(0)); if (apperture_size == 3) { - calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols); + Size wholeSize; + Point ofs; + src.locateROI(wholeSize, ofs); + GpuMat srcWhole(wholeSize, src.type(), src.datastart, src.step); - calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient); + calcMagnitude(srcWhole, ofs.x, ofs.y, buf.dx, buf.dy, buf.mag, L2gradient); } else { buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows)); buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows)); - calcMagnitude_gpu(buf.dx, buf.dy, buf.edgeBuf, src.rows, src.cols, L2gradient); + calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient); } - CannyCaller(buf, dst, static_cast(low_thresh), static_cast(high_thresh)); + CannyCaller(buf.dx, buf.dy, buf, dst, static_cast(low_thresh), static_cast(high_thresh)); } void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient) { - CannyBuf buf(dx, dy); + CannyBuf buf; Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient); } void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient) { - using namespace ::cv::gpu::device::canny; + using namespace canny; CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size()); @@ -1601,17 +1584,11 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& d std::swap( low_thresh, high_thresh); dst.create(dx.size(), CV_8U); - dst.setTo(Scalar::all(0)); - - buf.dx = dx; buf.dy = dy; buf.create(dx.size(), -1); - buf.edgeBuf.setTo(Scalar::all(0)); - calcMagnitude_gpu(dx, dy, buf.edgeBuf, dx.rows, dx.cols, L2gradient); + calcMagnitude(dx, dy, buf.mag, L2gradient); - CannyCaller(buf, dst, static_cast(low_thresh), static_cast(high_thresh)); + CannyCaller(dx, dy, buf, dst, static_cast(low_thresh), static_cast(high_thresh)); } #endif /* !defined (HAVE_CUDA) */ - - diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index e77cad69a..71d4a8e65 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -313,7 +313,7 @@ TEST_P(Canny, Accuracy) cv::Mat edges_gold; cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient); - EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2); + EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2); } } From 
7e57648ea22761f9e5cce2d5bba5b7b2ff71b331 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 13:09:39 +0400 Subject: [PATCH 081/155] FGDStatModel --- modules/gpu/src/cuda/fgd_bgfg.cu | 57 +++----------------------------- 1 file changed, 5 insertions(+), 52 deletions(-) diff --git a/modules/gpu/src/cuda/fgd_bgfg.cu b/modules/gpu/src/cuda/fgd_bgfg.cu index 6040d021b..6361e1811 100644 --- a/modules/gpu/src/cuda/fgd_bgfg.cu +++ b/modules/gpu/src/cuda/fgd_bgfg.cu @@ -46,6 +46,8 @@ #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" +#include "opencv2/gpu/device/functional.hpp" #include "fgd_bgfg_common.hpp" using namespace cv::gpu; @@ -181,57 +183,8 @@ namespace bgfg __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE]; __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE]; - data0[threadIdx.x] = sum0; - data1[threadIdx.x] = sum1; - data2[threadIdx.x] = sum2; - __syncthreads(); - - if (threadIdx.x < 128) - { - data0[threadIdx.x] = sum0 += data0[threadIdx.x + 128]; - data1[threadIdx.x] = sum1 += data1[threadIdx.x + 128]; - data2[threadIdx.x] = sum2 += data2[threadIdx.x + 128]; - } - __syncthreads(); - - if (threadIdx.x < 64) - { - data0[threadIdx.x] = sum0 += data0[threadIdx.x + 64]; - data1[threadIdx.x] = sum1 += data1[threadIdx.x + 64]; - data2[threadIdx.x] = sum2 += data2[threadIdx.x + 64]; - } - __syncthreads(); - - if (threadIdx.x < 32) - { - volatile unsigned int* vdata0 = data0; - volatile unsigned int* vdata1 = data1; - volatile unsigned int* vdata2 = data2; - - vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 32]; - vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 32]; - vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 32]; - - vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 16]; - vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 16]; - vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 16]; - - vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 8]; - vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 8]; - vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 8]; - - vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 4]; - vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 4]; - vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 4]; - - vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 2]; - vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 2]; - vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 2]; - - vdata0[threadIdx.x] = sum0 += vdata0[threadIdx.x + 1]; - vdata1[threadIdx.x] = sum1 += vdata1[threadIdx.x + 1]; - vdata2[threadIdx.x] = sum2 += vdata2[threadIdx.x + 1]; - } + plus op; + reduce(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op)); if(threadIdx.x == 0) { @@ -845,4 +798,4 @@ namespace bgfg template void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream); } -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ From 0ddd16cf78229ceaaeaa5fb5a226a877cfc8feb8 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 13:16:47 +0400 Subject: [PATCH 082/155] calcHist & equalizeHist --- modules/gpu/include/opencv2/gpu/gpu.hpp | 2 - modules/gpu/perf/perf_imgproc.cpp | 5 +- modules/gpu/src/cuda/hist.cu | 273 
+++++++++--------------- modules/gpu/src/imgproc.cpp | 50 +---- 4 files changed, 114 insertions(+), 216 deletions(-) diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 4396a0a10..4aed05cae 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -1028,11 +1028,9 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels //! Calculates histogram for 8u one channel image //! Output hist will have one row, 256 cols and CV32SC1 type. CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); -CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //! normalizes the grayscale image brightness and contrast by normalizing its histogram CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); -CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //////////////////////////////// StereoBM_GPU //////////////////////////////// diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 30377e148..a4e199bc7 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES) { cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_hist; - cv::gpu::GpuMat d_buf; - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); TEST_CYCLE() { - cv::gpu::calcHist(d_src, d_hist, d_buf); + cv::gpu::calcHist(d_src, d_hist); } GPU_SANITY_CHECK(d_hist); diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu index 3a2b59b85..2adc5d5b4 100644 --- a/modules/gpu/src/cuda/hist.cu +++ b/modules/gpu/src/cuda/hist.cu @@ -43,182 +43,115 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" -#include "opencv2/gpu/device/utility.hpp" -#include "opencv2/gpu/device/saturate_cast.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/emulation.hpp" +#include "opencv2/gpu/device/transform.hpp" + +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace +{ + __global__ void histogram256(const uchar* src, int cols, int rows, size_t step, int* hist) + { + __shared__ int shist[256]; + + const int y = blockIdx.x * blockDim.y + threadIdx.y; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + + shist[tid] = 0; + __syncthreads(); + + if (y < rows) + { + const unsigned int* rowPtr = (const unsigned int*) (src + y * step); + + const int cols_4 = cols / 4; + for (int x = threadIdx.x; x < cols_4; x += blockDim.x) + { + unsigned int data = rowPtr[x]; + + Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1); + Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1); + Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1); + Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1); + } + + if (cols % 4 != 0 && threadIdx.x == 0) + { + for (int x = cols_4 * 4; x < cols; ++x) + { + unsigned int data = ((const uchar*)rowPtr)[x]; + Emulation::smem::atomicAdd(&shist[data], 1); + } + } + } + + __syncthreads(); + + const int histVal = shist[tid]; + if (histVal > 0) + ::atomicAdd(hist + tid, histVal); + } +} + +namespace hist +{ + void histogram256(PtrStepSzb src, int* hist, 
cudaStream_t stream) + { + const dim3 block(32, 8); + const dim3 grid(divUp(src.rows, block.y)); + + ::histogram256<<>>(src.data, src.cols, src.rows, src.step, hist); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +///////////////////////////////////////////////////////////////////////// + +namespace +{ + __constant__ int c_lut[256]; + + struct EqualizeHist : unary_function + { + float scale; + + __host__ EqualizeHist(float _scale) : scale(_scale) {} + + __device__ __forceinline__ uchar operator ()(uchar val) const + { + const int lut = c_lut[val]; + return __float2int_rn(scale * lut); + } + }; +} namespace cv { namespace gpu { namespace device { - #define UINT_BITS 32U - - //Warps == subhistograms per threadblock - #define WARP_COUNT 6 - - //Threadblock size - #define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE) - #define HISTOGRAM256_BIN_COUNT 256 - - //Shared memory per threadblock - #define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT) - - #define PARTIAL_HISTOGRAM256_COUNT 240 - - #define MERGE_THREADBLOCK_SIZE 256 - - #define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)) - - namespace hist + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits { - #if (!USE_SMEM_ATOMICS) - - #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U ) - - __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag) - { - uint count; - do - { - count = s_WarpHist[data] & TAG_MASK; - count = threadTag | (count + 1); - s_WarpHist[data] = count; - } while (s_WarpHist[data] != count); - } - - #else - - #define TAG_MASK 0xFFFFFFFFU - - __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag) - { - atomicAdd(s_WarpHist + data, 1); - } - - #endif - - __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols) - { - uint x = pos_x << 2; - - if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag); - if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag); - if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag); - if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag); - } - - __global__ void histogram256(const PtrStep d_Data, uint* d_PartialHistograms, uint dataCount, uint cols) - { - //Per-warp subhistogram storage - __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY]; - uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT; - - //Clear shared memory storage for current threadblock before processing - #pragma unroll - for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++) - s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0; - - //Cycle through the entire data set, update subhistograms for each warp - const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE); - - __syncthreads(); - const uint colsui = d_Data.step / sizeof(uint); - for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x) - { - uint pos_y = pos / colsui; - uint pos_x = pos % colsui; - uint data = d_Data.ptr(pos_y)[pos_x]; - addWord(s_WarpHist, data, tag, pos_x, cols); - } - - //Merge per-warp histograms into per-block and write to global memory - __syncthreads(); - for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE) - { - uint sum = 0; - - for (uint i = 0; 
i < WARP_COUNT; i++) - sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK; - - d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum; - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Merge histogram256() output - // Run one threadblock per bin; each threadblock adds up the same bin counter - // from every partial histogram. Reads are uncoalesced, but mergeHistogram256 - // takes only a fraction of total processing time - //////////////////////////////////////////////////////////////////////////////// - - __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram) - { - uint sum = 0; - - #pragma unroll - for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE) - sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT]; - - __shared__ uint data[MERGE_THREADBLOCK_SIZE]; - data[threadIdx.x] = sum; - - for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1) - { - __syncthreads(); - if(threadIdx.x < stride) - data[threadIdx.x] += data[threadIdx.x + stride]; - } - - if(threadIdx.x == 0) - d_Histogram[blockIdx.x] = saturate_cast(data[0]); - } - - void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream) - { - histogram256<<>>( - PtrStepSz(src), - buf, - static_cast(src.rows * src.step / sizeof(uint)), - src.cols); - - cudaSafeCall( cudaGetLastError() ); - - mergeHistogram256<<>>(buf, hist); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - __constant__ int c_lut[256]; - - __global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x < src.cols && y < src.rows) - { - const uchar val = src.ptr(y)[x]; - const int lut = c_lut[val]; - dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut); - } - } - - void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream) - { - dim3 block(16, 16); - dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); + enum { smart_shift = 4 }; + }; +}}} +namespace hist +{ + void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream) + { + if (stream == 0) cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) ); + else + cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) ); - equalizeHist<<>>(src, dst); - cudaSafeCall( cudaGetLastError() ); + const float scale = 255.0f / (src.cols * src.rows); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - } // namespace hist -}}} // namespace cv { namespace gpu { namespace device + transform(src, dst, EqualizeHist(scale), WithOutMask(), stream); + } +} - -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index b733faf5d..b216de874 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -71,9 +71,7 @@ void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, GpuMat&, Stream&) void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_nogpu(); } void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::calcHist(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } -void 
cv::gpu::calcHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } -void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::equalizeHist(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); } @@ -989,36 +987,20 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4 hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream)); } -namespace cv { namespace gpu { namespace device +namespace hist { - namespace hist - { - void histogram256_gpu(PtrStepSzb src, int* hist, unsigned int* buf, cudaStream_t stream); - - const int PARTIAL_HISTOGRAM256_COUNT = 240; - const int HISTOGRAM256_BIN_COUNT = 256; - - void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream); - } -}}} + void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream); + void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream); +} void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream) { - GpuMat buf; - calcHist(src, hist, buf, stream); -} - -void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream) -{ - using namespace ::cv::gpu::device::hist; - CV_Assert(src.type() == CV_8UC1); hist.create(1, 256, CV_32SC1); + hist.setTo(Scalar::all(0)); - ensureSizeIsEnough(1, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT, CV_32SC1, buf); - - histogram256_gpu(src, hist.ptr(), buf.ptr(), StreamAccessor::getStream(stream)); + hist::histogram256(src, hist.ptr(), StreamAccessor::getStream(stream)); } void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream) @@ -1028,16 +1010,8 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream) equalizeHist(src, dst, hist, buf, stream); } -void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream) -{ - GpuMat buf; - equalizeHist(src, dst, hist, buf, stream); -} - void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s) { - using namespace ::cv::gpu::device::hist; - CV_Assert(src.type() == CV_8UC1); dst.create(src.size(), src.type()); @@ -1045,15 +1019,12 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& int intBufSize; nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) ); - int bufSize = static_cast(std::max(256 * 240 * sizeof(int), intBufSize + 256 * sizeof(int))); + ensureSizeIsEnough(1, intBufSize + 256 * sizeof(int), CV_8UC1, buf); - ensureSizeIsEnough(1, bufSize, CV_8UC1, buf); - - GpuMat histBuf(1, 256 * 240, CV_32SC1, buf.ptr()); GpuMat intBuf(1, intBufSize, CV_8UC1, buf.ptr()); GpuMat lut(1, 256, CV_32S, buf.ptr() + intBufSize); - calcHist(src, hist, histBuf, s); + calcHist(src, hist, s); cudaStream_t stream = StreamAccessor::getStream(s); @@ -1061,10 +1032,7 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& nppSafeCall( nppsIntegral_32s(hist.ptr(), lut.ptr(), 256, intBuf.ptr()) ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - - equalizeHist_gpu(src, dst, lut.ptr(), stream); + hist::equalizeHist(src, dst, lut.ptr(), stream); } 
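
For reference, the pipeline above — calcHist, then nppsIntegral_32s to turn the histogram into a cumulative LUT, then hist::equalizeHist to remap every pixel as __float2int_rn(scale * lut[v]) with scale = 255/(cols*rows) — amounts to the following plain CPU sketch. It is illustrative only (equalizeHistReference is a hypothetical name, not the cv::equalizeHist implementation), with the prefix scan written out where the GPU path offloads it to NPP:

    #include <opencv2/core/core.hpp>

    cv::Mat equalizeHistReference(const cv::Mat& src)    // src: CV_8UC1
    {
        int hist[256] = {0};
        for (int y = 0; y < src.rows; ++y)
            for (int x = 0; x < src.cols; ++x)
                ++hist[src.at<uchar>(y, x)];

        int lut[256];
        for (int i = 0, sum = 0; i < 256; ++i)
        {
            sum += hist[i];                              // running sum of the histogram,
            lut[i] = sum;                                // the scan nppsIntegral_32s provides
        }

        const float scale = 255.0f / (src.cols * src.rows);
        cv::Mat dst(src.size(), CV_8UC1);
        for (int y = 0; y < src.rows; ++y)
            for (int x = 0; x < src.cols; ++x)           // per-pixel remap, as the
                dst.at<uchar>(y, x) =                    // EqualizeHist functor does
                    (uchar)cvRound(scale * lut[src.at<uchar>(y, x)]);
        return dst;
    }

Note also the allocation strategy in the GPU version: the NPP scratch area and the 256-entry LUT are both carved out of the single `buf` allocation, so repeated calls perform no per-frame allocation once the buffers exist.
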
//////////////////////////////////////////////////////////////////////// From 0e339dd13741a52e29cf65ee4e155e114218af4b Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 13:19:48 +0400 Subject: [PATCH 083/155] hog --- modules/gpu/src/cuda/hog.cu | 179 ++++++++++++------------------------ 1 file changed, 60 insertions(+), 119 deletions(-) diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu index 953fdec1d..6a7e927d1 100644 --- a/modules/gpu/src/cuda/hog.cu +++ b/modules/gpu/src/cuda/hog.cu @@ -42,7 +42,10 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/reduce.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/warp_shuffle.hpp" namespace cv { namespace gpu { namespace device { @@ -226,29 +229,30 @@ namespace cv { namespace gpu { namespace device template - __device__ float reduce_smem(volatile float* smem) + __device__ float reduce_smem(float* smem, float val) { unsigned int tid = threadIdx.x; - float sum = smem[tid]; + float sum = val; - if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); } - if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); } - if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); } + reduce(smem, sum, tid, plus()); - if (tid < 32) + if (size == 32) { - if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; - if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; - if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; - if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; - if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; - if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; + #if __CUDA_ARCH__ >= 300 + return shfl(sum, 0); + #else + return smem[0]; + #endif } - __syncthreads(); - sum = smem[0]; + #if __CUDA_ARCH__ >= 300 + if (threadIdx.x == 0) + smem[0] = sum; + #endif - return sum; + __syncthreads(); + + return smem[0]; } @@ -272,19 +276,13 @@ namespace cv { namespace gpu { namespace device if (threadIdx.x < block_hist_size) elem = hist[0]; - squares[threadIdx.x] = elem * elem; - - __syncthreads(); - float sum = reduce_smem(squares); + float sum = reduce_smem(squares, elem * elem); float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size); elem = ::min(elem * scale, threshold); - __syncthreads(); - squares[threadIdx.x] = elem * elem; + sum = reduce_smem(squares, elem * elem); - __syncthreads(); - sum = reduce_smem(squares); scale = 1.0f / (::sqrtf(sum) + 1e-3f); if (threadIdx.x < block_hist_size) @@ -330,65 +328,36 @@ namespace cv { namespace gpu { namespace device // return confidence values not just positive location template // Number of histogram block processed by single GPU thread block + int nblocks> // Number of histogram block processed by single GPU thread block __global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, const float* block_hists, const float* coefs, float free_coef, float threshold, float* confidences) { - const int win_x = threadIdx.z; - if (blockIdx.x * blockDim.z + win_x >= img_win_width) - return; + const int win_x = threadIdx.z; + if (blockIdx.x * blockDim.z + win_x >= img_win_width) + return; - const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + - blockIdx.x * win_block_stride_x * blockDim.z + win_x) * - 
cblock_hist_size; + const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + + blockIdx.x * win_block_stride_x * blockDim.z + win_x) * + cblock_hist_size; - float product = 0.f; - for (int i = threadIdx.x; i < cdescr_size; i += nthreads) - { - int offset_y = i / cdescr_width; - int offset_x = i - offset_y * cdescr_width; - product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x]; - } + float product = 0.f; + for (int i = threadIdx.x; i < cdescr_size; i += nthreads) + { + int offset_y = i / cdescr_width; + int offset_x = i - offset_y * cdescr_width; + product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x]; + } - __shared__ float products[nthreads * nblocks]; + __shared__ float products[nthreads * nblocks]; - const int tid = threadIdx.z * nthreads + threadIdx.x; - products[tid] = product; + const int tid = threadIdx.z * nthreads + threadIdx.x; - __syncthreads(); + reduce(products, product, tid, plus()); - if (nthreads >= 512) - { - if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256]; - __syncthreads(); - } - if (nthreads >= 256) - { - if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128]; - __syncthreads(); - } - if (nthreads >= 128) - { - if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64]; - __syncthreads(); - } - - if (threadIdx.x < 32) - { - volatile float* smem = products; - if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32]; - if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16]; - if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8]; - if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4]; - if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2]; - if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1]; - } - - if (threadIdx.x == 0) - confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] - = (float)(product + free_coef); + if (threadIdx.x == 0) + confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef; } @@ -396,32 +365,32 @@ namespace cv { namespace gpu { namespace device int win_stride_y, int win_stride_x, int height, int width, float* block_hists, float* coefs, float free_coef, float threshold, float *confidences) { - const int nthreads = 256; - const int nblocks = 1; + const int nthreads = 256; + const int nblocks = 1; - int win_block_stride_x = win_stride_x / block_stride_x; - int win_block_stride_y = win_stride_y / block_stride_y; - int img_win_width = (width - win_width + win_stride_x) / win_stride_x; - int img_win_height = (height - win_height + win_stride_y) / win_stride_y; + int win_block_stride_x = win_stride_x / block_stride_x; + int win_block_stride_y = win_stride_y / block_stride_y; + int img_win_width = (width - win_width + win_stride_x) / win_stride_x; + int img_win_height = (height - win_height + win_stride_y) / win_stride_y; - dim3 threads(nthreads, 1, nblocks); - dim3 grid(divUp(img_win_width, nblocks), img_win_height); + dim3 threads(nthreads, 1, nblocks); + dim3 grid(divUp(img_win_width, nblocks), img_win_height); - cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks, - cudaFuncCachePreferL1)); + cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks, + cudaFuncCachePreferL1)); - int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / - block_stride_x; - 
compute_confidence_hists_kernel_many_blocks<<>>( - img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, - block_hists, coefs, free_coef, threshold, confidences); - cudaSafeCall(cudaThreadSynchronize()); + int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / + block_stride_x; + compute_confidence_hists_kernel_many_blocks<<>>( + img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, + block_hists, coefs, free_coef, threshold, confidences); + cudaSafeCall(cudaThreadSynchronize()); } template // Number of histogram block processed by single GPU thread block + int nblocks> // Number of histogram block processed by single GPU thread block __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, const float* block_hists, const float* coefs, @@ -446,36 +415,8 @@ namespace cv { namespace gpu { namespace device __shared__ float products[nthreads * nblocks]; const int tid = threadIdx.z * nthreads + threadIdx.x; - products[tid] = product; - __syncthreads(); - - if (nthreads >= 512) - { - if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256]; - __syncthreads(); - } - if (nthreads >= 256) - { - if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128]; - __syncthreads(); - } - if (nthreads >= 128) - { - if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64]; - __syncthreads(); - } - - if (threadIdx.x < 32) - { - volatile float* smem = products; - if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32]; - if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16]; - if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8]; - if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4]; - if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2]; - if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1]; - } + reduce(products, product, tid, plus()); if (threadIdx.x == 0) labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold); @@ -868,4 +809,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ From 1b571bde10183b8270e8b46ab0d296ae55a6d39e Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 14:08:46 +0400 Subject: [PATCH 084/155] StereoConstantSpaceBP --- modules/gpu/src/cuda/stereocsbp.cu | 44 ++++++------------------------ 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/modules/gpu/src/cuda/stereocsbp.cu b/modules/gpu/src/cuda/stereocsbp.cu index 1c95ed9e1..7b76f478b 100644 --- a/modules/gpu/src/cuda/stereocsbp.cu +++ b/modules/gpu/src/cuda/stereocsbp.cu @@ -42,9 +42,11 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/reduce.hpp" +#include "opencv2/gpu/device/functional.hpp" namespace cv { namespace gpu { namespace device { @@ -297,28 +299,13 @@ namespace cv { namespace gpu { namespace device } extern __shared__ float smem[]; - float* dline = smem + winsz * threadIdx.z; - dline[tid] = val; - - __syncthreads(); - - if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); } - if (winsz >= 128) { if (tid 
-
-        volatile float* vdline = smem + winsz * threadIdx.z;
-
-        if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
-        if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
-        if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
-        if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
-        if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
-        if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
+        reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

         T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;

         if (tid == 0)
-            data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
+            data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
     }
 }

@@ -496,26 +483,11 @@ namespace cv { namespace gpu { namespace device
         }

         extern __shared__ float smem[];

-        float* dline = smem + winsz * threadIdx.z;
-        dline[tid] = val;
-
-        __syncthreads();
-
-        if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
-        if (winsz >= 128) { if (tid < 64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
-
-        volatile float* vdline = smem + winsz * threadIdx.z;
-
-        if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
-        if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
-        if (winsz >= 16) if (tid < 8) vdline[tid] += vdline[tid + 8];
-        if (winsz >= 8) if (tid < 4) vdline[tid] += vdline[tid + 4];
-        if (winsz >= 4) if (tid < 2) vdline[tid] += vdline[tid + 2];
-        if (winsz >= 2) if (tid < 1) vdline[tid] += vdline[tid + 1];
+        reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());

         if (tid == 0)
-            data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
+            data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
     }
 }

@@ -889,4 +861,4 @@ namespace cv { namespace gpu { namespace device
 } // namespace stereocsbp
 }}} // namespace cv { namespace gpu { namespace device {

-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */

From 1f1e24be3c475054f168bf2f776bd597e79d4ca4 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 12 Nov 2012 14:12:27 +0400
Subject: [PATCH 085/155] PyrLKOpticalFlow

---
 modules/gpu/src/cuda/pyrlk.cu | 968 +++++++++++++++-------------
 modules/gpu/src/pyrlk.cpp    |  35 +-
 2 files changed, 440 insertions(+), 563 deletions(-)

diff --git a/modules/gpu/src/cuda/pyrlk.cu b/modules/gpu/src/cuda/pyrlk.cu
index 811c3b90b..c0f54bd33 100644
--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
@@ -52,244 +52,187 @@

 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/reduce.hpp"

-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
 {
-    namespace pyrlk
+    __constant__ int c_winSize_x;
+    __constant__ int c_winSize_y;
+    __constant__ int c_halfWin_x;
+    __constant__ int c_halfWin_y;
+    __constant__ int c_iters;
+
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
+    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
+    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
+    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
+
+    template <int cn> struct Tex_I;
+    template <> struct Tex_I<1>
     {
-        __constant__ int c_winSize_x;
-        __constant__ int c_winSize_y;
-
-        __constant__ int c_halfWin_x;
-        __constant__ int c_halfWin_y;
-
-        __constant__ int c_iters;
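The removals above and below strip pyrlk's private copy of the shared-memory tree reduction; this patch, like the rest of the series, calls the generic reduce() primitive from opencv2/gpu/device/reduce.hpp instead. A minimal sketch of the single-value form, assuming a power-of-two block size; the kernel name and launch shape are illustrative, not code from this series:

    #include "opencv2/gpu/device/common.hpp"
    #include "opencv2/gpu/device/reduce.hpp"
    #include "opencv2/gpu/device/functional.hpp"

    using namespace cv::gpu::device;

    template <int BLOCK_SIZE>
    __global__ void row_sum_sketch(const PtrStepSzf src, float* dst)
    {
        __shared__ float smem[BLOCK_SIZE];

        const int y = blockIdx.x;
        const int tid = threadIdx.x;

        // each thread accumulates a private partial sum over one row
        float val = 0.0f;
        for (int x = tid; x < src.cols; x += BLOCK_SIZE)
            val += src(y, x);

        // tree reduction in shared memory; thread 0 is left holding the row total
        reduce<BLOCK_SIZE>(smem, val, tid, plus<float>());

        if (tid == 0)
            dst[y] = val;
    }

The tuple overload used in the hunks below (smem_tuple together with thrust::tie and a thrust::make_tuple of per-value operators) pushes several accumulators through the same tree in lockstep, which is what lets each kernel drop its duplicated unrolled loops.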
- - void loadConstants(int2 winSize, int iters) + static __device__ __forceinline__ float read(float x, float y) { - cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) ); + return tex2D(tex_If, x, y); + } + }; + template <> struct Tex_I<4> + { + static __device__ __forceinline__ float4 read(float x, float y) + { + return tex2D(tex_If4, x, y); + } + }; - int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); - cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) ); + template struct Tex_J; + template <> struct Tex_J<1> + { + static __device__ __forceinline__ float read(float x, float y) + { + return tex2D(tex_Jf, x, y); + } + }; + template <> struct Tex_J<4> + { + static __device__ __forceinline__ float4 read(float x, float y) + { + return tex2D(tex_Jf4, x, y); + } + }; - cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) ); + __device__ __forceinline__ void accum(float& dst, float val) + { + dst += val; + } + __device__ __forceinline__ void accum(float& dst, const float4& val) + { + dst += val.x + val.y + val.z; + } + + __device__ __forceinline__ float abs_(float a) + { + return ::fabsf(a); + } + __device__ __forceinline__ float4 abs_(const float4& a) + { + return abs(a); + } + + template + __global__ void sparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols) + { + #if __CUDA_ARCH__ <= 110 + const int BLOCK_SIZE = 128; + #else + const int BLOCK_SIZE = 256; + #endif + + __shared__ float smem1[BLOCK_SIZE]; + __shared__ float smem2[BLOCK_SIZE]; + __shared__ float smem3[BLOCK_SIZE]; + + const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x; + + float2 prevPt = prevPts[blockIdx.x]; + prevPt.x *= (1.0f / (1 << level)); + prevPt.y *= (1.0f / (1 << level)); + + if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) + { + if (tid == 0 && level == 0) + status[blockIdx.x] = 0; + + return; } - __device__ void reduce(float& val1, float& val2, float& val3, float* smem1, float* smem2, float* smem3, int tid) + prevPt.x -= c_halfWin_x; + prevPt.y -= c_halfWin_y; + + // extract the patch from the first image, compute covariation matrix of derivatives + + float A11 = 0; + float A12 = 0; + float A22 = 0; + + typedef typename TypeVec::vec_type work_type; + + work_type I_patch [PATCH_Y][PATCH_X]; + work_type dIdx_patch[PATCH_Y][PATCH_X]; + work_type dIdy_patch[PATCH_Y][PATCH_X]; + + for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i) { - smem1[tid] = val1; - smem2[tid] = val2; - smem3[tid] = val3; - __syncthreads(); - -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110) - if (tid < 128) + for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j) { - smem1[tid] = val1 += smem1[tid + 128]; - smem2[tid] = val2 += smem2[tid + 128]; - smem3[tid] = val3 += smem3[tid + 128]; - } - __syncthreads(); -#endif + float x = prevPt.x + xBase + 0.5f; + float y = prevPt.y + yBase + 0.5f; - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - smem2[tid] = val2 += smem2[tid + 64]; - smem3[tid] = val3 += smem3[tid + 64]; - } - __syncthreads(); + I_patch[i][j] = Tex_I::read(x, y); - if (tid < 32) - { - volatile float* vmem1 = smem1; - volatile float* vmem2 = smem2; - volatile float* vmem3 = smem3; + // Sharr Deriv - vmem1[tid] = val1 += vmem1[tid + 32]; - 
vmem2[tid] = val2 += vmem2[tid + 32]; - vmem3[tid] = val3 += vmem3[tid + 32]; + work_type dIdx = 3.0f * Tex_I::read(x+1, y-1) + 10.0f * Tex_I::read(x+1, y) + 3.0f * Tex_I::read(x+1, y+1) - + (3.0f * Tex_I::read(x-1, y-1) + 10.0f * Tex_I::read(x-1, y) + 3.0f * Tex_I::read(x-1, y+1)); - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem2[tid] = val2 += vmem2[tid + 16]; - vmem3[tid] = val3 += vmem3[tid + 16]; + work_type dIdy = 3.0f * Tex_I::read(x-1, y+1) + 10.0f * Tex_I::read(x, y+1) + 3.0f * Tex_I::read(x+1, y+1) - + (3.0f * Tex_I::read(x-1, y-1) + 10.0f * Tex_I::read(x, y-1) + 3.0f * Tex_I::read(x+1, y-1)); - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem2[tid] = val2 += vmem2[tid + 8]; - vmem3[tid] = val3 += vmem3[tid + 8]; + dIdx_patch[i][j] = dIdx; + dIdy_patch[i][j] = dIdy; - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem2[tid] = val2 += vmem2[tid + 4]; - vmem3[tid] = val3 += vmem3[tid + 4]; - - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem2[tid] = val2 += vmem2[tid + 2]; - vmem3[tid] = val3 += vmem3[tid + 2]; - - vmem1[tid] = val1 += vmem1[tid + 1]; - vmem2[tid] = val2 += vmem2[tid + 1]; - vmem3[tid] = val3 += vmem3[tid + 1]; + accum(A11, dIdx * dIdx); + accum(A12, dIdx * dIdy); + accum(A22, dIdy * dIdy); } } - __device__ void reduce(float& val1, float& val2, float* smem1, float* smem2, int tid) + reduce(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus(), plus(), plus())); + + #if __CUDA_ARCH__ >= 300 + if (tid == 0) { - smem1[tid] = val1; - smem2[tid] = val2; - __syncthreads(); + smem1[0] = A11; + smem2[0] = A12; + smem3[0] = A22; + } + #endif -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110) - if (tid < 128) - { - smem1[tid] = val1 += smem1[tid + 128]; - smem2[tid] = val2 += smem2[tid + 128]; - } - __syncthreads(); -#endif + __syncthreads(); - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - smem2[tid] = val2 += smem2[tid + 64]; - } - __syncthreads(); + A11 = smem1[0]; + A12 = smem2[0]; + A22 = smem3[0]; - if (tid < 32) - { - volatile float* vmem1 = smem1; - volatile float* vmem2 = smem2; + float D = A11 * A22 - A12 * A12; - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem2[tid] = val2 += vmem2[tid + 32]; + if (D < numeric_limits::epsilon()) + { + if (tid == 0 && level == 0) + status[blockIdx.x] = 0; - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem2[tid] = val2 += vmem2[tid + 16]; - - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem2[tid] = val2 += vmem2[tid + 8]; - - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem2[tid] = val2 += vmem2[tid + 4]; - - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem2[tid] = val2 += vmem2[tid + 2]; - - vmem1[tid] = val1 += vmem1[tid + 1]; - vmem2[tid] = val2 += vmem2[tid + 1]; - } + return; } - __device__ void reduce(float& val1, float* smem1, int tid) + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt = nextPts[blockIdx.x]; + nextPt.x *= 2.f; + nextPt.y *= 2.f; + + nextPt.x -= c_halfWin_x; + nextPt.y -= c_halfWin_y; + + for (int k = 0; k < c_iters; ++k) { - smem1[tid] = val1; - __syncthreads(); - -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110) - if (tid < 128) - { - smem1[tid] = val1 += smem1[tid + 128]; - } - __syncthreads(); -#endif - - if (tid < 64) - { - smem1[tid] = val1 += smem1[tid + 64]; - } - __syncthreads(); - - if (tid < 32) - { - volatile float* vmem1 = smem1; - - vmem1[tid] = val1 += vmem1[tid + 32]; - vmem1[tid] = val1 += vmem1[tid + 16]; - vmem1[tid] = val1 += vmem1[tid + 8]; - vmem1[tid] = val1 += vmem1[tid + 4]; - vmem1[tid] = val1 += vmem1[tid + 2]; - vmem1[tid] = val1 += vmem1[tid + 
1]; - } - } - - texture tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp); - texture tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp); - texture tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp); - - texture tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp); - texture tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp); - - template struct Tex_I; - template <> struct Tex_I<1> - { - static __device__ __forceinline__ float read(float x, float y) - { - return tex2D(tex_If, x, y); - } - }; - template <> struct Tex_I<4> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_If4, x, y); - } - }; - - template struct Tex_J; - template <> struct Tex_J<1> - { - static __device__ __forceinline__ float read(float x, float y) - { - return tex2D(tex_Jf, x, y); - } - }; - template <> struct Tex_J<4> - { - static __device__ __forceinline__ float4 read(float x, float y) - { - return tex2D(tex_Jf4, x, y); - } - }; - - __device__ __forceinline__ void accum(float& dst, float val) - { - dst += val; - } - __device__ __forceinline__ void accum(float& dst, const float4& val) - { - dst += val.x + val.y + val.z; - } - - __device__ __forceinline__ float abs_(float a) - { - return ::fabs(a); - } - __device__ __forceinline__ float4 abs_(const float4& a) - { - return abs(a); - } - - template - __global__ void lkSparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols) - { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 110) - __shared__ float smem1[128]; - __shared__ float smem2[128]; - __shared__ float smem3[128]; -#else - __shared__ float smem1[256]; - __shared__ float smem2[256]; - __shared__ float smem3[256]; -#endif - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - float2 prevPt = prevPts[blockIdx.x]; - prevPt.x *= (1.0f / (1 << level)); - prevPt.y *= (1.0f / (1 << level)); - - if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) + if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows) { if (tid == 0 && level == 0) status[blockIdx.x] = 0; @@ -297,279 +240,183 @@ namespace cv { namespace gpu { namespace device return; } - prevPt.x -= c_halfWin_x; - prevPt.y -= c_halfWin_y; + float b1 = 0; + float b2 = 0; - // extract the patch from the first image, compute covariation matrix of derivatives - - float A11 = 0; - float A12 = 0; - float A22 = 0; - - typedef typename TypeVec::vec_type work_type; - - work_type I_patch [PATCH_Y][PATCH_X]; - work_type dIdx_patch[PATCH_Y][PATCH_X]; - work_type dIdy_patch[PATCH_Y][PATCH_X]; - - for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i) + for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i) { - for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j) + for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) { - float x = prevPt.x + xBase + 0.5f; - float y = prevPt.y + yBase + 0.5f; + work_type I_val = I_patch[i][j]; + work_type J_val = Tex_J::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f); - I_patch[i][j] = Tex_I::read(x, y); + work_type diff = (J_val - I_val) * 32.0f; - // Sharr Deriv - - work_type dIdx = 3.0f * Tex_I::read(x+1, y-1) + 10.0f * Tex_I::read(x+1, y) + 3.0f * Tex_I::read(x+1, y+1) - - (3.0f * Tex_I::read(x-1, y-1) + 10.0f * Tex_I::read(x-1, y) + 3.0f * Tex_I::read(x-1, y+1)); - - work_type dIdy = 3.0f * Tex_I::read(x-1, y+1) + 10.0f * Tex_I::read(x, y+1) + 3.0f 
* Tex_I::read(x+1, y+1) - - (3.0f * Tex_I::read(x-1, y-1) + 10.0f * Tex_I::read(x, y-1) + 3.0f * Tex_I::read(x+1, y-1)); - - dIdx_patch[i][j] = dIdx; - dIdy_patch[i][j] = dIdy; - - accum(A11, dIdx * dIdx); - accum(A12, dIdx * dIdy); - accum(A22, dIdy * dIdy); + accum(b1, diff * dIdx_patch[i][j]); + accum(b2, diff * dIdy_patch[i][j]); } } - reduce(A11, A12, A22, smem1, smem2, smem3, tid); - __syncthreads(); - - A11 = smem1[0]; - A12 = smem2[0]; - A22 = smem3[0]; - - float D = A11 * A22 - A12 * A12; - - if (D < numeric_limits::epsilon()) - { - if (tid == 0 && level == 0) - status[blockIdx.x] = 0; - - return; - } - - D = 1.f / D; - - A11 *= D; - A12 *= D; - A22 *= D; - - float2 nextPt = nextPts[blockIdx.x]; - nextPt.x *= 2.f; - nextPt.y *= 2.f; - - nextPt.x -= c_halfWin_x; - nextPt.y -= c_halfWin_y; - - for (int k = 0; k < c_iters; ++k) - { - if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows) - { - if (tid == 0 && level == 0) - status[blockIdx.x] = 0; - - return; - } - - float b1 = 0; - float b2 = 0; - - for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i) - { - for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) - { - work_type I_val = I_patch[i][j]; - work_type J_val = Tex_J::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f); - - work_type diff = (J_val - I_val) * 32.0f; - - accum(b1, diff * dIdx_patch[i][j]); - accum(b2, diff * dIdy_patch[i][j]); - } - } - - reduce(b1, b2, smem1, smem2, tid); - __syncthreads(); - - b1 = smem1[0]; - b2 = smem2[0]; - - float2 delta; - delta.x = A12 * b2 - A22 * b1; - delta.y = A12 * b1 - A11 * b2; - - nextPt.x += delta.x; - nextPt.y += delta.y; - - if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f) - break; - } - - float errval = 0; - if (calcErr) - { - for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i) - { - for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) - { - work_type I_val = I_patch[i][j]; - work_type J_val = Tex_J::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f); - - work_type diff = J_val - I_val; - - accum(errval, abs_(diff)); - } - } - - reduce(errval, smem1, tid); - } + reduce(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus(), plus())); + #if __CUDA_ARCH__ >= 300 if (tid == 0) { - nextPt.x += c_halfWin_x; - nextPt.y += c_halfWin_y; - - nextPts[blockIdx.x] = nextPt; - - if (calcErr) - err[blockIdx.x] = static_cast(errval) / (cn * c_winSize_x * c_winSize_y); - } - } - - template - void lkSparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, - int level, dim3 block, cudaStream_t stream) - { - dim3 grid(ptcount); - - if (level == 0 && err) - lkSparse<<>>(prevPts, nextPts, status, err, level, rows, cols); - else - lkSparse<<>>(prevPts, nextPts, status, err, level, rows, cols); - - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, - int level, dim3 block, dim3 patch, cudaStream_t stream) - { - typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, - int level, dim3 block, cudaStream_t stream); - - static const func_t funcs[5][5] = - { - {lkSparse_caller<1, 1, 1>, lkSparse_caller<1, 2, 1>, lkSparse_caller<1, 3, 1>, lkSparse_caller<1, 4, 1>, lkSparse_caller<1, 5, 1>}, - {lkSparse_caller<1, 1, 
2>, lkSparse_caller<1, 2, 2>, lkSparse_caller<1, 3, 2>, lkSparse_caller<1, 4, 2>, lkSparse_caller<1, 5, 2>}, - {lkSparse_caller<1, 1, 3>, lkSparse_caller<1, 2, 3>, lkSparse_caller<1, 3, 3>, lkSparse_caller<1, 4, 3>, lkSparse_caller<1, 5, 3>}, - {lkSparse_caller<1, 1, 4>, lkSparse_caller<1, 2, 4>, lkSparse_caller<1, 3, 4>, lkSparse_caller<1, 4, 4>, lkSparse_caller<1, 5, 4>}, - {lkSparse_caller<1, 1, 5>, lkSparse_caller<1, 2, 5>, lkSparse_caller<1, 3, 5>, lkSparse_caller<1, 4, 5>, lkSparse_caller<1, 5, 5>} - }; - - bindTexture(&tex_If, I); - bindTexture(&tex_Jf, J); - - funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount, - level, block, stream); - } - - void lkSparse4_gpu(PtrStepSz I, PtrStepSz J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, - int level, dim3 block, dim3 patch, cudaStream_t stream) - { - typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, - int level, dim3 block, cudaStream_t stream); - - static const func_t funcs[5][5] = - { - {lkSparse_caller<4, 1, 1>, lkSparse_caller<4, 2, 1>, lkSparse_caller<4, 3, 1>, lkSparse_caller<4, 4, 1>, lkSparse_caller<4, 5, 1>}, - {lkSparse_caller<4, 1, 2>, lkSparse_caller<4, 2, 2>, lkSparse_caller<4, 3, 2>, lkSparse_caller<4, 4, 2>, lkSparse_caller<4, 5, 2>}, - {lkSparse_caller<4, 1, 3>, lkSparse_caller<4, 2, 3>, lkSparse_caller<4, 3, 3>, lkSparse_caller<4, 4, 3>, lkSparse_caller<4, 5, 3>}, - {lkSparse_caller<4, 1, 4>, lkSparse_caller<4, 2, 4>, lkSparse_caller<4, 3, 4>, lkSparse_caller<4, 4, 4>, lkSparse_caller<4, 5, 4>}, - {lkSparse_caller<4, 1, 5>, lkSparse_caller<4, 2, 5>, lkSparse_caller<4, 3, 5>, lkSparse_caller<4, 4, 5>, lkSparse_caller<4, 5, 5>} - }; - - bindTexture(&tex_If4, I); - bindTexture(&tex_Jf4, J); - - funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount, - level, block, stream); - } - - template - __global__ void lkDense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols) - { - extern __shared__ int smem[]; - - const int patchWidth = blockDim.x + 2 * c_halfWin_x; - const int patchHeight = blockDim.y + 2 * c_halfWin_y; - - int* I_patch = smem; - int* dIdx_patch = I_patch + patchWidth * patchHeight; - int* dIdy_patch = dIdx_patch + patchWidth * patchHeight; - - const int xBase = blockIdx.x * blockDim.x; - const int yBase = blockIdx.y * blockDim.y; - - for (int i = threadIdx.y; i < patchHeight; i += blockDim.y) - { - for (int j = threadIdx.x; j < patchWidth; j += blockDim.x) - { - float x = xBase - c_halfWin_x + j + 0.5f; - float y = yBase - c_halfWin_y + i + 0.5f; - - I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y); - - // Sharr Deriv - - dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) - - (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1)); - - dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) - - (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1)); - } + smem1[0] = b1; + smem2[0] = b2; } + #endif __syncthreads(); - const int x = xBase + threadIdx.x; - const int y = yBase + threadIdx.y; + b1 = smem1[0]; + b2 = smem2[0]; - if (x >= cols || y >= rows) - return; + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; - int A11i = 0; - int A12i = 0; - int A22i 
= 0; + nextPt.x += delta.x; + nextPt.y += delta.y; - for (int i = 0; i < c_winSize_y; ++i) + if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f) + break; + } + + float errval = 0; + if (calcErr) + { + for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i) { - for (int j = 0; j < c_winSize_x; ++j) + for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) { - int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; - int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + work_type I_val = I_patch[i][j]; + work_type J_val = Tex_J::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f); - A11i += dIdx * dIdx; - A12i += dIdx * dIdy; - A22i += dIdy * dIdy; + work_type diff = J_val - I_val; + + accum(errval, abs_(diff)); } } - float A11 = A11i; - float A12 = A12i; - float A22 = A22i; + reduce(smem1, errval, tid, plus()); + } - float D = A11 * A22 - A12 * A12; + if (tid == 0) + { + nextPt.x += c_halfWin_x; + nextPt.y += c_halfWin_y; - if (D < numeric_limits::epsilon()) + nextPts[blockIdx.x] = nextPt; + + if (calcErr) + err[blockIdx.x] = static_cast(errval) / (cn * c_winSize_x * c_winSize_y); + } + } + + template + void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + int level, dim3 block, cudaStream_t stream) + { + dim3 grid(ptcount); + + if (level == 0 && err) + sparse<<>>(prevPts, nextPts, status, err, level, rows, cols); + else + sparse<<>>(prevPts, nextPts, status, err, level, rows, cols); + + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + __global__ void dense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols) + { + extern __shared__ int smem[]; + + const int patchWidth = blockDim.x + 2 * c_halfWin_x; + const int patchHeight = blockDim.y + 2 * c_halfWin_y; + + int* I_patch = smem; + int* dIdx_patch = I_patch + patchWidth * patchHeight; + int* dIdy_patch = dIdx_patch + patchWidth * patchHeight; + + const int xBase = blockIdx.x * blockDim.x; + const int yBase = blockIdx.y * blockDim.y; + + for (int i = threadIdx.y; i < patchHeight; i += blockDim.y) + { + for (int j = threadIdx.x; j < patchWidth; j += blockDim.x) + { + float x = xBase - c_halfWin_x + j + 0.5f; + float y = yBase - c_halfWin_y + i + 0.5f; + + I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y); + + // Sharr Deriv + + dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) - + (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1)); + + dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) - + (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1)); + } + } + + __syncthreads(); + + const int x = xBase + threadIdx.x; + const int y = yBase + threadIdx.y; + + if (x >= cols || y >= rows) + return; + + int A11i = 0; + int A12i = 0; + int A22i = 0; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + + A11i += dIdx * dIdx; + A12i += dIdx * dIdy; + A22i += dIdy * dIdy; + } + } + + float A11 = A11i; + float A12 = A12i; + float A22 = A22i; + + float D = A11 * A22 - A12 * A12; + + if (D < 
numeric_limits::epsilon()) + { + if (calcErr) + err(y, x) = numeric_limits::max(); + + return; + } + + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt; + nextPt.x = x + prevU(y/2, x/2) * 2.0f; + nextPt.y = y + prevV(y/2, x/2) * 2.0f; + + for (int k = 0; k < c_iters; ++k) + { + if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows) { if (calcErr) err(y, x) = numeric_limits::max(); @@ -577,108 +424,145 @@ namespace cv { namespace gpu { namespace device return; } - D = 1.f / D; + int b1 = 0; + int b2 = 0; - A11 *= D; - A12 *= D; - A22 *= D; - - float2 nextPt; - nextPt.x = x + prevU(y/2, x/2) * 2.0f; - nextPt.y = y + prevV(y/2, x/2) * 2.0f; - - for (int k = 0; k < c_iters; ++k) + for (int i = 0; i < c_winSize_y; ++i) { - if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows) + for (int j = 0; j < c_winSize_x; ++j) { - if (calcErr) - err(y, x) = numeric_limits::max(); + int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; + int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); - return; + int diff = (J - I) * 32; + + int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; + + b1 += diff * dIdx; + b2 += diff * dIdy; } - - int b1 = 0; - int b2 = 0; - - for (int i = 0; i < c_winSize_y; ++i) - { - for (int j = 0; j < c_winSize_x; ++j) - { - int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; - int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); - - int diff = (J - I) * 32; - - int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; - int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)]; - - b1 += diff * dIdx; - b2 += diff * dIdy; - } - } - - float2 delta; - delta.x = A12 * b2 - A22 * b1; - delta.y = A12 * b1 - A11 * b2; - - nextPt.x += delta.x; - nextPt.y += delta.y; - - if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f) - break; } - u(y, x) = nextPt.x - x; - v(y, x) = nextPt.y - y; + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; - if (calcErr) - { - int errval = 0; + nextPt.x += delta.x; + nextPt.y += delta.y; - for (int i = 0; i < c_winSize_y; ++i) - { - for (int j = 0; j < c_winSize_x; ++j) - { - int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; - int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); - - errval += ::abs(J - I); - } - } - - err(y, x) = static_cast(errval) / (c_winSize_x * c_winSize_y); - } + if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f) + break; } - void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, - PtrStepSzf err, int2 winSize, cudaStream_t stream) + u(y, x) = nextPt.x - x; + v(y, x) = nextPt.y - y; + + if (calcErr) { - dim3 block(16, 16); - dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); + int errval = 0; - bindTexture(&tex_Ib, I); - bindTexture(&tex_Jf, J); - - int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); - const int patchWidth = block.x + 2 * halfWin.x; - const int patchHeight = block.y + 2 * halfWin.y; - size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int); - - if (err.data) + for (int i = 0; i < c_winSize_y; ++i) { - lkDense<<>>(u, v, prevU, prevV, err, I.rows, I.cols); - cudaSafeCall( cudaGetLastError() ); - } - else - { - lkDense<<>>(u, v, prevU, prevV, PtrStepf(), 
I.rows, I.cols); - cudaSafeCall( cudaGetLastError() ); + for (int j = 0; j < c_winSize_x; ++j) + { + int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; + int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); + + errval += ::abs(J - I); + } } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + err(y, x) = static_cast(errval) / (c_winSize_x * c_winSize_y); } } -}}} +} + +namespace pyrlk +{ + void loadConstants(int2 winSize, int iters) + { + cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) ); + + int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); + cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) ); + + cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) ); + } + + void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + int level, dim3 block, dim3 patch, cudaStream_t stream) + { + typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + int level, dim3 block, cudaStream_t stream); + + static const func_t funcs[5][5] = + { + {::sparse_caller<1, 1, 1>, ::sparse_caller<1, 2, 1>, ::sparse_caller<1, 3, 1>, ::sparse_caller<1, 4, 1>, ::sparse_caller<1, 5, 1>}, + {::sparse_caller<1, 1, 2>, ::sparse_caller<1, 2, 2>, ::sparse_caller<1, 3, 2>, ::sparse_caller<1, 4, 2>, ::sparse_caller<1, 5, 2>}, + {::sparse_caller<1, 1, 3>, ::sparse_caller<1, 2, 3>, ::sparse_caller<1, 3, 3>, ::sparse_caller<1, 4, 3>, ::sparse_caller<1, 5, 3>}, + {::sparse_caller<1, 1, 4>, ::sparse_caller<1, 2, 4>, ::sparse_caller<1, 3, 4>, ::sparse_caller<1, 4, 4>, ::sparse_caller<1, 5, 4>}, + {::sparse_caller<1, 1, 5>, ::sparse_caller<1, 2, 5>, ::sparse_caller<1, 3, 5>, ::sparse_caller<1, 4, 5>, ::sparse_caller<1, 5, 5>} + }; + + bindTexture(&tex_If, I); + bindTexture(&tex_Jf, J); + + funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount, + level, block, stream); + } + + void sparse4(PtrStepSz I, PtrStepSz J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + int level, dim3 block, dim3 patch, cudaStream_t stream) + { + typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + int level, dim3 block, cudaStream_t stream); + + static const func_t funcs[5][5] = + { + {::sparse_caller<4, 1, 1>, ::sparse_caller<4, 2, 1>, ::sparse_caller<4, 3, 1>, ::sparse_caller<4, 4, 1>, ::sparse_caller<4, 5, 1>}, + {::sparse_caller<4, 1, 2>, ::sparse_caller<4, 2, 2>, ::sparse_caller<4, 3, 2>, ::sparse_caller<4, 4, 2>, ::sparse_caller<4, 5, 2>}, + {::sparse_caller<4, 1, 3>, ::sparse_caller<4, 2, 3>, ::sparse_caller<4, 3, 3>, ::sparse_caller<4, 4, 3>, ::sparse_caller<4, 5, 3>}, + {::sparse_caller<4, 1, 4>, ::sparse_caller<4, 2, 4>, ::sparse_caller<4, 3, 4>, ::sparse_caller<4, 4, 4>, ::sparse_caller<4, 5, 4>}, + {::sparse_caller<4, 1, 5>, ::sparse_caller<4, 2, 5>, ::sparse_caller<4, 3, 5>, ::sparse_caller<4, 4, 5>, ::sparse_caller<4, 5, 5>} + }; + + bindTexture(&tex_If4, I); + bindTexture(&tex_Jf4, J); + + funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount, + level, block, stream); + } + + void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, 
PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream) + { + dim3 block(16, 16); + dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); + + bindTexture(&tex_Ib, I); + bindTexture(&tex_Jf, J); + + int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); + const int patchWidth = block.x + 2 * halfWin.x; + const int patchHeight = block.y + 2 * halfWin.y; + size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int); + + if (err.data) + { + ::dense<<>>(u, v, prevU, prevV, err, I.rows, I.cols); + cudaSafeCall( cudaGetLastError() ); + } + else + { + ::dense<<>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols); + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} #endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/pyrlk.cpp b/modules/gpu/src/pyrlk.cpp index 47ab90415..593e37cc6 100644 --- a/modules/gpu/src/pyrlk.cpp +++ b/modules/gpu/src/pyrlk.cpp @@ -55,21 +55,18 @@ void cv::gpu::PyrLKOpticalFlow::releaseMemory() {} #else /* !defined (HAVE_CUDA) */ -namespace cv { namespace gpu { namespace device +namespace pyrlk { - namespace pyrlk - { - void loadConstants(int2 winSize, int iters); + void loadConstants(int2 winSize, int iters); - void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, - int level, dim3 block, dim3 patch, cudaStream_t stream = 0); - void lkSparse4_gpu(PtrStepSz I, PtrStepSz J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, - int level, dim3 block, dim3 patch, cudaStream_t stream = 0); + void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + int level, dim3 block, dim3 patch, cudaStream_t stream = 0); + void sparse4(PtrStepSz I, PtrStepSz J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, + int level, dim3 block, dim3 patch, cudaStream_t stream = 0); - void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, - PtrStepSzf err, int2 winSize, cudaStream_t stream = 0); - } -}}} + void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, + PtrStepSzf err, int2 winSize, cudaStream_t stream = 0); +} cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow() { @@ -104,8 +101,6 @@ namespace void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err) { - using namespace cv::gpu::device::pyrlk; - if (prevPts.empty()) { nextPts.release(); @@ -166,19 +161,19 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next pyrDown(nextPyr_[level - 1], nextPyr_[level]); } - loadConstants(make_int2(winSize.width, winSize.height), iters); + pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters); for (int level = maxLevel; level >= 0; level--) { if (cn == 1) { - lkSparse1_gpu(prevPyr_[level], nextPyr_[level], + pyrlk::sparse1(prevPyr_[level], nextPyr_[level], prevPts.ptr(), nextPts.ptr(), status.ptr(), level == 0 && err ? err->ptr() : 0, prevPts.cols, level, block, patch); } else { - lkSparse4_gpu(prevPyr_[level], nextPyr_[level], + pyrlk::sparse4(prevPyr_[level], nextPyr_[level], prevPts.ptr(), nextPts.ptr(), status.ptr(), level == 0 && err ? 
err->ptr() : 0, prevPts.cols, level, block, patch); } @@ -187,8 +182,6 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err) { - using namespace cv::gpu::device::pyrlk; - CV_Assert(prevImg.type() == CV_8UC1); CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); CV_Assert(maxLevel >= 0); @@ -219,7 +212,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI vPyr_[1].setTo(Scalar::all(0)); int2 winSize2i = make_int2(winSize.width, winSize.height); - loadConstants(winSize2i, iters); + pyrlk::loadConstants(winSize2i, iters); PtrStepSzf derr = err ? *err : PtrStepSzf(); @@ -229,7 +222,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI { int idx2 = (idx + 1) & 1; - lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2], + pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2], level == 0 ? derr : PtrStepSzf(), winSize2i); if (level > 0) From 7f97fb481cbdd3b2a432332afecb6ae7ca421d8a Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 14:14:48 +0400 Subject: [PATCH 086/155] FastNonLocalMeans --- modules/gpu/src/cuda/nlm.cu | 116 ++++++++++++++++++++++++++++-------- 1 file changed, 90 insertions(+), 26 deletions(-) diff --git a/modules/gpu/src/cuda/nlm.cu b/modules/gpu/src/cuda/nlm.cu index e267c733e..cd3f0b5c3 100644 --- a/modules/gpu/src/cuda/nlm.cu +++ b/modules/gpu/src/cuda/nlm.cu @@ -43,11 +43,11 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" - +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/vec_traits.hpp" #include "opencv2/gpu/device/vec_math.hpp" -#include "opencv2/gpu/device/block.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" using namespace cv::gpu; @@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device { namespace imgproc { + + template struct Unroll; + template <> struct Unroll<1> + { + template + static __device__ __forceinline__ thrust::tuple smem_tuple(float* smem) + { + return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE); + } + + static __device__ __forceinline__ thrust::tuple tie(float& val1, float& val2) + { + return thrust::tie(val1, val2); + } + + static __device__ __forceinline__ const thrust::tuple, plus > op() + { + plus op; + return thrust::make_tuple(op, op); + } + }; + template <> struct Unroll<2> + { + template + static __device__ __forceinline__ thrust::tuple smem_tuple(float* smem) + { + return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE); + } + + static __device__ __forceinline__ thrust::tuple tie(float& val1, float2& val2) + { + return thrust::tie(val1, val2.x, val2.y); + } + + static __device__ __forceinline__ const thrust::tuple, plus, plus > op() + { + plus op; + return thrust::make_tuple(op, op, op); + } + }; + template <> struct Unroll<3> + { + template + static __device__ __forceinline__ thrust::tuple smem_tuple(float* smem) + { + return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE); + } + + static __device__ __forceinline__ thrust::tuple tie(float& val1, float3& val2) + { + return thrust::tie(val1, val2.x, val2.y, val2.z); + } + + static __device__ __forceinline__ const 
thrust::tuple, plus, plus, plus > op() + { + plus op; + return thrust::make_tuple(op, op, op, op); + } + }; + template <> struct Unroll<4> + { + template + static __device__ __forceinline__ thrust::tuple smem_tuple(float* smem) + { + return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE); + } + + static __device__ __forceinline__ thrust::tuple tie(float& val1, float4& val2) + { + return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w); + } + + static __device__ __forceinline__ const thrust::tuple, plus, plus, plus, plus > op() + { + plus op; + return thrust::make_tuple(op, op, op, op, op); + } + }; + __device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); } __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); } __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); } @@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device sum = sum + weight * saturate_cast(src(sy + y, sx + x)); } - volatile __shared__ float cta_buffer[CTA_SIZE]; + __shared__ float cta_buffer[CTA_SIZE * (VecTraits::cn + 1)]; - int tid = threadIdx.x; + reduce(Unroll::cn>::template smem_tuple(cta_buffer), + Unroll::cn>::tie(weights_sum, sum), + threadIdx.x, + Unroll::cn>::op()); - cta_buffer[tid] = weights_sum; - __syncthreads(); - Block::reduce(cta_buffer, plus()); - weights_sum = cta_buffer[0]; - - __syncthreads(); - - - for(int n = 0; n < VecTraits::cn; ++n) - { - cta_buffer[tid] = reinterpret_cast(&sum)[n]; - __syncthreads(); - Block::reduce(cta_buffer, plus()); - reinterpret_cast(&sum)[n] = cta_buffer[0]; - - __syncthreads(); - } - - if (tid == 0) - dst = saturate_cast(sum/weights_sum); + if (threadIdx.x == 0) + dst = saturate_cast(sum / weights_sum); } __device__ __forceinline__ void operator()(PtrStepSz& dst) const @@ -503,4 +567,4 @@ namespace cv { namespace gpu { namespace device }}} -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ From 19c87d1c9d75b348bb1027afe6fc29cf7457a3c0 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 14:17:48 +0400 Subject: [PATCH 087/155] ORB --- modules/gpu/src/cuda/orb.cu | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/gpu/src/cuda/orb.cu b/modules/gpu/src/cuda/orb.cu index 91c570957..d66b3e9ec 100644 --- a/modules/gpu/src/cuda/orb.cu +++ b/modules/gpu/src/cuda/orb.cu @@ -50,7 +50,7 @@ #include #include "opencv2/gpu/device/common.hpp" -#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/functional.hpp" namespace cv { namespace gpu { namespace device @@ -75,9 +75,9 @@ namespace cv { namespace gpu { namespace device __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k) { - __shared__ int smem[8 * 32]; - - volatile int* srow = smem + threadIdx.y * blockDim.x; + __shared__ int smem0[8 * 32]; + __shared__ int smem1[8 * 32]; + __shared__ int smem2[8 * 32]; const int ptidx = blockIdx.x * blockDim.y + threadIdx.y; @@ -109,9 +109,12 @@ namespace cv { namespace gpu { namespace device c += Ix * Iy; } - reduce_old<32>(srow, a, threadIdx.x, plus()); - reduce_old<32>(srow, b, threadIdx.x, plus()); - reduce_old<32>(srow, c, threadIdx.x, 
plus<int>());

+            int* srow0 = smem0 + threadIdx.y * blockDim.x;
+            int* srow1 = smem1 + threadIdx.y * blockDim.x;
+            int* srow2 = smem2 + threadIdx.y * blockDim.x;
+
+            plus<int> op;
+            reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));

             if (threadIdx.x == 0)
             {
@@ -151,9 +154,13 @@ namespace cv { namespace gpu { namespace device
         __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
         {
-            __shared__ int smem[8 * 32];
+            __shared__ int smem0[8 * 32];
+            __shared__ int smem1[8 * 32];

-            volatile int* srow = smem + threadIdx.y * blockDim.x;
+            int* srow0 = smem0 + threadIdx.y * blockDim.x;
+            int* srow1 = smem1 + threadIdx.y * blockDim.x;
+
+            plus<int> op;

             const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;

@@ -167,7 +174,7 @@ namespace cv { namespace gpu { namespace device
                 for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
                     m_10 += u * image(loc.y, loc.x + u);

-                reduce_old<32>(srow, m_10, threadIdx.x, plus<int>());
+                reduce<32>(srow0, m_10, threadIdx.x, op);

                 for (int v = 1; v <= half_k; ++v)
                 {
@@ -185,8 +192,7 @@ namespace cv { namespace gpu { namespace device
                         m_sum += u * (val_plus + val_minus);
                     }

-                    reduce_old<32>(srow, v_sum, threadIdx.x, plus<int>());
-                    reduce_old<32>(srow, m_sum, threadIdx.x, plus<int>());
+                    reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));

                     m_10 += m_sum;
                     m_01 += v * v_sum;

From fbf3de43a27259959f5a6096fcc7f3bec34a5915 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 12 Nov 2012 14:20:43 +0400
Subject: [PATCH 088/155] SURF

---
 modules/gpu/src/cuda/surf.cu         | 382 +++++++++++----------------
 modules/gpu/src/surf.cpp             |   3 +-
 modules/gpu/test/test_features2d.cpp |   2 +-
 3 files changed, 156 insertions(+), 231 deletions(-)

diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 451fb425d..c12192555 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -47,13 +47,13 @@

 #if !defined CUDA_DISABLER

-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
 #include "opencv2/gpu/device/utility.hpp"
 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/filters.hpp"
-#include <float.h>

 namespace cv { namespace gpu { namespace device
 {
@@ -599,8 +599,9 @@ namespace cv { namespace gpu { namespace device
                 sumy += s_Y[threadIdx.x + 96];
             }

-            device::reduce_old<32>(s_sumx + threadIdx.y * 32, sumx, threadIdx.x, plus<float>());
-            device::reduce_old<32>(s_sumy + threadIdx.y * 32, sumy, threadIdx.x, plus<float>());
+            plus<float> op;
+            device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
+                               thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));

             const float temp_mod = sumx * sumx + sumy * sumy;
             if (temp_mod > best_mod)
             {
@@ -638,7 +639,7 @@ namespace cv { namespace gpu { namespace device
             kp_dir *= 180.0f / CV_PI_F;
             kp_dir = 360.0f - kp_dir;
-            if (::fabsf(kp_dir - 360.f) < FLT_EPSILON)
+            if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
                 kp_dir = 0.f;

             featureDir[blockIdx.x] = kp_dir;
@@ -697,11 +698,6 @@ namespace cv { namespace gpu { namespace device
         {
             typedef uchar elem_type;

-            __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
-                centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
-            {
-            }
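WinReader maps patch coordinates into the image through the keypoint's orientation, so the descriptor window is sampled in the keypoint's own rotated frame. A sketch of the transform its operator() applies, assuming win_offset = -(win_size - 1) / 2; the pixel_y line mirrors the pixel_x line kept in the hunk below and is reconstructed here only for illustration:

    // rotate the (row i, column j) patch offset into image space around the keypoint
    float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
    float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
    // the clamped point-mode texture fetch then reads tex_Ib at (pixel_x, pixel_y)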
- __device__ __forceinline__ uchar operator ()(int i, int j) const { float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; @@ -715,285 +711,215 @@ namespace cv { namespace gpu { namespace device float win_offset; float cos_dir; float sin_dir; + int width; + int height; }; - __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], - const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) + __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, + float& dx, float& dy) { - __shared__ float s_PATCH[6][6]; + __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1]; - const float centerX = featureX[blockIdx.x]; - const float centerY = featureY[blockIdx.x]; - const float size = featureSize[blockIdx.x]; - float descriptor_dir = 360.0f - featureDir[blockIdx.x]; - if (std::abs(descriptor_dir - 360.f) < FLT_EPSILON) - descriptor_dir = 0.f; - descriptor_dir *= (float)(CV_PI_F / 180.0f); + dx = dy = 0.0f; - /* The sampling intervals and wavelet sized for selecting an orientation - and building the keypoint descriptor are defined relative to 's' */ - const float s = size * 1.2f / 9.0f; + WinReader win; - /* Extract a window of pixels around the keypoint of size 20s */ + win.centerX = featureX[blockIdx.x]; + win.centerY = featureY[blockIdx.x]; + + // The sampling intervals and wavelet sized for selecting an orientation + // and building the keypoint descriptor are defined relative to 's' + const float s = featureSize[blockIdx.x] * 1.2f / 9.0f; + + // Extract a window of pixels around the keypoint of size 20s const int win_size = (int)((PATCH_SZ + 1) * s); - float sin_dir; - float cos_dir; - sincosf(descriptor_dir, &sin_dir, &cos_dir); + win.width = win.height = win_size; - /* Nearest neighbour version (faster) */ - const float win_offset = -(float)(win_size - 1) / 2; - - // Compute sampling points - // since grids are 2D, need to compute xBlock and yBlock indices - const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4 - const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4) - const int xIndex = xBlock * 5 + threadIdx.x; - const int yIndex = yBlock * 5 + threadIdx.y; - - const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size; - const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size; - - LinearFilter filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir)); - - s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo); - - __syncthreads(); - - if (threadIdx.x < 5 && threadIdx.y < 5) - { - const int tid = threadIdx.y * 5 + threadIdx.x; - - const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; - - const float vx = (s_PATCH[threadIdx.y ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x ]) * dw; - const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y ][threadIdx.x + 1]) * dw; - - s_dx_bin[tid] = vx; - s_dy_bin[tid] = vy; - } - } - - __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid) - { - // first step is to reduce from 25 to 16 - if (tid < 9) // use 9 threads - { - sdata1[tid] += sdata1[tid + 16]; - sdata2[tid] += sdata2[tid + 16]; - sdata3[tid] += sdata3[tid + 16]; - sdata4[tid] += sdata4[tid + 16]; - } - - // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp) 
- if (tid < 8) - { - sdata1[tid] += sdata1[tid + 8]; - sdata1[tid] += sdata1[tid + 4]; - sdata1[tid] += sdata1[tid + 2]; - sdata1[tid] += sdata1[tid + 1]; - - sdata2[tid] += sdata2[tid + 8]; - sdata2[tid] += sdata2[tid + 4]; - sdata2[tid] += sdata2[tid + 2]; - sdata2[tid] += sdata2[tid + 1]; - - sdata3[tid] += sdata3[tid + 8]; - sdata3[tid] += sdata3[tid + 4]; - sdata3[tid] += sdata3[tid + 2]; - sdata3[tid] += sdata3[tid + 1]; - - sdata4[tid] += sdata4[tid + 8]; - sdata4[tid] += sdata4[tid + 4]; - sdata4[tid] += sdata4[tid + 2]; - sdata4[tid] += sdata4[tid + 1]; - } - } - - __global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) - { - // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) - __shared__ float sdx[25]; - __shared__ float sdy[25]; - __shared__ float sdxabs[25]; - __shared__ float sdyabs[25]; - - calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir); - __syncthreads(); + // Nearest neighbour version (faster) + win.win_offset = -(win_size - 1.0f) / 2.0f; + float descriptor_dir = 360.0f - featureDir[blockIdx.x]; + if (::fabsf(descriptor_dir - 360.f) < numeric_limits::epsilon()) + descriptor_dir = 0.f; + descriptor_dir *= CV_PI_F / 180.0f; + sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir); const int tid = threadIdx.y * blockDim.x + threadIdx.x; - if (tid < 25) + const int xLoadInd = tid % (PATCH_SZ + 1); + const int yLoadInd = tid / (PATCH_SZ + 1); + + if (yLoadInd < (PATCH_SZ + 1)) { - sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array - sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array - __syncthreads(); - - reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); - __syncthreads(); - - float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2); - - // write dx, dy, |dx|, |dy| - if (tid == 0) + if (s > 1) { - descriptors_block[0] = sdx[0]; - descriptors_block[1] = sdy[0]; - descriptors_block[2] = sdxabs[0]; - descriptors_block[3] = sdyabs[0]; + AreaFilter filter(win, s, s); + s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd); } + else + { + LinearFilter filter(win); + s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s); + } + } + + __syncthreads(); + + const int xPatchInd = threadIdx.x % 5; + const int yPatchInd = threadIdx.x / 5; + + if (yPatchInd < 5) + { + const int xBlockInd = threadIdx.y % 4; + const int yBlockInd = threadIdx.y / 4; + + const int xInd = xBlockInd * 5 + xPatchInd; + const int yInd = yBlockInd * 5 + yPatchInd; + + const float dw = c_DW[yInd * PATCH_SZ + xInd]; + + dx = (s_PATCH[yInd ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd ]) * dw; + dy = (s_PATCH[yInd + 1][xInd ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd ][xInd + 1]) * dw; } } - __global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) + __global__ void compute_descriptors_64(PtrStep descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) { - // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) - __shared__ float sdx[25]; - __shared__ float sdy[25]; + __shared__ float smem[32 * 16]; - // sum (reduce) 5x5 area response - __shared__ float sd1[25]; - __shared__ float sd2[25]; - __shared__ float sdabs1[25]; - __shared__ float sdabs2[25]; + float* sRow = smem + threadIdx.y * 32; - calc_dx_dy(sdx, sdy, featureX, 
featureY, featureSize, featureDir); - __syncthreads(); + float dx, dy; + calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy); - const int tid = threadIdx.y * blockDim.x + threadIdx.x; + float dxabs = ::fabsf(dx); + float dyabs = ::fabsf(dy); - if (tid < 25) + plus op; + + reduce<32>(sRow, dx, threadIdx.x, op); + reduce<32>(sRow, dy, threadIdx.x, op); + reduce<32>(sRow, dxabs, threadIdx.x, op); + reduce<32>(sRow, dyabs, threadIdx.x, op); + + float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y; + + // write dx, dy, |dx|, |dy| + if (threadIdx.x == 0) + *descriptors_block = make_float4(dx, dy, dxabs, dyabs); + } + + __global__ void compute_descriptors_128(PtrStep descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) + { + __shared__ float smem[32 * 16]; + + float* sRow = smem + threadIdx.y * 32; + + float dx, dy; + calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy); + + float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2; + + plus op; + + float d1 = 0.0f; + float d2 = 0.0f; + float abs1 = 0.0f; + float abs2 = 0.0f; + + if (dy >= 0) { - if (sdy[tid] >= 0) - { - sd1[tid] = sdx[tid]; - sdabs1[tid] = ::fabs(sdx[tid]); - sd2[tid] = 0; - sdabs2[tid] = 0; - } - else - { - sd1[tid] = 0; - sdabs1[tid] = 0; - sd2[tid] = sdx[tid]; - sdabs2[tid] = ::fabs(sdx[tid]); - } - __syncthreads(); - - reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - __syncthreads(); - - float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3); - - // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) - if (tid == 0) - { - descriptors_block[0] = sd1[0]; - descriptors_block[1] = sdabs1[0]; - descriptors_block[2] = sd2[0]; - descriptors_block[3] = sdabs2[0]; - } - __syncthreads(); - - if (sdx[tid] >= 0) - { - sd1[tid] = sdy[tid]; - sdabs1[tid] = ::fabs(sdy[tid]); - sd2[tid] = 0; - sdabs2[tid] = 0; - } - else - { - sd1[tid] = 0; - sdabs1[tid] = 0; - sd2[tid] = sdy[tid]; - sdabs2[tid] = ::fabs(sdy[tid]); - } - __syncthreads(); - - reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid); - __syncthreads(); - - // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) - if (tid == 0) - { - descriptors_block[4] = sd1[0]; - descriptors_block[5] = sdabs1[0]; - descriptors_block[6] = sd2[0]; - descriptors_block[7] = sdabs2[0]; - } + d1 = dx; + abs1 = ::fabsf(dx); } + else + { + d2 = dx; + abs2 = ::fabsf(dx); + } + + reduce<32>(sRow, d1, threadIdx.x, op); + reduce<32>(sRow, d2, threadIdx.x, op); + reduce<32>(sRow, abs1, threadIdx.x, op); + reduce<32>(sRow, abs2, threadIdx.x, op); + + // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0) + if (threadIdx.x == 0) + descriptors_block[0] = make_float4(d1, abs1, d2, abs2); + + if (dx >= 0) + { + d1 = dy; + abs1 = ::fabsf(dy); + d2 = 0.0f; + abs2 = 0.0f; + } + else + { + d1 = 0.0f; + abs1 = 0.0f; + d2 = dy; + abs2 = ::fabsf(dy); + } + + reduce<32>(sRow, d1, threadIdx.x, op); + reduce<32>(sRow, d2, threadIdx.x, op); + reduce<32>(sRow, abs1, threadIdx.x, op); + reduce<32>(sRow, abs2, threadIdx.x, op); + + // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0) + if (threadIdx.x == 0) + descriptors_block[1] = make_float4(d1, abs1, d2, abs2); } template __global__ void normalize_descriptors(PtrStepf descriptors) { + __shared__ float smem[BLOCK_DIM_X]; + __shared__ float s_len; + // no need for thread ID float* descriptor_base = descriptors.ptr(blockIdx.x); // read in the unnormalized descriptor values (squared) - __shared__ float 
sqDesc[BLOCK_DIM_X]; - const float lookup = descriptor_base[threadIdx.x]; - sqDesc[threadIdx.x] = lookup * lookup; - __syncthreads(); + const float val = descriptor_base[threadIdx.x]; - if (BLOCK_DIM_X >= 128) - { - if (threadIdx.x < 64) - sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64]; - __syncthreads(); - } + float len = val * val; + reduce(smem, len, threadIdx.x, plus()); - // reduction to get total - if (threadIdx.x < 32) - { - volatile float* smem = sqDesc; - - smem[threadIdx.x] += smem[threadIdx.x + 32]; - smem[threadIdx.x] += smem[threadIdx.x + 16]; - smem[threadIdx.x] += smem[threadIdx.x + 8]; - smem[threadIdx.x] += smem[threadIdx.x + 4]; - smem[threadIdx.x] += smem[threadIdx.x + 2]; - smem[threadIdx.x] += smem[threadIdx.x + 1]; - } - - // compute length (square root) - __shared__ float len; if (threadIdx.x == 0) - { - len = sqrtf(sqDesc[0]); - } + s_len = ::sqrtf(len); + __syncthreads(); // normalize and store in output - descriptor_base[threadIdx.x] = lookup / len; + descriptor_base[threadIdx.x] = val / s_len; } - void compute_descriptors_gpu(const PtrStepSzf& descriptors, - const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures) + void compute_descriptors_gpu(PtrStepSz descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures) { // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D if (descriptors.cols == 64) { - compute_descriptors64<<>>(descriptors, featureX, featureY, featureSize, featureDir); + compute_descriptors_64<<>>(descriptors, featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); - normalize_descriptors<64><<>>(descriptors); + normalize_descriptors<64><<>>((PtrStepSzf) descriptors); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); } else { - compute_descriptors128<<>>(descriptors, featureX, featureY, featureSize, featureDir); + compute_descriptors_128<<>>(descriptors, featureX, featureY, featureSize, featureDir); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); - normalize_descriptors<128><<>>(descriptors); + normalize_descriptors<128><<>>((PtrStepSzf) descriptors); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp index 72bb9c15e..4d1e74d9a 100644 --- a/modules/gpu/src/surf.cpp +++ b/modules/gpu/src/surf.cpp @@ -86,8 +86,7 @@ namespace cv { namespace gpu { namespace device void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures); - void compute_descriptors_gpu(const PtrStepSzf& descriptors, - const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures); + void compute_descriptors_gpu(PtrStepSz descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures); } }}} diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp index 4fff37f1a..c76fe9166 100644 --- a/modules/gpu/test/test_features2d.cpp +++ b/modules/gpu/test/test_features2d.cpp @@ -328,7 +328,7 @@ TEST_P(SURF, Descriptor) int matchedCount = getMatchedPointsCount(keypoints, keypoints, matches); double matchedRatio = static_cast(matchedCount) / keypoints.size(); - EXPECT_GT(matchedRatio, 0.35); + EXPECT_GT(matchedRatio, 
0.6);
     }
 }

From e8f9762ef373be508692d2d692d5c5a97ad533f9 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 12 Nov 2012 13:34:25 +0400
Subject: [PATCH 089/155] matrix reduction

---
 modules/gpu/src/cuda/matrix_reductions.cu | 3220 ++++++++-------------
 modules/gpu/src/matrix_reductions.cpp     |  756 ++---
 modules/gpu/test/test_core.cpp            |   14 +-
 3 files changed, 1521 insertions(+), 2469 deletions(-)

diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index 5cda3dab3..7a0e8d2fe 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -42,2062 +42,1268 @@
 #if !defined CUDA_DISABLER
 
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/type_traits.hpp"
 
-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
 {
-    namespace matrix_reductions
+    template <int cn> struct Unroll;
+    template <> struct Unroll<1>
     {
-        // Performs reduction in shared memory
-        template <int size, typename T>
-        __device__ void sumInSmem(volatile T* data, const uint tid)
+        template <int BLOCK_SIZE, typename R>
+        static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
         {
-            T sum = data[tid];
-
-            if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
-            if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
-            if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
-
-            if (tid < 32)
-            {
-                if (size >= 64) data[tid] = sum = sum + data[tid + 32];
-                if (size >= 32) data[tid] = sum = sum + data[tid + 16];
-                if (size >= 16) data[tid] = sum = sum + data[tid + 8];
-                if (size >= 8) data[tid] = sum = sum + data[tid + 4];
-                if (size >= 4) data[tid] = sum = sum + data[tid + 2];
-                if (size >= 2) data[tid] = sum = sum + data[tid + 1];
-            }
+            return smem;
         }
 
-        struct Mask8U
+        template <typename R>
+        static __device__ __forceinline__ R& tie(R& val)
         {
-            explicit Mask8U(PtrStepb mask_): mask(mask_) {}
+            return val;
+        }
 
-            __device__ __forceinline__ bool operator()(int y, int x) const
-            {
-                return mask.ptr(y)[x];
-            }
-
-            PtrStepb mask;
-        };
-
-        struct MaskTrue
+        template <class Op>
+        static __device__ __forceinline__ const Op& op(const Op& op)
         {
-            __device__ __forceinline__ bool operator()(int y, int x) const
-            {
-                return true;
-            }
-            __device__ __forceinline__ MaskTrue(){}
-            __device__ __forceinline__ MaskTrue(const MaskTrue& mask_){}
-        };
-
-        //////////////////////////////////////////////////////////////////////////////
-        // Min max
-
-        // To avoid shared bank conflicts we convert each value into value of
-        // appropriate type (32 bits minimum)
-        template <typename T> struct MinMaxTypeTraits {};
-        template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
-        template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
-        template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
-        template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
-        template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
-        template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
-        template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
-
-        namespace minmax
+            return op;
+        }
+    };
+
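
The Unroll<cn> helpers (Unroll<1> above, the wider specializations continuing below) let one templated kernel reduce one to four interleaved channels at once: smem_tuple carves a single shared buffer into cn lane slices, tie exposes the lanes of a vector value as a tuple of references, and op replicates the reduction functor per lane, so device::reduce folds every lane in lock step. A minimal sketch of the same idea for two channels, written without the OpenCV device headers (the kernel and all names here are illustrative, not part of the patch):

    #define BLOCK_SIZE 256

    // Two-lane block reduction: lane x and lane y of a float2 live in two
    // slices of one shared buffer and are folded by the same tree walk in
    // lock step, the duplication that Unroll<2>::smem_tuple/tie/op automate
    // through thrust tuples.
    __global__ void sumFloat2(const float2* src, int n, float2* dst)
    {
        __shared__ float smem[2 * BLOCK_SIZE];

        const int tid = threadIdx.x;

        float2 sum = make_float2(0.f, 0.f);
        for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
        {
            sum.x += src[i].x;
            sum.y += src[i].y;
        }

        smem[tid]              = sum.x;  // lane 0
        smem[tid + BLOCK_SIZE] = sum.y;  // lane 1
        __syncthreads();

        // classic tree reduction, both lanes folded together
        for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)
        {
            if (tid < s)
            {
                smem[tid]              += smem[tid + s];
                smem[tid + BLOCK_SIZE] += smem[tid + BLOCK_SIZE + s];
            }
            __syncthreads();
        }

        if (tid == 0)
            dst[blockIdx.x] = make_float2(smem[0], smem[BLOCK_SIZE]);
    }

Launched as sumFloat2<<<numBlocks, 256>>>(d_src, n, d_partial), this leaves one float2 partial per block; the tuple machinery in the patch generalizes exactly this duplication to cn lanes without writing four kernel variants.
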
template <> struct Unroll<2> + { + template + static __device__ __forceinline__ thrust::tuple smem_tuple(R* smem) { - __constant__ int ctwidth; - __constant__ int ctheight; + return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE); + } - // Global counter of blocks finished its work - __device__ uint blocks_finished = 0; - - - // Estimates good thread configuration - // - threads variable satisfies to threads.x * threads.y == 256 - void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(32, 8); - grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); - grid.x = std::min(grid.x, threads.x); - grid.y = std::min(grid.y, threads.y); - } - - - // Returns required buffer sizes - void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows) - { - dim3 threads, grid; - estimateThreadCfg(cols, rows, threads, grid); - bufcols = grid.x * grid.y * elem_size; - bufrows = 2; - } - - - // Estimates device constants which are used in the kernels using specified thread configuration - void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); - } - - - // Does min and max in shared memory - template - __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) - { - minval[tid] = ::min(minval[tid], minval[tid + offset]); - maxval[tid] = ::max(maxval[tid], maxval[tid + offset]); - } - - - template - __device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid) - { - if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); } - if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); } - if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); } - - if (tid < 32) - { - if (size >= 64) merge(tid, 32, minval, maxval); - if (size >= 32) merge(tid, 16, minval, maxval); - if (size >= 16) merge(tid, 8, minval, maxval); - if (size >= 8) merge(tid, 4, minval, maxval); - if (size >= 4) merge(tid, 2, minval, maxval); - if (size >= 2) merge(tid, 1, minval, maxval); - } - } - - - template - __global__ void minMaxKernel(const PtrStepSzb src, Mask mask, T* minval, T* maxval) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - - uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - T mymin = numeric_limits::max(); - T mymax = numeric_limits::is_signed ? 
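
The code being deleted on either side of this point all follows one pre-Fermi reduction recipe, worth spelling out once, since sumInSmem, findMinMaxInSmem and their relatives are all instances of it. A compact sketch, assuming a power-of-two block and the lock-step warp execution of the sm_1x devices this file originally targeted (post-Volta code would need __syncwarp() or the shuffle-based device::reduce that replaces it):

    // The recipe behind sumInSmem/findMinMaxInSmem: explicit barriers while
    // more than one warp participates, then a volatile, barrier-free tail
    // that relies on a warp executing in lock step.
    template <int SIZE, typename T>
    __device__ void blockSumLegacy(volatile T* data, unsigned int tid)
    {
        if (SIZE >= 512) { if (tid < 256) data[tid] += data[tid + 256]; __syncthreads(); }
        if (SIZE >= 256) { if (tid < 128) data[tid] += data[tid + 128]; __syncthreads(); }
        if (SIZE >= 128) { if (tid <  64) data[tid] += data[tid +  64]; __syncthreads(); }

        if (tid < 32)  // a single warp remains: no more barriers
        {
            if (SIZE >= 64) data[tid] += data[tid + 32];
            if (SIZE >= 32) data[tid] += data[tid + 16];
            if (SIZE >= 16) data[tid] += data[tid +  8];
            if (SIZE >=  8) data[tid] += data[tid +  4];
            if (SIZE >=  4) data[tid] += data[tid +  2];
            if (SIZE >=  2) data[tid] += data[tid +  1];
        }
    }

The replacement device::reduce call hides this ladder (and a warp-shuffle variant for newer architectures) behind a single function, which is why the patch can delete the hand-rolled copies wholesale.
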
-numeric_limits::max() : numeric_limits::min(); - uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); - uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); - for (uint y = y0; y < y_end; y += blockDim.y) - { - const T* src_row = (const T*)src.ptr(y); - for (uint x = x0; x < x_end; x += blockDim.x) - { - T val = src_row[x]; - if (mask(y, x)) - { - mymin = ::min(mymin, val); - mymax = ::max(mymax, val); - } - } - } - - sminval[tid] = mymin; - smaxval[tid] = mymax; - __syncthreads(); - - findMinMaxInSmem(sminval, smaxval, tid); - - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - } - - #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - __shared__ bool is_last; - - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = ticket == gridDim.x * gridDim.y - 1; - } - - __syncthreads(); - - if (is_last) - { - uint idx = ::min(tid, gridDim.x * gridDim.y - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - __syncthreads(); - - findMinMaxInSmem(sminval, smaxval, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - blocks_finished = 0; - } - } - #else - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - } - #endif - } - - - template - void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - minMaxKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - *minval = minval_; - *maxval = maxval_; - } - - template void minMaxMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - - - template - void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - minMaxKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall( cudaMemcpy(&minval_, minval_buf, 
sizeof(T), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - *minval = minval_; - *maxval = maxval_; - } - - template void minMaxCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxCaller(const PtrStepSzb, double*,double*, PtrStepb); - template void minMaxCaller(const PtrStepSzb, double*, double*, PtrStepb); - - - template - __global__ void minMaxPass2Kernel(T* minval, T* maxval, int size) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - uint idx = ::min(tid, size - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - __syncthreads(); - - findMinMaxInSmem(sminval, smaxval, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - } - } - - - template - void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - minMaxKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf); - cudaSafeCall( cudaGetLastError() ); - minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall(cudaDeviceSynchronize()); - - T minval_, maxval_; - cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - *minval = minval_; - *maxval = maxval_; - } - - template void minMaxMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - template void minMaxMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb); - - - template - void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)buf.ptr(0); - T* maxval_buf = (T*)buf.ptr(1); - - minMaxKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf); - cudaSafeCall( cudaGetLastError() ); - minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - *minval = 
minval_; - *maxval = maxval_; - } - - template void minMaxMultipassCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxMultipassCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxMultipassCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxMultipassCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxMultipassCaller(const PtrStepSzb, double*, double*, PtrStepb); - template void minMaxMultipassCaller(const PtrStepSzb, double*, double*, PtrStepb); - } // namespace minmax - - /////////////////////////////////////////////////////////////////////////////// - // minMaxLoc - - namespace minmaxloc + template + static __device__ __forceinline__ thrust::tuple::elem_type&, typename VecTraits::elem_type&> tie(R& val) { - __constant__ int ctwidth; - __constant__ int ctheight; + return thrust::tie(val.x, val.y); + } - // Global counter of blocks finished its work - __device__ uint blocks_finished = 0; - - - // Estimates good thread configuration - // - threads variable satisfies to threads.x * threads.y == 256 - void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(32, 8); - grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); - grid.x = std::min(grid.x, threads.x); - grid.y = std::min(grid.y, threads.y); - } - - - // Returns required buffer sizes - void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, - int& b1rows, int& b2cols, int& b2rows) - { - dim3 threads, grid; - estimateThreadCfg(cols, rows, threads, grid); - b1cols = grid.x * grid.y * elem_size; // For values - b1rows = 2; - b2cols = grid.x * grid.y * sizeof(int); // For locations - b2rows = 2; - } - - - // Estimates device constants which are used in the kernels using specified thread configuration - void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight))); - } - - - template - __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval, - volatile uint* minloc, volatile uint* maxloc) - { - T val = minval[tid + offset]; - if (val < minval[tid]) - { - minval[tid] = val; - minloc[tid] = minloc[tid + offset]; - } - val = maxval[tid + offset]; - if (val > maxval[tid]) - { - maxval[tid] = val; - maxloc[tid] = maxloc[tid + offset]; - } - } - - - template - __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc, - volatile uint* maxloc, const uint tid) - { - if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); } - if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); } - if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); } - - if (tid < 32) - { - if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc); - if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc); - if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc); - if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc); - if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc); - if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc); - } - } - - - template - __global__ void minMaxLocKernel(const 
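
The minMaxLocKernel whose body follows differs from plain minMax only in that every shared-memory slot carries a (value, flat index) pair, and the two must move together through the reduction. The invariant, as a sketch (an illustrative helper, not the patch's code):

    // The pair invariant behind merge()/findMinMaxLocInSmem: a value and its
    // flat index always travel together, so slot 0 ends up holding both the
    // extremum and where it occurred.
    template <typename T>
    __device__ void mergeMinLoc(volatile T* val, volatile unsigned int* loc,
                                unsigned int tid, unsigned int offset)
    {
        const T other = val[tid + offset];
        if (other < val[tid])
        {
            val[tid] = other;              // the smaller value wins...
            loc[tid] = loc[tid + offset];  // ...and brings its index with it
        }
    }

After the tree reduction, slot 0 holds the extremum and its flat index; the host then unpacks idx into (idx % cols, idx / cols), exactly as the callers below do.
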
PtrStepSzb src, Mask mask, T* minval, T* maxval, - uint* minloc, uint* maxloc) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - __shared__ uint sminloc[nthreads]; - __shared__ uint smaxloc[nthreads]; - - uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - T mymin = numeric_limits::max(); - T mymax = numeric_limits::is_signed ? -numeric_limits::max() : numeric_limits::min(); - uint myminloc = 0; - uint mymaxloc = 0; - uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); - uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); - - for (uint y = y0; y < y_end; y += blockDim.y) - { - const T* ptr = (const T*)src.ptr(y); - for (uint x = x0; x < x_end; x += blockDim.x) - { - if (mask(y, x)) - { - T val = ptr[x]; - if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; } - if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; } - } - } - } - - sminval[tid] = mymin; - smaxval[tid] = mymax; - sminloc[tid] = myminloc; - smaxloc[tid] = mymaxloc; - __syncthreads(); - - findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); - - #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - __shared__ bool is_last; - - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; - maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = ticket == gridDim.x * gridDim.y - 1; - } - - __syncthreads(); - - if (is_last) - { - uint idx = ::min(tid, gridDim.x * gridDim.y - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - sminloc[tid] = minloc[idx]; - smaxloc[tid] = maxloc[idx]; - __syncthreads(); - - findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - minloc[0] = sminloc[0]; - maxloc[0] = smaxloc[0]; - blocks_finished = 0; - } - } - #else - if (tid == 0) - { - minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0]; - maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; - minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0]; - maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0]; - } - #endif - } - - - template - void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - minMaxLocKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, - minloc_buf, maxloc_buf); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) ); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), 
cudaMemcpyDeviceToHost) ); - cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) ); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void minMaxLocMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - - - template - void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - minMaxLocKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, - minloc_buf, maxloc_buf); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void minMaxLocCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - - - // This kernel will be used only when compute capability is 1.0 - template - __global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size) - { - typedef typename MinMaxTypeTraits::best_type best_type; - __shared__ best_type sminval[nthreads]; - __shared__ best_type smaxval[nthreads]; - __shared__ uint sminloc[nthreads]; - __shared__ uint smaxloc[nthreads]; - - uint tid = 
threadIdx.y * blockDim.x + threadIdx.x; - uint idx = ::min(tid, size - 1); - - sminval[tid] = minval[idx]; - smaxval[tid] = maxval[idx]; - sminloc[tid] = minloc[idx]; - smaxloc[tid] = maxloc[idx]; - __syncthreads(); - - findMinMaxLocInSmem(sminval, smaxval, sminloc, smaxloc, tid); - - if (tid == 0) - { - minval[0] = (T)sminval[0]; - maxval[0] = (T)smaxval[0]; - minloc[0] = sminloc[0]; - maxloc[0] = smaxloc[0]; - } - } - - - template - void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - minMaxLocKernel<256, T, Mask8U><<>>(src, Mask8U(mask), minval_buf, maxval_buf, - minloc_buf, maxloc_buf); - cudaSafeCall( cudaGetLastError() ); - minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void minMaxLocMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMaskMultipassCaller(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - - - template - void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, - int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - T* minval_buf = (T*)valbuf.ptr(0); - T* maxval_buf = (T*)valbuf.ptr(1); - uint* minloc_buf = (uint*)locbuf.ptr(0); - uint* maxloc_buf = (uint*)locbuf.ptr(1); - - minMaxLocKernel<256, T, MaskTrue><<>>(src, MaskTrue(), minval_buf, maxval_buf, - minloc_buf, maxloc_buf); - cudaSafeCall( cudaGetLastError() ); - minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - T minval_, maxval_; - cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), 
cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost)); - *minval = minval_; - *maxval = maxval_; - - uint minloc_, maxloc_; - cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost)); - minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols; - maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols; - } - - template void minMaxLocMultipassCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMultipassCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMultipassCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMultipassCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMultipassCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - template void minMaxLocMultipassCaller(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); - } // namespace minmaxloc - - ////////////////////////////////////////////////////////////////////////////////////////////////////////// - // countNonZero - - namespace countnonzero + template + static __device__ __forceinline__ const thrust::tuple op(const Op& op) { - __constant__ int ctwidth; - __constant__ int ctheight; - - __device__ uint blocks_finished = 0; - - void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(32, 8); - grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); - grid.x = std::min(grid.x, threads.x); - grid.y = std::min(grid.y, threads.y); - } - - - void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows) - { - dim3 threads, grid; - estimateThreadCfg(cols, rows, threads, grid); - bufcols = grid.x * grid.y * sizeof(int); - bufrows = 1; - } - - - void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); - } - - - template - __global__ void countNonZeroKernel(const PtrStepSzb src, volatile uint* count) - { - __shared__ uint scount[nthreads]; - - uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - uint cnt = 0; - for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); - for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - cnt += ptr[x0 + x * blockDim.x] != 0; - } - - scount[tid] = cnt; - __syncthreads(); - - sumInSmem(scount, tid); - - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - __shared__ bool is_last; - - if (tid == 0) - { - count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = ticket == gridDim.x * gridDim.y - 1; - } - - __syncthreads(); - - if (is_last) - { - scount[tid] = tid < gridDim.x * gridDim.y ? 
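
The countNonZeroKernel spanning this point shows the single-pass trick used by all the sm_11+ paths in this file: each block publishes its partial result, takes a ticket from a global counter, and the block that draws the last ticket reduces the partials itself, saving the second kernel launch that the *MultipassCaller variants need. A self-contained sketch of the same mechanism, assuming 256-thread blocks and at most 256 blocks (the kernel itself is a hypothetical stand-in):

    __device__ unsigned int blocksFinished = 0;

    // Single-pass finish via a global ticket: every block publishes a
    // partial count, and the block that draws the last ticket folds all
    // partials itself, so no second launch is needed.
    __global__ void countNonZeroOnePass(const unsigned char* data, int n, unsigned int* partial)
    {
        __shared__ unsigned int smem[256];
        __shared__ bool isLast;

        const unsigned int tid = threadIdx.x;

        unsigned int cnt = 0;
        for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
            cnt += data[i] != 0;

        smem[tid] = cnt;
        __syncthreads();

        for (unsigned int s = 128; s > 0; s >>= 1)
        {
            if (tid < s) smem[tid] += smem[tid + s];
            __syncthreads();
        }

        if (tid == 0)
        {
            partial[blockIdx.x] = smem[0];
            __threadfence();  // make the partial visible to the last block
            unsigned int ticket = atomicInc(&blocksFinished, gridDim.x);
            isLast = (ticket == gridDim.x - 1);
        }
        __syncthreads();

        if (isLast)  // exactly one block runs the final fold
        {
            smem[tid] = tid < gridDim.x ? partial[tid] : 0;
            __syncthreads();

            for (unsigned int s = 128; s > 0; s >>= 1)
            {
                if (tid < s) smem[tid] += smem[tid + s];
                __syncthreads();
            }

            if (tid == 0)
            {
                partial[0] = smem[0];  // final count
                blocksFinished = 0;    // reset for the next launch
            }
        }
    }

The __threadfence() is essential: it guarantees the block's partial is visible to whichever block ends up holding the last ticket before that ticket is taken.
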
count[tid] : 0; - __syncthreads(); - - sumInSmem(scount, tid); - - if (tid == 0) - { - count[0] = scount[0]; - blocks_finished = 0; - } - } - #else - if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; - #endif - } - - - template - int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - uint* count_buf = (uint*)buf.ptr(0); - - countNonZeroKernel<256, T><<>>(src, count_buf); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - uint count; - cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); - - return count; - } - - template int countNonZeroCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroCaller(const PtrStepSzb, PtrStepb); - - - template - __global__ void countNonZeroPass2Kernel(uint* count, int size) - { - __shared__ uint scount[nthreads]; - uint tid = threadIdx.y * blockDim.x + threadIdx.x; - - scount[tid] = tid < size ? count[tid] : 0; - __syncthreads(); - - sumInSmem(scount, tid); - - if (tid == 0) - count[0] = scount[0]; - } - - - template - int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf) - { - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - uint* count_buf = (uint*)buf.ptr(0); - - countNonZeroKernel<256, T><<>>(src, count_buf); - cudaSafeCall( cudaGetLastError() ); - countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - uint count; - cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost)); - - return count; - } - - template int countNonZeroMultipassCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroMultipassCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroMultipassCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroMultipassCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroMultipassCaller(const PtrStepSzb, PtrStepb); - template int countNonZeroMultipassCaller(const PtrStepSzb, PtrStepb); - - } // namespace countnonzero - - - ////////////////////////////////////////////////////////////////////////// - // Sum - - namespace sum + return thrust::make_tuple(op, op); + } + }; + template <> struct Unroll<3> + { + template + static __device__ __forceinline__ thrust::tuple smem_tuple(R* smem) { - template struct SumType {}; - template <> struct SumType { typedef uint R; }; - template <> struct SumType { typedef int R; }; - template <> struct SumType { typedef uint R; }; - template <> struct SumType { typedef int R; }; - template <> struct SumType { typedef int R; }; - template <> struct SumType { typedef float R; }; - template <> struct SumType { typedef double R; }; + return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE); + } - template - struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } }; - - template - struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } }; - - template <> - 
struct AbsOp { static __device__ __forceinline__ uint call(uint x) { return x; } }; - - template - struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } }; - - __constant__ int ctwidth; - __constant__ int ctheight; - __device__ uint blocks_finished = 0; - - const int threads_x = 32; - const int threads_y = 8; - - void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid) - { - threads = dim3(threads_x, threads_y); - grid = dim3(divUp(cols, threads.x * threads.y), - divUp(rows, threads.y * threads.x)); - grid.x = std::min(grid.x, threads.x); - grid.y = std::min(grid.y, threads.y); - } - - - void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows) - { - dim3 threads, grid; - estimateThreadCfg(cols, rows, threads, grid); - bufcols = grid.x * grid.y * sizeof(double) * cn; - bufrows = 1; - } - - - void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid) - { - int twidth = divUp(divUp(cols, grid.x), threads.x); - int theight = divUp(divUp(rows, grid.y), threads.y); - cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth))); - cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight))); - } - - template - __global__ void sumKernel(const PtrStepSzb src, R* result) - { - __shared__ R smem[nthreads]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - R sum = 0; - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - sum += Op::call(ptr[x0 + x * blockDim.x]); - } - - smem[tid] = sum; - __syncthreads(); - - sumInSmem(smem, tid); - - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - __shared__ bool is_last; - - if (tid == 0) - { - result[bid] = smem[0]; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0; - __syncthreads(); - - sumInSmem(smem, tid); - - if (tid == 0) - { - result[0] = smem[0]; - blocks_finished = 0; - } - } - #else - if (tid == 0) result[bid] = smem[0]; - #endif - } - - - template - __global__ void sumPass2Kernel(R* result, int size) - { - __shared__ R smem[nthreads]; - int tid = threadIdx.y * blockDim.x + threadIdx.x; - - smem[tid] = tid < size ? 
result[tid] : 0; - __syncthreads(); - - sumInSmem(smem, tid); - - if (tid == 0) - result[0] = smem[0]; - } - - - template - __global__ void sumKernel_C2(const PtrStepSzb src, typename TypeVec::vec_type* result) - { - typedef typename TypeVec::vec_type SrcType; - typedef typename TypeVec::vec_type DstType; - - __shared__ R smem[nthreads * 2]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - SrcType val; - DstType sum = VecTraits::all(0); - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - { - val = ptr[x0 + x * blockDim.x]; - sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y)); - } - } - - smem[tid] = sum.x; - smem[tid + nthreads] = sum.y; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - __shared__ bool is_last; - - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - result[bid] = res; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - result[0] = res; - blocks_finished = 0; - } - } - #else - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - result[bid] = res; - } - #endif - } - - - template - __global__ void sumPass2Kernel_C2(typename TypeVec::vec_type* result, int size) - { - typedef typename TypeVec::vec_type DstType; - - __shared__ R smem[nthreads * 2]; - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - DstType res = tid < size ? 
result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - result[0] = res; - } - } - - - template - __global__ void sumKernel_C3(const PtrStepSzb src, typename TypeVec::vec_type* result) - { - typedef typename TypeVec::vec_type SrcType; - typedef typename TypeVec::vec_type DstType; - - __shared__ R smem[nthreads * 3]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - SrcType val; - DstType sum = VecTraits::all(0); - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - { - val = ptr[x0 + x * blockDim.x]; - sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), Op::call(val.z)); - } - } - - smem[tid] = sum.x; - smem[tid + nthreads] = sum.y; - smem[tid + 2 * nthreads] = sum.z; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - sumInSmem(smem + 2 * nthreads, tid); - - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 - __shared__ bool is_last; - - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[bid] = res; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - sumInSmem(smem + 2 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[0] = res; - blocks_finished = 0; - } - } - #else - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[bid] = res; - } - #endif - } - - - template - __global__ void sumPass2Kernel_C3(typename TypeVec::vec_type* result, int size) - { - typedef typename TypeVec::vec_type DstType; - - __shared__ R smem[nthreads * 3]; - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - DstType res = tid < size ? 
result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - sumInSmem(smem + 2 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - result[0] = res; - } - } - - template - __global__ void sumKernel_C4(const PtrStepSzb src, typename TypeVec::vec_type* result) - { - typedef typename TypeVec::vec_type SrcType; - typedef typename TypeVec::vec_type DstType; - - __shared__ R smem[nthreads * 4]; - - const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x; - const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y; - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - const int bid = blockIdx.y * gridDim.x + blockIdx.x; - - SrcType val; - DstType sum = VecTraits::all(0); - for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y) - { - const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y); - for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x) - { - val = ptr[x0 + x * blockDim.x]; - sum = sum + VecTraits::make(Op::call(val.x), Op::call(val.y), - Op::call(val.z), Op::call(val.w)); - } - } - - smem[tid] = sum.x; - smem[tid + nthreads] = sum.y; - smem[tid + 2 * nthreads] = sum.z; - smem[tid + 3 * nthreads] = sum.w; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - sumInSmem(smem + 2 * nthreads, tid); - sumInSmem(smem + 3 * nthreads, tid); - - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - __shared__ bool is_last; - - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[bid] = res; - __threadfence(); - - uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y); - is_last = (ticket == gridDim.x * gridDim.y - 1); - } - - __syncthreads(); - - if (is_last) - { - DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - smem[tid + 3 * nthreads] = res.w; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - sumInSmem(smem + 2 * nthreads, tid); - sumInSmem(smem + 3 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[0] = res; - blocks_finished = 0; - } - } - #else - if (tid == 0) - { - DstType res; - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[bid] = res; - } - #endif - } - - - template - __global__ void sumPass2Kernel_C4(typename TypeVec::vec_type* result, int size) - { - typedef typename TypeVec::vec_type DstType; - - __shared__ R smem[nthreads * 4]; - - const int tid = threadIdx.y * blockDim.x + threadIdx.x; - - DstType res = tid < size ? 
result[tid] : VecTraits::all(0); - smem[tid] = res.x; - smem[tid + nthreads] = res.y; - smem[tid + 2 * nthreads] = res.z; - smem[tid + 3 * nthreads] = res.w; - __syncthreads(); - - sumInSmem(smem, tid); - sumInSmem(smem + nthreads, tid); - sumInSmem(smem + 2 * nthreads, tid); - sumInSmem(smem + 3 * nthreads, tid); - - if (tid == 0) - { - res.x = smem[0]; - res.y = smem[nthreads]; - res.z = smem[2 * nthreads]; - res.w = smem[3 * nthreads]; - result[0] = res; - } - } - - template - void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn) - { - typedef typename SumType::R R; - - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sumKernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 2: - sumKernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 3: - sumKernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 4: - sumKernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - } - cudaSafeCall( cudaDeviceSynchronize() ); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - - - template - void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn) - { - typedef typename SumType::R R; - - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sumKernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 2: - sumKernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 3: - sumKernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 4: - sumKernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - } - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - R result[4] = {0, 0, 0, 0}; - 
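
The sum, absSum and sqrSum caller families around this point reuse the very same kernels; only the per-element transform (IdentityOp, AbsOp, SqrOp) threaded through sumKernel as a template parameter changes, so the transform costs no run-time branch in the inner loop. The pattern in miniature (a sketch only: float atomicAdd assumes sm_20+, and *dst must be zeroed before launch):

    struct Identity { __device__ float operator()(float x) const { return x; } };
    struct AbsVal   { __device__ float operator()(float x) const { return ::fabsf(x); } };
    struct Square   { __device__ float operator()(float x) const { return x * x; } };

    // One kernel, three reductions: the element transform is a
    // compile-time functor, as with IdentityOp/AbsOp/SqrOp here.
    template <class Op>
    __global__ void transformSum(const float* src, int n, float* dst, Op op)
    {
        __shared__ float smem[256];

        const int tid = threadIdx.x;

        float sum = 0.f;
        for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
            sum += op(src[i]);

        smem[tid] = sum;
        __syncthreads();

        for (int s = 128; s > 0; s >>= 1)
        {
            if (tid < s) smem[tid] += smem[tid + s];
            __syncthreads();
        }

        if (tid == 0)
            atomicAdd(dst, smem[0]);  // fold the block total into *dst
    }

transformSum<<<64, 256>>>(d_src, n, d_sum, Square()) computes a sum of squares; swapping the functor swaps the reduction's meaning with no other change.
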
cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sumCaller(const PtrStepSzb, PtrStepb, double*, int); - - - template - void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn) - { - typedef typename SumType::R R; - - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sumKernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 2: - sumKernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 3: - sumKernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 4: - sumKernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - } - cudaSafeCall( cudaDeviceSynchronize() ); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void absSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - - - template - void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn) - { - typedef typename SumType::R R; - - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sumKernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 2: - sumKernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 3: - sumKernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 4: - sumKernel_C4, threads_x * threads_y><<>>( - src, (typename 
TypeVec::vec_type*)buf.ptr(0)); - break; - } - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void absSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void absSumCaller(const PtrStepSzb, PtrStepb, double*, int); - - - template - void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn) - { - typedef typename SumType::R R; - - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sumKernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 2: - sumKernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C2<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 3: - sumKernel_C3, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C3<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - case 4: - sumKernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - cudaSafeCall( cudaGetLastError() ); - - sumPass2Kernel_C4<<<1, threads_x * threads_y>>>( - (typename TypeVec::vec_type*)buf.ptr(0), grid.x * grid.y); - cudaSafeCall( cudaGetLastError() ); - - break; - } - cudaSafeCall( cudaDeviceSynchronize() ); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sqrSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumMultipassCaller(const PtrStepSzb, PtrStepb, double*, int); - - - template - void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn) - { - typedef double R; - - dim3 threads, grid; - estimateThreadCfg(src.cols, src.rows, threads, grid); - setKernelConsts(src.cols, src.rows, threads, grid); - - switch (cn) - { - case 1: - sumKernel, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 2: - sumKernel_C2, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 3: - sumKernel_C3, threads_x 
* threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - case 4: - sumKernel_C4, threads_x * threads_y><<>>( - src, (typename TypeVec::vec_type*)buf.ptr(0)); - break; - } - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall( cudaDeviceSynchronize() ); - - R result[4] = {0, 0, 0, 0}; - cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost)); - - sum[0] = result[0]; - sum[1] = result[1]; - sum[2] = result[2]; - sum[3] = result[3]; - } - - template void sqrSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumCaller(const PtrStepSzb, PtrStepb, double*, int); - template void sqrSumCaller(const PtrStepSzb, PtrStepb, double*, int); - } // namespace sum - - ////////////////////////////////////////////////////////////////////////////// - // reduce - - template struct SumReductor + template + static __device__ __forceinline__ thrust::tuple::elem_type&, typename VecTraits::elem_type&, typename VecTraits::elem_type&> tie(R& val) { - __device__ __forceinline__ S startValue() const - { - return 0; - } + return thrust::tie(val.x, val.y, val.z); + } - __device__ __forceinline__ SumReductor(const SumReductor& other){} - __device__ __forceinline__ SumReductor(){} - - __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const - { - return a + b; - } - - __device__ __forceinline__ S result(S r, double) const - { - return r; - } - }; - - template struct AvgReductor + template + static __device__ __forceinline__ const thrust::tuple op(const Op& op) { - __device__ __forceinline__ S startValue() const - { - return 0; - } - - __device__ __forceinline__ AvgReductor(const AvgReductor& other){} - __device__ __forceinline__ AvgReductor(){} - - __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const - { - return a + b; - } - - __device__ __forceinline__ double result(S r, double sz) const - { - return r / sz; - } - }; - - template struct MinReductor + return thrust::make_tuple(op, op, op); + } + }; + template <> struct Unroll<4> + { + template + static __device__ __forceinline__ thrust::tuple smem_tuple(R* smem) { - __device__ __forceinline__ S startValue() const - { - return numeric_limits::max(); - } + return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE); + } - __device__ __forceinline__ MinReductor(const MinReductor& other){} - __device__ __forceinline__ MinReductor(){} - - template __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const - { - return saturate_cast(::min(a, b)); - } - __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const - { - return ::fmin(a, b); - } - - __device__ __forceinline__ S result(S r, double) const - { - return r; - } - }; - - template struct MaxReductor + template + static __device__ __forceinline__ thrust::tuple::elem_type&, typename VecTraits::elem_type&, typename VecTraits::elem_type&, typename VecTraits::elem_type&> tie(R& val) { - __device__ __forceinline__ S startValue() const - { - return numeric_limits::min(); - } + return thrust::tie(val.x, val.y, val.z, val.w); + } - __device__ __forceinline__ MaxReductor(const MaxReductor& other){} - __device__ __forceinline__ MaxReductor(){} - - template __device__ __forceinline__ int operator ()(volatile T a, volatile T 
b) const - { - return ::max(a, b); - } - __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const - { - return ::fmax(a, b); - } - - __device__ __forceinline__ S result(S r, double) const - { - return r; - } - }; - - template __global__ void reduceRows(const PtrStepSz src, D* dst, const Op op) + template + static __device__ __forceinline__ const thrust::tuple op(const Op& op) { - __shared__ S smem[16 * 16]; + return thrust::make_tuple(op, op, op, op); + } + }; +} - const int x = blockIdx.x * 16 + threadIdx.x; +///////////////////////////////////////////////////////////// +// sum - S myVal = op.startValue(); +namespace sum +{ + __device__ unsigned int blocks_finished = 0; - if (x < src.cols) + template struct AtomicAdd; + template struct AtomicAdd + { + static __device__ void run(R* ptr, R val) + { + ::atomicAdd(ptr, val); + } + }; + template struct AtomicAdd + { + typedef typename TypeVec::vec_type val_type; + + static __device__ void run(R* ptr, val_type val) + { + ::atomicAdd(ptr, val.x); + ::atomicAdd(ptr + 1, val.y); + } + }; + template struct AtomicAdd + { + typedef typename TypeVec::vec_type val_type; + + static __device__ void run(R* ptr, val_type val) + { + ::atomicAdd(ptr, val.x); + ::atomicAdd(ptr + 1, val.y); + ::atomicAdd(ptr + 2, val.z); + } + }; + template struct AtomicAdd + { + typedef typename TypeVec::vec_type val_type; + + static __device__ void run(R* ptr, val_type val) + { + ::atomicAdd(ptr, val.x); + ::atomicAdd(ptr + 1, val.y); + ::atomicAdd(ptr + 2, val.z); + ::atomicAdd(ptr + 3, val.w); + } + }; + + template + struct GlobalReduce + { + typedef typename TypeVec::vec_type result_type; + + static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem) + { + #if __CUDA_ARCH__ >= 200 + if (tid == 0) + AtomicAdd::run((R*) result, sum); + #else + __shared__ bool is_last; + + if (tid == 0) { - for (int y = threadIdx.y; y < src.rows; y += 16) - myVal = op(myVal, src.ptr(y)[x]); + result[bid] = sum; + + __threadfence(); + + unsigned int ticket = ::atomicAdd(&blocks_finished, 1); + is_last = (ticket == gridDim.x * gridDim.y - 1); } - smem[threadIdx.x * 16 + threadIdx.y] = myVal; __syncthreads(); - if (threadIdx.x < 8) + if (is_last) { - volatile S* srow = smem + threadIdx.y * 16; - srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]); - srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]); - srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]); - srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]); - } - __syncthreads(); + sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits::all(0); - if (threadIdx.y == 0 && x < src.cols) - dst[x] = saturate_cast(op.result(smem[threadIdx.x * 16], src.rows)); - } + device::reduce(Unroll::template smem_tuple(smem), Unroll::tie(sum), tid, Unroll::op(plus())); - template
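
GlobalReduce::run above picks between two finishes: on sm_20 and later it lets thread 0 of every block add its block total straight into the global result through the AtomicAdd<R, cn> helpers, one atomicAdd per channel, while below sm_20 it falls back to the ticket scheme sketched earlier. The fast path, sketched for three float channels (names illustrative, not the patch's):

    // Thread 0 of each block folds its block total into the global result,
    // one atomicAdd per channel, instead of the ticket/__threadfence dance.
    __device__ __forceinline__ void atomicAddChannels(float* result, float3 blockSum)
    {
        atomicAdd(result + 0, blockSum.x);
        atomicAdd(result + 1, blockSum.y);
        atomicAdd(result + 2, blockSum.z);
    }

One caveat worth noting: atomicAdd on double is not native before sm_60, so result types like double still require an atomicCAS emulation or a two-pass finish.
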