added support for 3-channel output to gpu::reprojectImageTo3D

minor refactoring of gpu tests
2012-03-28 12:48:28 +00:00
parent 07ec83cd1f
commit 8e3f1c09d2
8 changed files with 713 additions and 847 deletions
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -316,62 +316,51 @@ namespace cv { namespace gpu { namespace device

        __constant__ float cq[16];

-        template <typename T>
-        __global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)
+        template <typename T, typename D> // T: disparity element type; D: output vector type (instantiated below with float3 / float4)
+        __global__ void reprojectImageTo3D(const DevMem2D_<T> disp, PtrStep<D> xyz) // one thread per pixel: reads disp(y, x), writes xyz(y, x) using the 16 coefficients in cq
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (y < rows && x < cols)
-            {
+            if (y >= disp.rows || x >= disp.cols) // threads of edge blocks that fall outside the image do nothing
+                return;

-                float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];
-                float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];
+            const float qx = x * cq[ 0] + y * cq[ 1] + cq[ 3]; // cq[0..3] applied to (x, y, _, 1); the cq[2] * d term is added when v.x is formed
+            const float qy = x * cq[ 4] + y * cq[ 5] + cq[ 7]; // same for cq[4..7]; cq[6] * d is added below
+            const float qz = x * cq[ 8] + y * cq[ 9] + cq[11]; // same for cq[8..11]; cq[10] * d is added below
+            const float qw = x * cq[12] + y * cq[13] + cq[15]; // same for cq[12..15]; cq[14] * d is added below

-                qx += x * cq[0];
-                qy += x * cq[4];
-                qz += x * cq[8];
-                qw += x * cq[12];
+            const T d = disp(y, x); // disparity value of this pixel

-                T d = *(disp + disp_step * y + x);
+            const float iW = 1.f / (qw + cq[14] * d); // reciprocal of the fourth (homogeneous) coordinate; no guard against a zero denominator

-                float iW = 1.f / (qw + cq[14] * d);
-                float4 v;
-                v.x = (qx + cq[2] * d) * iW;
-                v.y = (qy + cq[6] * d) * iW;
-                v.z = (qz + cq[10] * d) * iW;
-                v.w = 1.f;
+            D v = VecTraits<D>::all(1.0f); // presumably fills every component with 1.0f, so a 4th component (when D has one) stays 1 -- confirm against VecTraits
+            v.x = (qx + cq[2] * d) * iW; // divide by the homogeneous coordinate
+            v.y = (qy + cq[6] * d) * iW;
+            v.z = (qz + cq[10] * d) * iW;

-                *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;
-            }
+            xyz(y, x) = v; // store the reprojected point for this pixel
        }

-        template <typename T>
-        inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
+        template <typename T, typename D> // T: disparity element type; D: output vector type; both select the kernel instantiation
+        void reprojectImageTo3D_gpu(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream) // host launcher: copies 16 floats from q into cq, then launches the kernel on stream
        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-            grid.x = divUp(disp.cols, threads.x);
-            grid.y = divUp(disp.rows, threads.y);
+            dim3 block(32, 8); // 32 x 8 = 256 threads per block
+            dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y)); // one thread per pixel of disp; divUp presumably rounds the quotient up -- NOTE(review): cq is a single __constant__ array rewritten on every call, confirm this is safe when several streams are in flight

            cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );

-            reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
+            reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((DevMem2D_<T>)disp, (DevMem2D_<D>)xyz); // the DevMem2Db arguments are converted to element-typed views for the kernel
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
-        {
-            reprojectImageTo3D_caller(disp, xyzw, q, stream);
-        }
-
-        void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
-        {
-            reprojectImageTo3D_caller(disp, xyzw, q, stream);
-        }
+        template void reprojectImageTo3D_gpu<uchar, float3>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream); // explicit instantiations: {uchar, short} disparity x {float3, float4} output, the four combinations referenced from imgproc.cpp
+        template void reprojectImageTo3D_gpu<uchar, float4>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<short, float3>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
+        template void reprojectImageTo3D_gpu<short, float4>(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);

        /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -50,7 +50,7 @@ using namespace cv::gpu;
 void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
 void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria, Stream&) { throw_nogpu(); }
 void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
-void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); }
+void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_nogpu(); }
 void cv::gpu::buildWarpPlaneMaps(Size, Rect, const Mat&, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::buildWarpCylindricalMaps(Size, Rect, const Mat&, const Mat&, float, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
@@ -213,33 +213,29 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
-        void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
+        template <typename T, typename D> // T: disparity element type; D: output vector type
+        void reprojectImageTo3D_gpu(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream); // declaration only: defined and explicitly instantiated in cuda/imgproc.cu
    }
 }}}

-namespace
+void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q, int dst_cn, Stream& stream) // disp: CV_8U or CV_16S disparity; xyz: output, (re)allocated here; Q: 4x4 CV_32F matrix; dst_cn: output channel count, 3 or 4
 {
-    template <typename T>
-    void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
+    using namespace cv::gpu::device::imgproc;
+
+    typedef void (*func_t)(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream); // common signature of every launcher instantiation
+    static const func_t funcs[2][4] = // indexed [dst_cn == 4][disp.type()]; zero entries are never called because the type is asserted first
    {
-        using namespace ::cv::gpu::device::imgproc;
+        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>}, // row 0: 3-channel output; presumably CV_8U and CV_16S are type codes 0 and 3
+        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>} // row 1: 4-channel output
+    };

-        xyzw.create(disp.rows, disp.cols, CV_32FC4);
+    CV_Assert(disp.type() == CV_8U || disp.type() == CV_16S); // also keeps the column index into funcs on a populated entry
+    CV_Assert(Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous()); // Q.ptr<float>() is handed on as 16 consecutive floats, so Q must be continuous
+    CV_Assert(dst_cn == 3 || dst_cn == 4);

-        reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
-    }
+    xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn)); // output: same size as disp, 32-bit float, dst_cn channels

-    typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);
-
-    const reprojectImageTo3D_caller_t reprojectImageTo3D_callers[] = {reprojectImageTo3D_caller<unsigned char>, 0, 0, reprojectImageTo3D_caller<short>, 0, 0, 0, 0};
-}
-
-void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream)
-{
-    CV_Assert((disp.type() == CV_8U || disp.type() == CV_16S) && Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4);
-
-    reprojectImageTo3D_callers[disp.type()](disp, xyzw, Q, StreamAccessor::getStream(stream));
+    funcs[dst_cn == 4][disp.type()](disp, xyz, Q.ptr<float>(), StreamAccessor::getStream(stream)); // dispatch on output channel count and disparity type
 }

 ////////////////////////////////////////////////////////////////////////
@@ -1513,9 +1509,11 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
 {
    using namespace ::cv::gpu::device::canny;

-    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
    CV_Assert(src.type() == CV_8UC1);

+    if (!TargetArchs::builtWith(SHARED_ATOMICS) || !DeviceInfo().supports(SHARED_ATOMICS))
+        CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics");
+
    if( low_thresh > high_thresh )
        std::swap( low_thresh, high_thresh);