fixed gpu::filter2D border interpolation for CV_32FC1 type

added additional tests for gpu filters; fixed gpu features2D tests
2012-03-21 14:38:23 +00:00
parent c1a6cb6221
commit 059cef57e6
16 changed files with 1730 additions and 1515 deletions
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -46,16 +46,16 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////

        texture<uchar4, 2> tex_meanshift;

-        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, 
-                                        size_t out_step, int cols, int rows, 
+        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
+                                        size_t out_step, int cols, int rows,
                                        int sp, int sr, int maxIter, float eps)
        {
            int isr2 = sr*sr;
@@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace device
                {
                    int rowCount = 0;
                    for( int x = minx; x <= maxx; x++ )
-                    {                    
+                    {
                        uchar4 t = tex2D( tex_meanshift, x, y );

                        int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
@@ -128,16 +128,16 @@ namespace cv { namespace gpu { namespace device
                do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
        }

-        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, 
-                                             unsigned char* outsp, size_t outspstep, 
-                                             int cols, int rows, 
+        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
+                                             unsigned char* outsp, size_t outspstep,
+                                             int cols, int rows,
                                             int sp, int sr, int maxIter, float eps)
        {
            int x0 = blockIdx.x * blockDim.x + threadIdx.x;
            int y0 = blockIdx.y * blockDim.y + threadIdx.y;

            if( x0 < cols && y0 < rows )
-            {            
+            {
                int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
                *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
            }
@@ -159,10 +159,10 @@ namespace cv { namespace gpu { namespace device
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );

-            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        
+            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
        }

-        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) 
+        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
        {
            dim3 grid(1, 1, 1);
            dim3 threads(32, 8, 1);
@@ -178,14 +178,14 @@ namespace cv { namespace gpu { namespace device
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );

-            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        
+            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
        }

        /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////

        template <typename T>
        __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
-        {        
+        {
            unsigned int H = ((ndisp-d) * 240)/ndisp;

            unsigned int hi = (H/60) % 6;
@@ -195,7 +195,7 @@ namespace cv { namespace gpu { namespace device
            float t = V * (1 - (1 - f) * S);

            float3 res;
-            
+
            if (hi == 0) //R = V,	G = t,	B = p
            {
                res.x = p;
@@ -208,15 +208,15 @@ namespace cv { namespace gpu { namespace device
                res.x = p;
                res.y = V;
                res.z = q;
-            }        
-            
+            }
+
            if (hi == 2) // R = p,	G = V,	B = t
            {
                res.x = t;
                res.y = V;
                res.z = p;
            }
-                
+
            if (hi == 3) // R = p,	G = q,	B = V
            {
                res.x = V;
@@ -242,15 +242,15 @@ namespace cv { namespace gpu { namespace device
            const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
            const unsigned int a = 255U;

-            return (a << 24) + (r << 16) + (g << 8) + b;    
-        } 
+            return (a << 24) + (r << 16) + (g << 8) + b;
+        }

        __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
        {
            const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if(x < width && y < height) 
+            if(x < width && y < height)
            {
                uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);

@@ -259,7 +259,7 @@ namespace cv { namespace gpu { namespace device
                res.y = cvtPixel(d4.y, ndisp);
                res.z = cvtPixel(d4.z, ndisp);
                res.w = cvtPixel(d4.w, ndisp);
-                        
+
                uint4* line = (uint4*)(out_image + y * out_step);
                line[x >> 2] = res;
            }
@@ -270,12 +270,12 @@ namespace cv { namespace gpu { namespace device
            const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if(x < width && y < height) 
+            if(x < width && y < height)
            {
                short2 d2 = *(short2*)(disp + y * disp_step + x);

                uint2 res;
-                res.x = cvtPixel(d2.x, ndisp);            
+                res.x = cvtPixel(d2.x, ndisp);
                res.y = cvtPixel(d2.y, ndisp);

                uint2* line = (uint2*)(out_image + y * out_step);
@@ -290,12 +290,12 @@ namespace cv { namespace gpu { namespace device
            dim3 grid(1, 1, 1);
            grid.x = divUp(src.cols, threads.x << 2);
            grid.y = divUp(src.rows, threads.y);
-             
+
            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() ); 
+                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)
@@ -304,10 +304,10 @@ namespace cv { namespace gpu { namespace device
            dim3 grid(1, 1, 1);
            grid.x = divUp(src.cols, threads.x << 1);
            grid.y = divUp(src.rows, threads.y);
-             
+
            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
            cudaSafeCall( cudaGetLastError() );
-            
+
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
@@ -318,7 +318,7 @@ namespace cv { namespace gpu { namespace device

        template <typename T>
        __global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)
-        {        
+        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
                float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];
                float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];

-                qx += x * cq[0]; 
+                qx += x * cq[0];
                qy += x * cq[4];
                qz += x * cq[8];
                qw += x * cq[12];
@@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace device
            bindTexture(&harrisDxTex, Dx);
            bindTexture(&harrisDyTex, Dy);

-            switch (border_type) 
+            switch (border_type)
            {
            case BORDER_REFLECT101_GPU:
                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
@@ -565,7 +565,7 @@ namespace cv { namespace gpu { namespace device
        {
            dim3 block(32, 8);
            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
-            
+
            bindTexture(&minEigenValDxTex, Dx);
            bindTexture(&minEigenValDyTex, Dy);

@@ -630,10 +630,10 @@ namespace cv { namespace gpu { namespace device

        __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;    
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;    
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
            }
@@ -658,10 +658,10 @@ namespace cv { namespace gpu { namespace device

        __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;    
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;    
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
            }
@@ -689,7 +689,7 @@ namespace cv { namespace gpu { namespace device
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
@@ -718,7 +718,7 @@ namespace cv { namespace gpu { namespace device
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
@@ -736,7 +736,7 @@ namespace cv { namespace gpu { namespace device

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
-        }    
+        }

        //////////////////////////////////////////////////////////////////////////
        // buildWarpMaps
@@ -842,7 +842,7 @@ namespace cv { namespace gpu { namespace device


        void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
-                                const float k_rinv[9], const float r_kinv[9], const float t[3], 
+                                const float k_rinv[9], const float r_kinv[9], const float t[3],
                                float scale, cudaStream_t stream)
        {
            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
@@ -911,27 +911,28 @@ namespace cv { namespace gpu { namespace device

        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];

-        texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeBorder);
+        texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeClamp);

-        __global__ void filter2D(int ofsX, int ofsY, DevMem2Df dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        __global__ void filter2D(int ofsX, int ofsY, PtrStepf dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY, const BrdReflect101<float> brd)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x >= dst.cols || y >= dst.rows)
+            if (x > brd.last_col || y > brd.last_row)
                return;

            float res = 0;
-
-            const int baseX = ofsX + x - anchorX;
-            const int baseY = ofsY + y - anchorY;
-
            int kInd = 0;

            for (int i = 0; i < kHeight; ++i)
            {
                for (int j = 0; j < kWidth; ++j)
-                    res += tex2D(filter2DTex, baseX + j, baseY + i) * c_filter2DKernel[kInd++];
+                {
+                    const int srcX = ofsX + brd.idx_col(x - anchorX + j);
+                    const int srcY = ofsY + brd.idx_row(y - anchorY + i);
+
+                    res += tex2D(filter2DTex, srcX, srcY) * c_filter2DKernel[kInd++];
+                }
            }

            dst.ptr(y)[x] = res;
@@ -946,7 +947,9 @@ namespace cv { namespace gpu { namespace device

            bindTexture(&filter2DTex, src);

-            filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY);
+            BrdReflect101<float> brd(dst.rows, dst.cols);
+
+            filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY, brd);
            cudaSafeCall(cudaGetLastError());

            if (stream == 0)