gpu module refactoring: moved per-element operations into separated file

2010-12-20 09:07:19 +00:00
parent 6891a60149
commit 0465b89e7e
6 changed files with 1048 additions and 915 deletions
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -78,6 +78,29 @@ namespace cv { namespace gpu { namespace mathfunc
        }
    }

+
+    struct Mask8U
+    {
+        explicit Mask8U(PtrStep mask): mask(mask) {}
+
+        __device__ bool operator()(int y, int x) const 
+        { 
+            return mask.ptr(y)[x]; 
+        }
+
+        PtrStep mask;
+    };
+
+
+    struct MaskTrue 
+    { 
+        __device__ bool operator()(int y, int x) const 
+        { 
+            return true; 
+        } 
+    };
+
+
    struct Nothing
    {
        static __device__ void calc(int, int, float, float, float*, size_t, float)
@@ -235,313 +258,6 @@ namespace cv { namespace gpu { namespace mathfunc
        callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
    }

-//////////////////////////////////////////////////////////////////////////////////////
-// Compare
-
-    template <typename T1, typename T2>
-    struct NotEqual
-    {
-        __device__ uchar operator()(const T1& src1, const T2& src2)
-        {
-            return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
-        }
-    };
-
-    template <typename T1, typename T2>
-    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
-    {
-        NotEqual<T1, T2> op;
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
-    }
-
-    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
-    {
-        compare_ne<uint, uint>(src1, src2, dst);
-    }
-    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
-    {
-        compare_ne<float, float>(src1, src2, dst);
-    }
-
-
-//////////////////////////////////////////////////////////////////////////////
-// Per-element bit-wise logical matrix operations
-
-    struct Mask8U
-    {
-        explicit Mask8U(PtrStep mask): mask(mask) {}
-
-        __device__ bool operator()(int y, int x) const 
-        { 
-            return mask.ptr(y)[x]; 
-        }
-
-        PtrStep mask;
-    };
-
-
-    struct MaskTrue 
-    { 
-        __device__ bool operator()(int y, int x) const 
-        { 
-            return true; 
-        } 
-    };
-
-    //------------------------------------------------------------------------
-    // Unary operations
-
-    enum { UN_OP_NOT };
-
-    template <typename T, int opid>
-    struct UnOp;
-
-    template <typename T>
-    struct UnOp<T, UN_OP_NOT>
-    { 
-        static __device__ T call(T v) { return ~v; }
-    };
-
-
-    template <int opid>
-    __global__ void bitwise_un_op_kernel(int rows, int width, const PtrStep src, PtrStep dst)
-    {
-        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows) 
-        {
-            uchar* dst_ptr = dst.ptr(y) + x;
-            const uchar* src_ptr = src.ptr(y) + x;
-            if (x + sizeof(uint) - 1 < width)
-            {
-                *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
-            }
-            else
-            {
-                const uchar* src_end = src.ptr(y) + width;
-                while (src_ptr < src_end)
-                {
-                    *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
-                }
-            }
-        }
-    }
-
-
-    template <int opid>
-    void bitwise_un_op(int rows, int width, const PtrStep src, PtrStep dst, cudaStream_t stream)
-    {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(width, threads.x * sizeof(uint)), 
-                  divUp(rows, threads.y));
-
-        bitwise_un_op_kernel<opid><<<grid, threads>>>(rows, width, src, dst);
-
-        if (stream == 0) 
-            cudaSafeCall(cudaThreadSynchronize());
-    }
-
-
-    template <typename T, int opid>
-    __global__ void bitwise_un_op_kernel(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst)
-    {
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < cols && y < rows && mask.ptr(y)[x / cn]) 
-        {
-            T* dst_row = (T*)dst.ptr(y);
-            const T* src_row = (const T*)src.ptr(y);
-
-            dst_row[x] = UnOp<T, opid>::call(src_row[x]);
-        }
-    }
-
-
-    template <typename T, int opid>
-    void bitwise_un_op(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
-    {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-        bitwise_un_op_kernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst); 
-
-        if (stream == 0) 
-            cudaSafeCall(cudaThreadSynchronize());
-    }
-
-
-    void bitwise_not_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_un_op<UN_OP_NOT>(rows, cols * elem_size1 * cn, src, dst, stream);
-    }
-
-
-    template <typename T>
-    void bitwise_mask_not_caller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_un_op<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);
-    }
-
-    template void bitwise_mask_not_caller<uchar>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_not_caller<ushort>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_not_caller<uint>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-
-    //------------------------------------------------------------------------
-    // Binary operations
-
-    enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };
-
-    template <typename T, int opid>
-    struct BinOp;
-
-    template <typename T>
-    struct BinOp<T, BIN_OP_OR>
-    { 
-        static __device__ T call(T a, T b) { return a | b; } 
-    };
-
-
-    template <typename T>
-    struct BinOp<T, BIN_OP_AND>
-    { 
-        static __device__ T call(T a, T b) { return a & b; } 
-    };
-
-    template <typename T>
-    struct BinOp<T, BIN_OP_XOR>
-    { 
-        static __device__ T call(T a, T b) { return a ^ b; } 
-    };
-
-
-    template <int opid>
-    __global__ void bitwise_bin_op_kernel(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst)
-    {
-        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows) 
-        {
-            uchar* dst_ptr = dst.ptr(y) + x;
-            const uchar* src1_ptr = src1.ptr(y) + x;
-            const uchar* src2_ptr = src2.ptr(y) + x;
-
-            if (x + sizeof(uint) - 1 < width)
-            {
-                *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);
-            }
-            else
-            {
-                const uchar* src1_end = src1.ptr(y) + width;
-                while (src1_ptr < src1_end)
-                {
-                    *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);
-                }
-            }
-        }
-    }
-
-
-    template <int opid>
-    void bitwise_bin_op(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst, 
-                        cudaStream_t stream)
-    {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));
-
-        bitwise_bin_op_kernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);
-
-        if (stream == 0) 
-            cudaSafeCall(cudaThreadSynchronize());
-    }
-
-
-    template <typename T, int opid>
-    __global__ void bitwise_bin_op_kernel(
-            int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, 
-            const PtrStep mask, PtrStep dst)
-    {
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < cols && y < rows && mask.ptr(y)[x / cn]) 
-        {
-            T* dst_row = (T*)dst.ptr(y);
-            const T* src1_row = (const T*)src1.ptr(y);
-            const T* src2_row = (const T*)src2.ptr(y);
-
-            dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
-        }
-    }
-
-
-    template <typename T, int opid>
-    void bitwise_bin_op(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, 
-                        const PtrStep mask, PtrStep dst, cudaStream_t stream)
-    {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-        bitwise_bin_op_kernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst); 
-
-        if (stream == 0) 
-            cudaSafeCall(cudaThreadSynchronize());
-    }
-
-
-    void bitwise_or_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_bin_op<BIN_OP_OR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
-    }
-
-
-    template <typename T>
-    void bitwise_mask_or_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_bin_op<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
-    }
-
-    template void bitwise_mask_or_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_or_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_or_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-
-
-    void bitwise_and_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_bin_op<BIN_OP_AND>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
-    }
-
-
-    template <typename T>
-    void bitwise_mask_and_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_bin_op<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
-    }
-
-    template void bitwise_mask_and_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_and_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_and_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-
-
-    void bitwise_xor_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_bin_op<BIN_OP_XOR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
-    }
-
-
-    template <typename T>
-    void bitwise_mask_xor_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
-    {
-        bitwise_bin_op<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
-    }
-
-    template void bitwise_mask_xor_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_xor_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-    template void bitwise_mask_xor_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
-
-

 //////////////////////////////////////////////////////////////////////////////
 // Min max