Use the new device layer for cv::gpu::add (appears as cv::cuda::add in the diff below)

2013-08-23 18:28:13 +04:00
parent 32d578f5f0
commit 9c5da2ea22
5 changed files with 280 additions and 447 deletions
--- a/modules/cudaarithm/src/element_operations.cpp
+++ b/modules/cudaarithm/src/element_operations.cpp
@@ -336,248 +336,9 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // add

-namespace arithm
-{
-    void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
-    void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int);

-    template <typename T, typename D>
-    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
-
-static void addMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    static const func_t funcs[7][7] =
-    {
-        {
-            arithm::addMat<unsigned char, unsigned char>,
-            arithm::addMat<unsigned char, signed char>,
-            arithm::addMat<unsigned char, unsigned short>,
-            arithm::addMat<unsigned char, short>,
-            arithm::addMat<unsigned char, int>,
-            arithm::addMat<unsigned char, float>,
-            arithm::addMat<unsigned char, double>
-        },
-        {
-            arithm::addMat<signed char, unsigned char>,
-            arithm::addMat<signed char, signed char>,
-            arithm::addMat<signed char, unsigned short>,
-            arithm::addMat<signed char, short>,
-            arithm::addMat<signed char, int>,
-            arithm::addMat<signed char, float>,
-            arithm::addMat<signed char, double>
-        },
-        {
-            0 /*arithm::addMat<unsigned short, unsigned char>*/,
-            0 /*arithm::addMat<unsigned short, signed char>*/,
-            arithm::addMat<unsigned short, unsigned short>,
-            arithm::addMat<unsigned short, short>,
-            arithm::addMat<unsigned short, int>,
-            arithm::addMat<unsigned short, float>,
-            arithm::addMat<unsigned short, double>
-        },
-        {
-            0 /*arithm::addMat<short, unsigned char>*/,
-            0 /*arithm::addMat<short, signed char>*/,
-            arithm::addMat<short, unsigned short>,
-            arithm::addMat<short, short>,
-            arithm::addMat<short, int>,
-            arithm::addMat<short, float>,
-            arithm::addMat<short, double>
-        },
-        {
-            0 /*arithm::addMat<int, unsigned char>*/,
-            0 /*arithm::addMat<int, signed char>*/,
-            0 /*arithm::addMat<int, unsigned short>*/,
-            0 /*arithm::addMat<int, short>*/,
-            arithm::addMat<int, int>,
-            arithm::addMat<int, float>,
-            arithm::addMat<int, double>
-        },
-        {
-            0 /*arithm::addMat<float, unsigned char>*/,
-            0 /*arithm::addMat<float, signed char>*/,
-            0 /*arithm::addMat<float, unsigned short>*/,
-            0 /*arithm::addMat<float, short>*/,
-            0 /*arithm::addMat<float, int>*/,
-            arithm::addMat<float, float>,
-            arithm::addMat<float, double>
-        },
-        {
-            0 /*arithm::addMat<double, unsigned char>*/,
-            0 /*arithm::addMat<double, signed char>*/,
-            0 /*arithm::addMat<double, unsigned short>*/,
-            0 /*arithm::addMat<double, short>*/,
-            0 /*arithm::addMat<double, int>*/,
-            0 /*arithm::addMat<double, float>*/,
-            arithm::addMat<double, double>
-        }
-    };
-
-    const int sdepth = src1.depth();
-    const int ddepth = dst.depth();
-    const int cn = src1.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
-
-    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
-    {
-        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
-        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
-        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
-
-        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
-
-        if (isAllAligned)
-        {
-            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
-            {
-                const int vcols = src1_.cols >> 2;
-
-                arithm::addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                  stream);
-
-                return;
-            }
-            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
-            {
-                const int vcols = src1_.cols >> 1;
-
-                arithm::addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
-                                  PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
-                                  stream);
-
-                return;
-            }
-        }
-    }
-
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src1_, src2_, dst_, mask, stream);
-}
-
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
-
-static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int)
-{
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    static const func_t funcs[7][7] =
-    {
-        {
-            arithm::addScalar<unsigned char, float, unsigned char>,
-            arithm::addScalar<unsigned char, float, signed char>,
-            arithm::addScalar<unsigned char, float, unsigned short>,
-            arithm::addScalar<unsigned char, float, short>,
-            arithm::addScalar<unsigned char, float, int>,
-            arithm::addScalar<unsigned char, float, float>,
-            arithm::addScalar<unsigned char, double, double>
-        },
-        {
-            arithm::addScalar<signed char, float, unsigned char>,
-            arithm::addScalar<signed char, float, signed char>,
-            arithm::addScalar<signed char, float, unsigned short>,
-            arithm::addScalar<signed char, float, short>,
-            arithm::addScalar<signed char, float, int>,
-            arithm::addScalar<signed char, float, float>,
-            arithm::addScalar<signed char, double, double>
-        },
-        {
-            0 /*arithm::addScalar<unsigned short, float, unsigned char>*/,
-            0 /*arithm::addScalar<unsigned short, float, signed char>*/,
-            arithm::addScalar<unsigned short, float, unsigned short>,
-            arithm::addScalar<unsigned short, float, short>,
-            arithm::addScalar<unsigned short, float, int>,
-            arithm::addScalar<unsigned short, float, float>,
-            arithm::addScalar<unsigned short, double, double>
-        },
-        {
-            0 /*arithm::addScalar<short, float, unsigned char>*/,
-            0 /*arithm::addScalar<short, float, signed char>*/,
-            arithm::addScalar<short, float, unsigned short>,
-            arithm::addScalar<short, float, short>,
-            arithm::addScalar<short, float, int>,
-            arithm::addScalar<short, float, float>,
-            arithm::addScalar<short, double, double>
-        },
-        {
-            0 /*arithm::addScalar<int, float, unsigned char>*/,
-            0 /*arithm::addScalar<int, float, signed char>*/,
-            0 /*arithm::addScalar<int, float, unsigned short>*/,
-            0 /*arithm::addScalar<int, float, short>*/,
-            arithm::addScalar<int, float, int>,
-            arithm::addScalar<int, float, float>,
-            arithm::addScalar<int, double, double>
-        },
-        {
-            0 /*arithm::addScalar<float, float, unsigned char>*/,
-            0 /*arithm::addScalar<float, float, signed char>*/,
-            0 /*arithm::addScalar<float, float, unsigned short>*/,
-            0 /*arithm::addScalar<float, float, short>*/,
-            0 /*arithm::addScalar<float, float, int>*/,
-            arithm::addScalar<float, float, float>,
-            arithm::addScalar<float, double, double>
-        },
-        {
-            0 /*arithm::addScalar<double, double, unsigned char>*/,
-            0 /*arithm::addScalar<double, double, signed char>*/,
-            0 /*arithm::addScalar<double, double, unsigned short>*/,
-            0 /*arithm::addScalar<double, double, short>*/,
-            0 /*arithm::addScalar<double, double, int>*/,
-            0 /*arithm::addScalar<double, double, float>*/,
-            arithm::addScalar<double, double, double>
-        }
-    };
-
-    typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
-    static const npp_func_t npp_funcs[7][4] =
-    {
-        {NppArithmScalar<CV_8U , 1, nppiAddC_8u_C1RSfs >::call, 0                                                     , NppArithmScalar<CV_8U , 3, nppiAddC_8u_C3RSfs >::call, NppArithmScalar<CV_8U , 4, nppiAddC_8u_C4RSfs >::call},
-        {0                                                    , 0                                                     , 0                                                    , 0                                                    },
-        {NppArithmScalar<CV_16U, 1, nppiAddC_16u_C1RSfs>::call, 0                                                     , NppArithmScalar<CV_16U, 3, nppiAddC_16u_C3RSfs>::call, NppArithmScalar<CV_16U, 4, nppiAddC_16u_C4RSfs>::call},
-        {NppArithmScalar<CV_16S, 1, nppiAddC_16s_C1RSfs>::call, NppArithmScalar<CV_16S, 2, nppiAddC_16sc_C1RSfs>::call, NppArithmScalar<CV_16S, 3, nppiAddC_16s_C3RSfs>::call, NppArithmScalar<CV_16S, 4, nppiAddC_16s_C4RSfs>::call},
-        {NppArithmScalar<CV_32S, 1, nppiAddC_32s_C1RSfs>::call, NppArithmScalar<CV_32S, 2, nppiAddC_32sc_C1RSfs>::call, NppArithmScalar<CV_32S, 3, nppiAddC_32s_C3RSfs>::call, 0                                                    },
-        {NppArithmScalar<CV_32F, 1, nppiAddC_32f_C1R   >::call, NppArithmScalar<CV_32F, 2, nppiAddC_32fc_C1R   >::call, NppArithmScalar<CV_32F, 3, nppiAddC_32f_C3R   >::call, NppArithmScalar<CV_32F, 4, nppiAddC_32f_C4R   >::call},
-        {0                                                    , 0                                                     , 0                                                    , 0                                                    }
-    };
-
-    const int sdepth = src.depth();
-    const int ddepth = dst.depth();
-    const int cn = src.channels();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-
-    const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
-    if (ddepth == sdepth && cn > 1 && npp_func != 0)
-    {
-        npp_func(src, val, dst, stream);
-        return;
-    }
-
-    CV_Assert( cn == 1 );
-
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src, val[0], dst, mask, stream);
-}
+void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int);

 void cv::cuda::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream)
 {