optimizations:

- new reduce implementation (with kepler optimizations) - saturate_cast via asm command - video SIMD instructions in element operations - float arithmetics instead of double - new deviceSupports function
2013-01-23 14:43:36 +04:00
parent ae6266e101
commit 281d036fcf
45 changed files with 9379 additions and 8846 deletions
--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp
@@ -61,13 +61,13 @@ namespace cv { namespace gpu { namespace device

        template <typename T>
        void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-                            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);

        template <typename T>
        void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
-                            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+                            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
    }
 }}}

@@ -143,33 +143,31 @@ namespace
    {
        typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;

-        static void call(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst,
-                         double coeffs[][3], cv::Size dsize, int interpolation, cudaStream_t stream)
+        static void call(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
        {
            static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};

-            dst.create(dsize, src.type());
-            dst.setTo(cv::Scalar::all(0));
-
            NppiSize srcsz;
-            srcsz.height = wholeSize.height;
-            srcsz.width = wholeSize.width;
+            srcsz.height = src.rows;
+            srcsz.width = src.cols;

            NppiRect srcroi;
-            srcroi.x = ofs.x;
-            srcroi.y = ofs.y;
+            srcroi.x = 0;
+            srcroi.y = 0;
            srcroi.height = src.rows;
            srcroi.width = src.cols;

            NppiRect dstroi;
-            dstroi.x = dstroi.y = 0;
+            dstroi.x = 0;
+            dstroi.y = 0;
            dstroi.height = dst.rows;
            dstroi.width = dst.cols;

            cv::gpu::NppStreamHandler h(stream);

-            nppSafeCall( func((npp_t*)src.datastart, srcsz, static_cast<int>(src.step), srcroi,
-                              dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, coeffs, npp_inter[interpolation]) );
+            nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
+                              dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi,
+                              coeffs, npp_inter[interpolation]) );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
@@ -187,6 +185,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);

+    dst.create(dsize, src.type());
+
    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);
@@ -231,8 +231,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        }
    };

-    bool useNpp = borderMode == BORDER_CONSTANT;
-    useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
    #ifdef linux
        // NPP bug on float data
        useNpp = useNpp && src.depth() != CV_32F;
@@ -240,7 +239,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz

    if (useNpp)
    {
-        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
+        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);

        static const func_t funcs[2][6][4] =
        {
@@ -262,6 +261,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
            }
        };

+        dst.setTo(borderValue);
+
        double coeffs[2][3];
        Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
        M.convertTo(coeffsMat, coeffsMat.type());
@@ -269,14 +270,14 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

-        func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
+        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
    }
    else
    {
        using namespace cv::gpu::device::imgproc;

        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

        static const func_t funcs[6][4] =
        {
@@ -294,8 +295,6 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        int gpuBorderType;
        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));

-        dst.create(dsize, src.type());
-
        float coeffs[2 * 3];
        Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);

@@ -311,11 +310,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

-        DeviceInfo info;
-        int cc = info.majorVersion() * 10 + info.minorVersion();
-
        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
-            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
+            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
    }
 }

@@ -329,6 +325,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);

+    dst.create(dsize, src.type());
+
    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);
@@ -373,8 +371,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        }
    };

-    bool useNpp = borderMode == BORDER_CONSTANT;
-    useNpp = useNpp && useNppTab[src.depth()][src.channels() - 1][interpolation];
+    bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
    #ifdef linux
        // NPP bug on float data
        useNpp = useNpp && src.depth() != CV_32F;
@@ -382,7 +379,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size

    if (useNpp)
    {
-        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::Size wholeSize, cv::Point ofs, cv::gpu::GpuMat& dst, double coeffs[][3], cv::Size dsize, int flags, cudaStream_t stream);
+        typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);

        static const func_t funcs[2][6][4] =
        {
@@ -404,6 +401,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
            }
        };

+        dst.setTo(borderValue);
+
        double coeffs[3][3];
        Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
        M.convertTo(coeffsMat, coeffsMat.type());
@@ -411,14 +410,14 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
        CV_Assert(func != 0);

-        func(src, wholeSize, ofs, dst, coeffs, dsize, interpolation, StreamAccessor::getStream(s));
+        func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
    }
    else
    {
        using namespace cv::gpu::device::imgproc;

        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-            int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+            int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

        static const func_t funcs[6][4] =
        {
@@ -436,8 +435,6 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        int gpuBorderType;
        CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));

-        dst.create(dsize, src.type());
-
        float coeffs[3 * 3];
        Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);

@@ -453,11 +450,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
        Scalar_<float> borderValueFloat;
        borderValueFloat = borderValue;

-        DeviceInfo info;
-        int cc = info.majorVersion() * 10 + info.minorVersion();
-
        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
-            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), cc);
+            dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
    }
 }