Merge remote-tracking branch 'origin/2.4'
Pull requests:

    #943 from jet47:cuda-5.5-support
    #944 from jet47:cmake-2.8.11-cuda-fix
    #912 from SpecLad:contributing
    #934 from SpecLad:parallel-for
    #931 from jet47:gpu-test-fixes
    #932 from bitwangyaoyao:2.4_fixBFM
    #918 from bitwangyaoyao:2.4_samples
    #924 from pengx17:2.4_arithm_fix
    #925 from pengx17:2.4_canny_tmp_fix
    #927 from bitwangyaoyao:2.4_perf
    #930 from pengx17:2.4_haar_ext
    #928 from apavlenko:bugfix_3027
    #920 from asmorkalov:android_move
    #910 from pengx17:2.4_oclgfft
    #913 from janm399:2.4
    #916 from bitwangyaoyao:2.4_fixPyrLK
    #919 from abidrahmank:2.4
    #923 from pengx17:2.4_macfix

Conflicts:
    modules/calib3d/src/stereobm.cpp
    modules/features2d/src/detectors.cpp
    modules/gpu/src/error.cpp
    modules/gpu/src/precomp.hpp
    modules/imgproc/src/distransform.cpp
    modules/imgproc/src/morph.cpp
    modules/ocl/include/opencv2/ocl/ocl.hpp
    modules/ocl/perf/perf_color.cpp
    modules/ocl/perf/perf_imgproc.cpp
    modules/ocl/perf/perf_match_template.cpp
    modules/ocl/perf/precomp.cpp
    modules/ocl/perf/precomp.hpp
    modules/ocl/src/arithm.cpp
    modules/ocl/src/canny.cpp
    modules/ocl/src/filtering.cpp
    modules/ocl/src/haar.cpp
    modules/ocl/src/hog.cpp
    modules/ocl/src/imgproc.cpp
    modules/ocl/src/opencl/haarobjectdetect.cl
    modules/ocl/src/pyrlk.cpp
    modules/video/src/bgfg_gaussmix2.cpp
    modules/video/src/lkpyramid.cpp
    platforms/linux/scripts/cmake_arm_gnueabi_hardfp.sh
    platforms/linux/scripts/cmake_arm_gnueabi_softfp.sh
    platforms/scripts/ABI_compat_generator.py
    samples/ocl/facedetect.cpp
@@ -150,7 +150,7 @@ namespace
     }

     // Computes rotation, translation pair for small subsets if the input data
-    class TransformHypothesesGenerator
+    class TransformHypothesesGenerator : public ParallelLoopBody
     {
     public:
         TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
@@ -160,7 +160,7 @@ namespace
               num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
               transl_vectors(transl_vectors_) {}

-        void operator()(const BlockedRange& range) const
+        void operator()(const Range& range) const
         {
             // Input data for generation of the current hypothesis
             std::vector<int> subset_indices(subset_size);
@@ -172,7 +172,7 @@ namespace
             Mat rot_mat(3, 3, CV_64F);
             Mat transl_vec(1, 3, CV_64F);

-            for (int iter = range.begin(); iter < range.end(); ++iter)
+            for (int iter = range.start; iter < range.end; ++iter)
             {
                 selectRandom(subset_size, num_points, subset_indices);
                 for (int i = 0; i < subset_size; ++i)
@@ -238,7 +238,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
     // Generate set of hypotheses using small subsets of the input data
     TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
                                       num_points, subset_size, rot_matrices, transl_vectors);
-    parallel_for(BlockedRange(0, num_iters), body);
+    parallel_for_(Range(0, num_iters), body);

     // Compute scores (i.e. number of inliers) for each hypothesis
     GpuMat d_object(object);
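Note on the hunks above: they port the RANSAC hypothesis generator from the legacy BlockedRange/parallel_for API to cv::ParallelLoopBody and cv::parallel_for_, so the iteration bounds move from the range.begin()/range.end() methods to the Range::start/Range::end fields. A minimal sketch of the target pattern, assuming OpenCV 2.4.x headers (SquareBody is a hypothetical example body, not part of this commit):

    #include <vector>
    #include <opencv2/core/core.hpp>

    // A body class in the new style: derive from ParallelLoopBody and read
    // the bounds from the Range fields rather than BlockedRange methods.
    struct SquareBody : public cv::ParallelLoopBody
    {
        explicit SquareBody(std::vector<int>& data_) : data(&data_) {}

        void operator()(const cv::Range& range) const
        {
            for (int i = range.start; i < range.end; ++i)
                (*data)[i] *= (*data)[i];
        }

        std::vector<int>* data;
    };

    // usage:
    //   std::vector<int> v(1000, 2);
    //   cv::parallel_for_(cv::Range(0, (int)v.size()), SquareBody(v));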
@@ -67,8 +67,8 @@ namespace cv { namespace gpu { namespace cudev
                                   crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
                                   crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
            }
-           __device__ __forceinline__ TransformOp() {}
-           __device__ __forceinline__ TransformOp(const TransformOp&) {}
+           __host__ __device__ __forceinline__ TransformOp() {}
+           __host__ __device__ __forceinline__ TransformOp(const TransformOp&) {}
        };

        void call(const PtrStepSz<float3> src, const float* rot,
@@ -106,8 +106,8 @@ namespace cv { namespace gpu { namespace cudev
                                   (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
                                   (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
            }
-           __device__ __forceinline__ ProjectOp() {}
-           __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
+           __host__ __device__ __forceinline__ ProjectOp() {}
+           __host__ __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
        };

        void call(const PtrStepSz<float3> src, const float* rot,
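Note: every functor constructor in this merge gains a __host__ qualifier. A plausible motivation, consistent with the cuda-5.5-support pull request above: these functors are constructed and copied by value on the host before the kernel runs, and CUDA 5.5's nvcc is stricter about host code invoking __device__-only constructors. A compilable sketch under that assumption (PlusOne and apply are illustrative names, not OpenCV code):

    struct PlusOne
    {
        __device__ __forceinline__ int operator()(int x) const { return x + 1; }

        // Host-callable constructors let the host construct the functor and
        // copy it into the kernel's by-value parameter.
        __host__ __device__ __forceinline__ PlusOne() {}
        __host__ __device__ __forceinline__ PlusOne(const PlusOne&) {}
    };

    __global__ void apply(const int* in, int* out, int n, PlusOne op)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = op(in[i]);   // the functor arrived as a host-made copy
    }

    // host side: apply<<<blocks, threads>>>(d_in, d_out, n, PlusOne());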
@@ -62,8 +62,8 @@ namespace canny
            return ::abs(x) + ::abs(y);
        }

-       __device__ __forceinline__ L1() {}
-       __device__ __forceinline__ L1(const L1&) {}
+       __host__ __device__ __forceinline__ L1() {}
+       __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
@@ -72,8 +72,8 @@ namespace canny
            return ::sqrtf(x * x + y * y);
        }

-       __device__ __forceinline__ L2() {}
-       __device__ __forceinline__ L2(const L2&) {}
+       __host__ __device__ __forceinline__ L2() {}
+       __host__ __device__ __forceinline__ L2(const L2&) {}
    };
}

@@ -470,8 +470,8 @@ namespace canny
            return (uchar)(-(e >> 1));
        }

-       __device__ __forceinline__ GetEdges() {}
-       __device__ __forceinline__ GetEdges(const GetEdges&) {}
+       __host__ __device__ __forceinline__ GetEdges() {}
+       __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}
@@ -162,8 +162,8 @@ namespace arithm
            return vadd4(a, b);
        }

-       __device__ __forceinline__ VAdd4() {}
-       __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+       __host__ __device__ __forceinline__ VAdd4() {}
+       __host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
    };

    ////////////////////////////////////
@@ -175,8 +175,8 @@ namespace arithm
            return vadd2(a, b);
        }

-       __device__ __forceinline__ VAdd2() {}
-       __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+       __host__ __device__ __forceinline__ VAdd2() {}
+       __host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
    };

    ////////////////////////////////////
@@ -188,8 +188,8 @@ namespace arithm
            return saturate_cast<D>(a + b);
        }

-       __device__ __forceinline__ AddMat() {}
-       __device__ __forceinline__ AddMat(const AddMat& other) {}
+       __host__ __device__ __forceinline__ AddMat() {}
+       __host__ __device__ __forceinline__ AddMat(const AddMat&) {}
    };
}

@@ -397,8 +397,8 @@ namespace arithm
            return vsub4(a, b);
        }

-       __device__ __forceinline__ VSub4() {}
-       __device__ __forceinline__ VSub4(const VSub4& other) {}
+       __host__ __device__ __forceinline__ VSub4() {}
+       __host__ __device__ __forceinline__ VSub4(const VSub4&) {}
    };

    ////////////////////////////////////
@@ -410,8 +410,8 @@ namespace arithm
            return vsub2(a, b);
        }

-       __device__ __forceinline__ VSub2() {}
-       __device__ __forceinline__ VSub2(const VSub2& other) {}
+       __host__ __device__ __forceinline__ VSub2() {}
+       __host__ __device__ __forceinline__ VSub2(const VSub2&) {}
    };

    ////////////////////////////////////
@@ -423,8 +423,8 @@ namespace arithm
            return saturate_cast<D>(a - b);
        }

-       __device__ __forceinline__ SubMat() {}
-       __device__ __forceinline__ SubMat(const SubMat& other) {}
+       __host__ __device__ __forceinline__ SubMat() {}
+       __host__ __device__ __forceinline__ SubMat(const SubMat&) {}
    };
}

@@ -617,8 +617,8 @@ namespace arithm
            return res;
        }

-       __device__ __forceinline__ Mul_8uc4_32f() {}
-       __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
+       __host__ __device__ __forceinline__ Mul_8uc4_32f() {}
+       __host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
    };

    struct Mul_16sc4_32f : binary_function<short4, float, short4>
@@ -629,8 +629,8 @@ namespace arithm
                               saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
        }

-       __device__ __forceinline__ Mul_16sc4_32f() {}
-       __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
+       __host__ __device__ __forceinline__ Mul_16sc4_32f() {}
+       __host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
    };

    template <typename T, typename D> struct Mul : binary_function<T, T, D>
@@ -640,8 +640,8 @@ namespace arithm
            return saturate_cast<D>(a * b);
        }

-       __device__ __forceinline__ Mul() {}
-       __device__ __forceinline__ Mul(const Mul& other) {}
+       __host__ __device__ __forceinline__ Mul() {}
+       __host__ __device__ __forceinline__ Mul(const Mul&) {}
    };

    template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
@@ -888,8 +888,8 @@ namespace arithm
            return b != 0 ? saturate_cast<D>(a / b) : 0;
        }

-       __device__ __forceinline__ Div() {}
-       __device__ __forceinline__ Div(const Div& other) {}
+       __host__ __device__ __forceinline__ Div() {}
+       __host__ __device__ __forceinline__ Div(const Div&) {}
    };
    template <typename T> struct Div<T, float> : binary_function<T, T, float>
    {
@@ -898,8 +898,8 @@ namespace arithm
            return b != 0 ? static_cast<float>(a) / b : 0;
        }

-       __device__ __forceinline__ Div() {}
-       __device__ __forceinline__ Div(const Div& other) {}
+       __host__ __device__ __forceinline__ Div() {}
+       __host__ __device__ __forceinline__ Div(const Div&) {}
    };
    template <typename T> struct Div<T, double> : binary_function<T, T, double>
    {
@@ -908,8 +908,8 @@ namespace arithm
            return b != 0 ? static_cast<double>(a) / b : 0;
        }

-       __device__ __forceinline__ Div() {}
-       __device__ __forceinline__ Div(const Div& other) {}
+       __host__ __device__ __forceinline__ Div() {}
+       __host__ __device__ __forceinline__ Div(const Div&) {}
    };

    template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
@@ -1196,8 +1196,8 @@ namespace arithm
            return vabsdiff4(a, b);
        }

-       __device__ __forceinline__ VAbsDiff4() {}
-       __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+       __host__ __device__ __forceinline__ VAbsDiff4() {}
+       __host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
    };

    ////////////////////////////////////
@@ -1209,8 +1209,8 @@ namespace arithm
            return vabsdiff2(a, b);
        }

-       __device__ __forceinline__ VAbsDiff2() {}
-       __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+       __host__ __device__ __forceinline__ VAbsDiff2() {}
+       __host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
    };

    ////////////////////////////////////
@@ -1235,8 +1235,8 @@ namespace arithm
            return saturate_cast<T>(_abs(a - b));
        }

-       __device__ __forceinline__ AbsDiffMat() {}
-       __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
+       __host__ __device__ __forceinline__ AbsDiffMat() {}
+       __host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
    };
}

@@ -1370,8 +1370,8 @@ namespace arithm
            return saturate_cast<T>(x * x);
        }

-       __device__ __forceinline__ Sqr() {}
-       __device__ __forceinline__ Sqr(const Sqr& other) {}
+       __host__ __device__ __forceinline__ Sqr() {}
+       __host__ __device__ __forceinline__ Sqr(const Sqr&) {}
    };
}

@@ -1466,8 +1466,8 @@ namespace arithm
            return saturate_cast<T>(f(x));
        }

-       __device__ __forceinline__ Exp() {}
-       __device__ __forceinline__ Exp(const Exp& other) {}
+       __host__ __device__ __forceinline__ Exp() {}
+       __host__ __device__ __forceinline__ Exp(const Exp&) {}
    };
}

@@ -1507,8 +1507,8 @@ namespace arithm
            return vcmpeq4(a, b);
        }

-       __device__ __forceinline__ VCmpEq4() {}
-       __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+       __host__ __device__ __forceinline__ VCmpEq4() {}
+       __host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
    };
    struct VCmpNe4 : binary_function<uint, uint, uint>
    {
@@ -1517,8 +1517,8 @@ namespace arithm
            return vcmpne4(a, b);
        }

-       __device__ __forceinline__ VCmpNe4() {}
-       __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+       __host__ __device__ __forceinline__ VCmpNe4() {}
+       __host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
    };
    struct VCmpLt4 : binary_function<uint, uint, uint>
    {
@@ -1527,8 +1527,8 @@ namespace arithm
            return vcmplt4(a, b);
        }

-       __device__ __forceinline__ VCmpLt4() {}
-       __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+       __host__ __device__ __forceinline__ VCmpLt4() {}
+       __host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
    };
    struct VCmpLe4 : binary_function<uint, uint, uint>
    {
@@ -1537,8 +1537,8 @@ namespace arithm
            return vcmple4(a, b);
        }

-       __device__ __forceinline__ VCmpLe4() {}
-       __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+       __host__ __device__ __forceinline__ VCmpLe4() {}
+       __host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
    };

    ////////////////////////////////////
@@ -2008,8 +2008,8 @@ namespace arithm
            return vmin4(a, b);
        }

-       __device__ __forceinline__ VMin4() {}
-       __device__ __forceinline__ VMin4(const VMin4& other) {}
+       __host__ __device__ __forceinline__ VMin4() {}
+       __host__ __device__ __forceinline__ VMin4(const VMin4&) {}
    };

    ////////////////////////////////////
@@ -2021,8 +2021,8 @@ namespace arithm
            return vmin2(a, b);
        }

-       __device__ __forceinline__ VMin2() {}
-       __device__ __forceinline__ VMin2(const VMin2& other) {}
+       __host__ __device__ __forceinline__ VMin2() {}
+       __host__ __device__ __forceinline__ VMin2(const VMin2&) {}
    };
}

@@ -2100,8 +2100,8 @@ namespace arithm
            return vmax4(a, b);
        }

-       __device__ __forceinline__ VMax4() {}
-       __device__ __forceinline__ VMax4(const VMax4& other) {}
+       __host__ __device__ __forceinline__ VMax4() {}
+       __host__ __device__ __forceinline__ VMax4(const VMax4&) {}
    };

    ////////////////////////////////////
@@ -2113,8 +2113,8 @@ namespace arithm
            return vmax2(a, b);
        }

-       __device__ __forceinline__ VMax2() {}
-       __device__ __forceinline__ VMax2(const VMax2& other) {}
+       __host__ __device__ __forceinline__ VMax2() {}
+       __host__ __device__ __forceinline__ VMax2(const VMax2&) {}
    };
}
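For reference: the VAdd4/VSub4/VAbsDiff4-style functors above wrap SIMD-in-a-word primitives that treat a 32-bit word as four packed bytes. A host-side emulation of the assumed semantics of vadd4 (per-byte addition with carries confined to each lane, mirroring CUDA's __vadd4; vadd4_emulated is an illustration, not the OpenCV implementation):

    #include <cstdint>
    #include <cstdio>

    // Add two packed-byte words lane by lane; each sum wraps within its byte
    // so no carry leaks into the neighbouring lane.
    static uint32_t vadd4_emulated(uint32_t a, uint32_t b)
    {
        uint32_t result = 0;
        for (int lane = 0; lane < 4; ++lane)
        {
            uint32_t shift = (uint32_t)lane * 8;
            uint32_t sum = ((a >> shift) & 0xFFu) + ((b >> shift) & 0xFFu);
            result |= (sum & 0xFFu) << shift;
        }
        return result;
    }

    int main()
    {
        // Low lane: 0xFF + 0x02 wraps to 0x01; the other lanes add normally.
        std::printf("0x%08X\n", vadd4_emulated(0x010203FFu, 0x01010102u)); // prints 0x02030401
        return 0;
    }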
@@ -188,10 +188,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);

-    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
-                                              NppiSize oSizeROI, Npp64f* pRetVal);
+#if CUDA_VERSION < 5050
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);

-    static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+#else
+    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
+                                NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
+
+    typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
+
+    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
+
+    static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
+#endif

     NppiSize sz;
     sz.width = src1.cols;
@@ -203,7 +213,16 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)

     DeviceBuffer dbuf;

-    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#if CUDA_VERSION < 5050
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
+#else
+    int bufSize;
+    buf_size_funcs[funcIdx](sz, &bufSize);
+
+    GpuMat buf(1, bufSize, CV_8UC1);
+
+    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
+#endif

     cudaSafeCall( cudaDeviceSynchronize() );
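Note: the #else branch above follows the NPP protocol introduced with CUDA 5.5: query the required scratch-buffer size on the host, allocate device scratch, then pass it to the reduction, which writes its Npp64f result to device memory. A sketch of that call sequence, assuming src1 and src2 are valid same-size 8UC1 GpuMats (error checking omitted for brevity):

    NppiSize sz;
    sz.width  = src1.cols;
    sz.height = src1.rows;

    int bufSize = 0;
    nppiNormDiffL2GetBufferHostSize_8u_C1R(sz, &bufSize);   // host-side size query

    cv::gpu::GpuMat scratch(1, bufSize, CV_8UC1);           // device scratch space

    Npp64f* d_result = 0;
    cudaMalloc(&d_result, sizeof(Npp64f));                  // NPP writes the norm to device memory

    nppiNormDiff_L2_8u_C1R(src1.ptr<Npp8u>(), (int)src1.step,
                           src2.ptr<Npp8u>(), (int)src2.step,
                           sz, d_result, scratch.data);

    Npp64f h_result = 0;
    cudaMemcpy(&h_result, d_result, sizeof(Npp64f), cudaMemcpyDeviceToHost);
    cudaFree(d_result);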