Fixed bug with submatrix in device::transform

2011-12-21 05:59:14 +00:00
parent dab3586792
commit d13a6b74b2
6 changed files with 64 additions and 23 deletions
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -159,7 +159,13 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu

    cudaStream_t stream = StreamAccessor::getStream(s);

-    if (mask.empty() && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F))
+    bool useNpp = 
+        mask.empty() && 
+        dst.type() == src1.type() && 
+        (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) &&
+        (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16));
+
+    if (useNpp)
    {
        nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, stream);
        return;
@@ -271,7 +277,13 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons

    cudaStream_t stream = StreamAccessor::getStream(s);

-    if (mask.empty() && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F))
+    bool useNpp = 
+        mask.empty() && 
+        dst.type() == src1.type() && 
+        (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) &&
+        (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16));
+
+    if (useNpp)
    {
        nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, stream);
        return;
@@ -403,8 +415,13 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub

        dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));

+        bool useNpp = 
+            scale == 1 && 
+            dst.type() == src1.type() && 
+            (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) &&
+            (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16));

-        if (scale == 1 && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F))
+        if (useNpp)
        {
            nppArithmCaller(src2, src1, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, stream);
            return;
@@ -528,8 +545,13 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double

        dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));

+        bool useNpp = 
+            scale == 1 && 
+            dst.type() == src1.type() && 
+            (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F) &&
+            (isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16));

-        if (scale == 1 && dst.type() == src1.type() && (src1.depth() == CV_8U || src1.depth() == CV_32S || src1.depth() == CV_32F))
+        if (useNpp)
        {
            nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, stream);
            return;
@@ -643,7 +665,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea

    static const func_t funcs[] = 
    {
-        0/*absdiff_gpu<unsigned char>*/, absdiff_gpu<signed char>, absdiff_gpu<unsigned short>, absdiff_gpu<short>, 0/*absdiff_gpu<int>*/, 0/*absdiff_gpu<float>*/, absdiff_gpu<double>
+       absdiff_gpu<unsigned char>, absdiff_gpu<signed char>, absdiff_gpu<unsigned short>, absdiff_gpu<short>, absdiff_gpu<int>, absdiff_gpu<float>, absdiff_gpu<double>
    };

    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
@@ -656,7 +678,9 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
    sz.width  = src1.cols * src1.channels();
    sz.height = src1.rows;

-    if (src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0)
+    bool aligned = isAligned(src1.data, 16) && isAligned(src2.data, 16) && isAligned(dst.data, 16);
+
+    if (aligned && src1.depth() == CV_8U && (src1.cols * src1.channels()) % 4 == 0)
    {
        NppStreamHandler h(stream);

@@ -668,7 +692,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
-    else if (src1.depth() == CV_8U)
+    else if (aligned && src1.depth() == CV_8U)
    {
        NppStreamHandler h(stream);

@@ -678,7 +702,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
-    else if (src1.depth() == CV_32S)
+    else if (aligned && src1.depth() == CV_32S)
    {
        NppStreamHandler h(stream);

@@ -688,7 +712,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
-    else if (src1.depth() == CV_32F)
+    else if (aligned && src1.depth() == CV_32F)
    {
        NppStreamHandler h(stream);