Merge pull request #2620 from ilya-lavrenov:ipp_integration

2014-04-23 15:12:44 +04:00
parent 9badfa1f83 51e2a8ec96
commit d93161812b
7 changed files with 501 additions and 61 deletions
--- a/modules/core/perf/opencl/perf_matop.cpp
+++ b/modules/core/perf/opencl/perf_matop.cpp
@@ -35,6 +35,28 @@ OCL_PERF_TEST_P(SetToFixture, SetTo,
    SANITY_CHECK(src);
 }

+///////////// SetTo with mask ////////////////////////
+
+typedef Size_MatType SetToFixture;
+
+OCL_PERF_TEST_P(SetToFixture, SetToWithMask,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    // Times UMat::setTo() called with a fill value and a single-channel 8-bit mask.
+    const Size_MatType_t testArgs = GetParam();
+    const Size imageSize = get<0>(testArgs);
+    const int matType = get<1>(testArgs);
+    checkDeviceMaxMemoryAllocSize(imageSize, matType);
+
+    UMat src(imageSize, matType);
+    UMat mask(imageSize, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(src);
+
+    const Scalar fillValue = Scalar::all(17);
+    OCL_TEST_CYCLE() src.setTo(fillValue, mask);
+    SANITY_CHECK(src);
+}
+
 ///////////// ConvertTo ////////////////////////

 typedef Size_MatType ConvertToFixture;
@@ -79,6 +101,27 @@ OCL_PERF_TEST_P(CopyToFixture, CopyTo,
    SANITY_CHECK(dst);
 }

+///////////// CopyTo with mask ////////////////////////
+
+typedef Size_MatType CopyToFixture;
+
+OCL_PERF_TEST_P(CopyToFixture, CopyToWithMask,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    // Times UMat::copyTo() into a same-sized destination through a single-channel 8-bit mask.
+    const Size_MatType_t testArgs = GetParam();
+    const Size imageSize = get<0>(testArgs);
+    const int matType = get<1>(testArgs);
+    checkDeviceMaxMemoryAllocSize(imageSize, matType);
+
+    UMat src(imageSize, matType);
+    UMat dst(imageSize, matType);
+    UMat mask(imageSize, CV_8UC1);
+    declare.in(src, mask, WARMUP_RNG).out(dst);
+    OCL_TEST_CYCLE() src.copyTo(dst, mask);
+    SANITY_CHECK(dst);
+}
+
 } } // namespace cvtest::ocl

 #endif // HAVE_OPENCL
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -705,6 +705,24 @@ static void max64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
 {
+#if ARITHM_USE_IPP == 1 && !defined HAVE_IPP_ICV_ONLY // IPP fast path: try ippsMaxEvery_64f on each row before the vBinOp64 call below
+    double* s1 = (double*)src1; // mutable row cursors; the casts drop const from the input pointers
+    double* s2 = (double*)src2;
+    double* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step); // NOTE(review): helper not visible here; presumably adjusts the strides for degenerate sizes - confirm
+    int i = 0;
+    for(; i < sz.height; i++) // one IPP call per row of sz.width elements
+    {
+        if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width)) // negative status: stop early, leaving i < sz.height
+            break;
+        s1 = (double*)((uchar*)s1 + step1); // steps are applied as byte offsets
+        s2 = (double*)((uchar*)s2 + step2);
+        d  = (double*)((uchar*)d + step);
+    }
+    if (i == sz.height) // every row went through IPP without a negative status
+        return;
+    setIppErrorStatus(); // reached only after an IPP failure; the call below then runs on the original src1/src2/dst
+#endif
     vBinOp64<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz);
 }

@@ -808,6 +826,24 @@ static void min64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
 {
+#if ARITHM_USE_IPP == 1 && !defined HAVE_IPP_ICV_ONLY // IPP fast path: try ippsMinEvery_64f on each row before the vBinOp64 call below
+    double* s1 = (double*)src1; // mutable row cursors; the casts drop const from the input pointers
+    double* s2 = (double*)src2;
+    double* d  = dst;
+    fixSteps(sz, sizeof(dst[0]), step1, step2, step); // NOTE(review): helper not visible here; presumably adjusts the strides for degenerate sizes - confirm
+    int i = 0;
+    for(; i < sz.height; i++) // one IPP call per row of sz.width elements
+    {
+        if (0 > ippsMinEvery_64f(s1, s2, d, sz.width)) // negative status: stop early, leaving i < sz.height
+            break;
+        s1 = (double*)((uchar*)s1 + step1); // steps are applied as byte offsets
+        s2 = (double*)((uchar*)s2 + step2);
+        d  = (double*)((uchar*)d + step);
+    }
+    if (i == sz.height) // every row went through IPP without a negative status
+        return;
+    setIppErrorStatus(); // reached only after an IPP failure; the call below then runs on the original src1/src2/dst
+#endif
     vBinOp64<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz);
 }

@@ -1977,7 +2013,16 @@ recip_( const T*, size_t, const T* src2, size_t step2,
 static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* scale)
 {
-    mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
+    float fscale = (float)*(const double*)scale; // scale is read as a double and narrowed to float, as the removed line did inline
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (std::fabs(fscale - 1) <= FLT_EPSILON) // IPP is attempted only when the scale equals 1 within FLT_EPSILON
+    {
+        if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) // NOTE(review): trailing 0 is presumably the Sfs scale factor (no scaling) - confirm against the IPP reference
+            return;
+        setIppErrorStatus(); // negative IPP status: fall through to mul_
+    }
+#endif
+    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
 }

 static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
@@ -1989,13 +2034,31 @@ static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t st
 static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* scale)
 {
-    mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
+    float fscale = (float)*(const double*)scale; // scale is read as a double and narrowed to float, as the removed line did inline
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (std::fabs(fscale - 1) <= FLT_EPSILON) // IPP is attempted only when the scale equals 1 within FLT_EPSILON
+    {
+        if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) // NOTE(review): trailing 0 is presumably the Sfs scale factor (no scaling) - confirm against the IPP reference
+            return;
+        setIppErrorStatus(); // negative IPP status: fall through to mul_
+    }
+#endif
+    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
 }

 static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* scale)
 {
-    mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
+    float fscale = (float)*(const double*)scale; // scale is read as a double and narrowed to float, as the removed line did inline
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (std::fabs(fscale - 1) <= FLT_EPSILON) // IPP is attempted only when the scale equals 1 within FLT_EPSILON
+    {
+        if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) // NOTE(review): trailing 0 is presumably the Sfs scale factor (no scaling) - confirm against the IPP reference
+            return;
+        setIppErrorStatus(); // negative IPP status: fall through to mul_
+    }
+#endif
+    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
 }

 static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
@@ -2007,7 +2070,16 @@ static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2
 static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* scale)
 {
-    mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
+    float fscale = (float)*(const double*)scale; // scale is read as a double and narrowed to float, as the removed line did inline
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (std::fabs(fscale - 1) <= FLT_EPSILON) // IPP is attempted only when the scale equals 1 within FLT_EPSILON
+    {
+        if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0) // float variant: no trailing scale-factor argument is passed
+            return;
+        setIppErrorStatus(); // negative IPP status: fall through to mul_
+    }
+#endif
+    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
 }

 static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -81,6 +81,12 @@ copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, ucha
 template<> void
 copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
 {
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (ippiCopy_8u_C1MR(_src, (int)sstep, _dst, (int)dstep, ippiSize(size), mask, (int)mstep) >= 0)
+        return;
+    setIppErrorStatus();
+#endif
+
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const uchar* src = (const uchar*)_src;
@@ -111,6 +117,12 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
 template<> void
 copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
 {
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    if (ippiCopy_16u_C1MR((const Ipp16u *)_src, (int)sstep, (Ipp16u *)_dst, (int)dstep, ippiSize(size), mask, (int)mstep) >= 0)
+        return;
+    setIppErrorStatus();
+#endif
+
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const ushort* src = (const ushort*)_src;
@@ -165,15 +177,34 @@ static void copyMask##suffix(const uchar* src, size_t sstep, const uchar* mask,
    copyMask_<type>(src, sstep, mask, mstep, dst, dstep, size); \
 }

+// DEF_COPY_MASK_F: like DEF_COPY_MASK, but with full IPP the generated copyMask##suffix first tries ippiCopy_##ippfavor.
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+#define CV_COPY_MASK_TRY_IPP(ippfavor, ipptype) /* negative status: record it and fall through */ \
+    if (ippiCopy_##ippfavor((const ipptype *)src, (int)sstep, (ipptype *)dst, (int)dstep, ippiSize(size), (const Ipp8u *)mask, (int)mstep) >= 0) \
+        return; \
+    setIppErrorStatus();
+#else
+#define CV_COPY_MASK_TRY_IPP(ippfavor, ipptype) /* no full IPP: nothing to try */
+#endif
+
+// Single function skeleton shared by both build configurations; only the IPP attempt is conditional.
+#define DEF_COPY_MASK_F(suffix, type, ippfavor, ipptype) \
+static void copyMask##suffix(const uchar* src, size_t sstep, const uchar* mask, size_t mstep, \
+                             uchar* dst, size_t dstep, Size size, void*) \
+{ \
+    CV_COPY_MASK_TRY_IPP(ippfavor, ipptype) \
+    copyMask_<type>(src, sstep, mask, mstep, dst, dstep, size); \
+}
+

 DEF_COPY_MASK(8u, uchar)
 DEF_COPY_MASK(16u, ushort)
-DEF_COPY_MASK(8uC3, Vec3b)
-DEF_COPY_MASK(32s, int)
-DEF_COPY_MASK(16uC3, Vec3s)
+DEF_COPY_MASK_F(8uC3, Vec3b, 8u_C3MR, Ipp8u) // with full IPP, tries ippiCopy_8u_C3MR before copyMask_<Vec3b>
+DEF_COPY_MASK_F(32s, int, 32s_C1MR, Ipp32s) // with full IPP, tries ippiCopy_32s_C1MR before copyMask_<int>
+DEF_COPY_MASK_F(16uC3, Vec3s, 16u_C3MR, Ipp16u) // with full IPP, tries ippiCopy_16u_C3MR before copyMask_<Vec3s>
 DEF_COPY_MASK(32sC2, Vec2i)
-DEF_COPY_MASK(32sC3, Vec3i)
-DEF_COPY_MASK(32sC4, Vec4i)
+DEF_COPY_MASK_F(32sC3, Vec3i, 32s_C3MR, Ipp32s) // with full IPP, tries ippiCopy_32s_C3MR before copyMask_<Vec3i>
+DEF_COPY_MASK_F(32sC4, Vec4i, 32s_C4MR, Ipp32s) // with full IPP, tries ippiCopy_32s_C4MR before copyMask_<Vec4i>
 DEF_COPY_MASK(32sC6, Vec6i)
 DEF_COPY_MASK(32sC8, Vec8i)

@@ -250,6 +281,12 @@ void Mat::copyTo( OutputArray _dst ) const
            Size sz = getContinuousSize(*this, dst);
            size_t len = sz.width*elemSize();

+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+            if (ippiCopy_8u_C1R(sptr, (int)step, dptr, (int)dst.step, ippiSize((int)len, sz.height)) >= 0)
+                return;
+            setIppErrorStatus();
+#endif
+
            for( ; sz.height--; sptr += step, dptr += dst.step )
                memcpy( dptr, sptr, len );
        }
@@ -323,6 +360,27 @@ Mat& Mat::operator = (const Scalar& s)

    if( is[0] == 0 && is[1] == 0 && is[2] == 0 && is[3] == 0 )
    {
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY && 0
+        if (dims <= 2 || isContinuous())
+        {
+            IppiSize roisize = { cols, rows };
+            if (isContinuous())
+            {
+                roisize.width = (int)total();
+                roisize.height = 1;
+
+                if (ippsZero_8u(data, static_cast<int>(roisize.width * elemSize())) >= 0)
+                    return *this;
+                setIppErrorStatus();
+            }
+            roisize.width *= (int)elemSize();
+
+            if (ippiSet_8u_C1R(0, data, (int)step, roisize) >= 0)
+                return *this;
+            setIppErrorStatus();
+        }
+#endif
+
        for( size_t i = 0; i < it.nplanes; i++, ++it )
            memset( dptr, 0, elsize );
    }
@@ -359,7 +417,82 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
    Mat value = _value.getMat(), mask = _mask.getMat();

    CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::MAT ));
-    CV_Assert( mask.empty() || mask.type() == CV_8U );
+    CV_Assert( mask.empty() || (mask.type() == CV_8U && size == mask.size) );
+
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
+    int cn = channels(), depth0 = depth();
+
+    if (!mask.empty() && (dims <= 2 || (isContinuous() && mask.isContinuous())) &&
+            (depth0 == CV_8U || depth0 == CV_16U || depth0 == CV_16S || depth0 == CV_32S || depth0 == CV_32F) &&
+            (cn == 1 || cn == 3 || cn == 4))
+    {
+        uchar _buf[32];
+        void * buf = _buf;
+        convertAndUnrollScalar( value, type(), _buf, 1 );
+
+        IppStatus status = (IppStatus)-1;
+        IppiSize roisize = { cols, rows };
+        int mstep = (int)mask.step, dstep = (int)step;
+
+        if (isContinuous() && mask.isContinuous())
+        {
+            roisize.width = (int)total();
+            roisize.height = 1;
+        }
+
+        if (cn == 1)
+        {
+            if (depth0 == CV_8U)
+                status = ippiSet_8u_C1MR(*(Ipp8u *)buf, (Ipp8u *)data, dstep, roisize, mask.data, mstep);
+            else if (depth0 == CV_16U)
+                status = ippiSet_16u_C1MR(*(Ipp16u *)buf, (Ipp16u *)data, dstep, roisize, mask.data, mstep);
+            else if (depth0 == CV_16S)
+                status = ippiSet_16s_C1MR(*(Ipp16s *)buf, (Ipp16s *)data, dstep, roisize, mask.data, mstep);
+            else if (depth0 == CV_32S)
+                status = ippiSet_32s_C1MR(*(Ipp32s *)buf, (Ipp32s *)data, dstep, roisize, mask.data, mstep);
+            else if (depth0 == CV_32F)
+                status = ippiSet_32f_C1MR(*(Ipp32f *)buf, (Ipp32f *)data, dstep, roisize, mask.data, mstep);
+        }
+        else if (cn == 3 || cn == 4)
+        {
+#define IPP_SET(ippfavor, ippcn) \
+    do \
+    { \
+        typedef Ipp##ippfavor ipptype; \
+        ipptype ippvalue[4] = { ((ipptype *)buf)[0], ((ipptype *)buf)[1], ((ipptype *)buf)[2], ((ipptype *)buf)[3] }; \
+        status = ippiSet_##ippfavor##_C##ippcn##MR(ippvalue, (ipptype *)data, dstep, roisize, mask.data, mstep); \
+    } while ((void)0, 0)
+
+#define IPP_SET_CN(ippcn) \
+    do \
+    { \
+        if (cn == ippcn) \
+        { \
+            if (depth0 == CV_8U) \
+                IPP_SET(8u, ippcn); \
+            else if (depth0 == CV_16U) \
+                IPP_SET(16u, ippcn); \
+            else if (depth0 == CV_16S) \
+                IPP_SET(16s, ippcn); \
+            else if (depth0 == CV_32S) \
+                IPP_SET(32s, ippcn); \
+            else if (depth0 == CV_32F) \
+                IPP_SET(32f, ippcn); \
+        } \
+    } while ((void)0, 0)
+
+            IPP_SET_CN(3);
+            IPP_SET_CN(4);
+
+#undef IPP_SET_CN
+#undef IPP_SET
+        }
+
+        if (status >= 0)
+            return *this;
+        setIppErrorStatus();
+    }
+#endif

    size_t esz = elemSize();
    BinaryFunc copymask = getCopyMaskFunc(esz);
@@ -548,31 +681,65 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
    Mat dst = _dst.getMat();
    size_t esz = CV_ELEM_SIZE(type);

-#if defined(HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY)
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
    typedef IppStatus (CV_STDCALL * ippiMirror)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize roiSize, IppiAxis flip);
-    ippiMirror ippFunc =
-        type == CV_8UC1 ? (ippiMirror)ippiMirror_8u_C1R :
-        type == CV_8UC3 ? (ippiMirror)ippiMirror_8u_C3R :
-        type == CV_8UC4 ? (ippiMirror)ippiMirror_8u_C4R :
-        type == CV_16UC1 ? (ippiMirror)ippiMirror_16u_C1R :
-        type == CV_16UC3 ? (ippiMirror)ippiMirror_16u_C3R :
-        type == CV_16UC4 ? (ippiMirror)ippiMirror_16u_C4R :
-        type == CV_16SC1 ? (ippiMirror)ippiMirror_16s_C1R :
-        type == CV_16SC3 ? (ippiMirror)ippiMirror_16s_C3R :
-        type == CV_16SC4 ? (ippiMirror)ippiMirror_16s_C4R :
-        type == CV_32SC1 ? (ippiMirror)ippiMirror_32s_C1R :
-        type == CV_32SC3 ? (ippiMirror)ippiMirror_32s_C3R :
-        type == CV_32SC4 ? (ippiMirror)ippiMirror_32s_C4R :
-        type == CV_32FC1 ? (ippiMirror)ippiMirror_32f_C1R :
-        type == CV_32FC3 ? (ippiMirror)ippiMirror_32f_C3R :
-        type == CV_32FC4 ? (ippiMirror)ippiMirror_32f_C4R : 0;
+    typedef IppStatus (CV_STDCALL * ippiMirrorI)(const void * pSrcDst, int srcDstStep, IppiSize roiSize, IppiAxis flip);
+    ippiMirror ippFunc = 0;
+    ippiMirrorI ippFuncI = 0;
+
+    if (src.data == dst.data)
+    {
+        CV_SUPPRESS_DEPRECATED_START
+        ippFuncI =
+            type == CV_8UC1 ? (ippiMirrorI)ippiMirror_8u_C1IR :
+            type == CV_8UC3 ? (ippiMirrorI)ippiMirror_8u_C3IR :
+            type == CV_8UC4 ? (ippiMirrorI)ippiMirror_8u_C4IR :
+            type == CV_16UC1 ? (ippiMirrorI)ippiMirror_16u_C1IR :
+            type == CV_16UC3 ? (ippiMirrorI)ippiMirror_16u_C3IR :
+            type == CV_16UC4 ? (ippiMirrorI)ippiMirror_16u_C4IR :
+            type == CV_16SC1 ? (ippiMirrorI)ippiMirror_16s_C1IR :
+            type == CV_16SC3 ? (ippiMirrorI)ippiMirror_16s_C3IR :
+            type == CV_16SC4 ? (ippiMirrorI)ippiMirror_16s_C4IR :
+            type == CV_32SC1 ? (ippiMirrorI)ippiMirror_32s_C1IR :
+            type == CV_32SC3 ? (ippiMirrorI)ippiMirror_32s_C3IR :
+            type == CV_32SC4 ? (ippiMirrorI)ippiMirror_32s_C4IR :
+            type == CV_32FC1 ? (ippiMirrorI)ippiMirror_32f_C1IR :
+            type == CV_32FC3 ? (ippiMirrorI)ippiMirror_32f_C3IR :
+            type == CV_32FC4 ? (ippiMirrorI)ippiMirror_32f_C4IR : 0;
+        CV_SUPPRESS_DEPRECATED_END
+    }
+    else
+    {
+        ippFunc =
+            type == CV_8UC1 ? (ippiMirror)ippiMirror_8u_C1R :
+            type == CV_8UC3 ? (ippiMirror)ippiMirror_8u_C3R :
+            type == CV_8UC4 ? (ippiMirror)ippiMirror_8u_C4R :
+            type == CV_16UC1 ? (ippiMirror)ippiMirror_16u_C1R :
+            type == CV_16UC3 ? (ippiMirror)ippiMirror_16u_C3R :
+            type == CV_16UC4 ? (ippiMirror)ippiMirror_16u_C4R :
+            type == CV_16SC1 ? (ippiMirror)ippiMirror_16s_C1R :
+            type == CV_16SC3 ? (ippiMirror)ippiMirror_16s_C3R :
+            type == CV_16SC4 ? (ippiMirror)ippiMirror_16s_C4R :
+            type == CV_32SC1 ? (ippiMirror)ippiMirror_32s_C1R :
+            type == CV_32SC3 ? (ippiMirror)ippiMirror_32s_C3R :
+            type == CV_32SC4 ? (ippiMirror)ippiMirror_32s_C4R :
+            type == CV_32FC1 ? (ippiMirror)ippiMirror_32f_C1R :
+            type == CV_32FC3 ? (ippiMirror)ippiMirror_32f_C3R :
+            type == CV_32FC4 ? (ippiMirror)ippiMirror_32f_C4R : 0;
+    }
    IppiAxis axis = flip_mode == 0 ? ippAxsHorizontal :
        flip_mode > 0 ? ippAxsVertical : ippAxsBoth;
+    IppiSize roisize = { dst.cols, dst.rows };

    if (ippFunc != 0)
    {
-        IppStatus status = ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, ippiSize(src.cols, src.rows), axis);
-        if (status >= 0)
+        if (ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, ippiSize(src.cols, src.rows), axis) >= 0)
+            return;
+        setIppErrorStatus();
+    }
+    else if (ippFuncI != 0)
+    {
+        if (ippFuncI(dst.data, (int)dst.step, roisize, axis) >= 0)
            return;
        setIppErrorStatus();
    }
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -3019,24 +3019,52 @@ void cv::transpose( InputArray _src, OutputArray _dst )
        return;
    }

-#if defined(HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY)
+#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY
    typedef IppStatus (CV_STDCALL * ippiTranspose)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize roiSize);
-    ippiTranspose ippFunc =
-    type == CV_8UC1 ? (ippiTranspose)ippiTranspose_8u_C1R :
-    type == CV_8UC3 ? (ippiTranspose)ippiTranspose_8u_C3R :
-    type == CV_8UC4 ? (ippiTranspose)ippiTranspose_8u_C4R :
-    type == CV_16UC1 ? (ippiTranspose)ippiTranspose_16u_C1R :
-    type == CV_16UC3 ? (ippiTranspose)ippiTranspose_16u_C3R :
-    type == CV_16UC4 ? (ippiTranspose)ippiTranspose_16u_C4R :
-    type == CV_16SC1 ? (ippiTranspose)ippiTranspose_16s_C1R :
-    type == CV_16SC3 ? (ippiTranspose)ippiTranspose_16s_C3R :
-    type == CV_16SC4 ? (ippiTranspose)ippiTranspose_16s_C4R :
-    type == CV_32SC1 ? (ippiTranspose)ippiTranspose_32s_C1R :
-    type == CV_32SC3 ? (ippiTranspose)ippiTranspose_32s_C3R :
-    type == CV_32SC4 ? (ippiTranspose)ippiTranspose_32s_C4R :
-    type == CV_32FC1 ? (ippiTranspose)ippiTranspose_32f_C1R :
-    type == CV_32FC3 ? (ippiTranspose)ippiTranspose_32f_C3R :
-    type == CV_32FC4 ? (ippiTranspose)ippiTranspose_32f_C4R : 0;
+    typedef IppStatus (CV_STDCALL * ippiTransposeI)(const void * pSrcDst, int srcDstStep, IppiSize roiSize);
+    ippiTranspose ippFunc = 0;
+    ippiTransposeI ippFuncI = 0;
+
+    if (dst.data == src.data && dst.cols == dst.rows)
+    {
+        CV_SUPPRESS_DEPRECATED_START
+        ippFuncI =
+            type == CV_8UC1 ? (ippiTransposeI)ippiTranspose_8u_C1IR :
+            type == CV_8UC3 ? (ippiTransposeI)ippiTranspose_8u_C3IR :
+            type == CV_8UC4 ? (ippiTransposeI)ippiTranspose_8u_C4IR :
+            type == CV_16UC1 ? (ippiTransposeI)ippiTranspose_16u_C1IR :
+            type == CV_16UC3 ? (ippiTransposeI)ippiTranspose_16u_C3IR :
+            type == CV_16UC4 ? (ippiTransposeI)ippiTranspose_16u_C4IR :
+            type == CV_16SC1 ? (ippiTransposeI)ippiTranspose_16s_C1IR :
+            type == CV_16SC3 ? (ippiTransposeI)ippiTranspose_16s_C3IR :
+            type == CV_16SC4 ? (ippiTransposeI)ippiTranspose_16s_C4IR :
+            type == CV_32SC1 ? (ippiTransposeI)ippiTranspose_32s_C1IR :
+            type == CV_32SC3 ? (ippiTransposeI)ippiTranspose_32s_C3IR :
+            type == CV_32SC4 ? (ippiTransposeI)ippiTranspose_32s_C4IR :
+            type == CV_32FC1 ? (ippiTransposeI)ippiTranspose_32f_C1IR :
+            type == CV_32FC3 ? (ippiTransposeI)ippiTranspose_32f_C3IR :
+            type == CV_32FC4 ? (ippiTransposeI)ippiTranspose_32f_C4IR : 0;
+        CV_SUPPRESS_DEPRECATED_END
+    }
+    else
+    {
+        ippFunc =
+            type == CV_8UC1 ? (ippiTranspose)ippiTranspose_8u_C1R :
+            type == CV_8UC3 ? (ippiTranspose)ippiTranspose_8u_C3R :
+            type == CV_8UC4 ? (ippiTranspose)ippiTranspose_8u_C4R :
+            type == CV_16UC1 ? (ippiTranspose)ippiTranspose_16u_C1R :
+            type == CV_16UC3 ? (ippiTranspose)ippiTranspose_16u_C3R :
+            type == CV_16UC4 ? (ippiTranspose)ippiTranspose_16u_C4R :
+            type == CV_16SC1 ? (ippiTranspose)ippiTranspose_16s_C1R :
+            type == CV_16SC3 ? (ippiTranspose)ippiTranspose_16s_C3R :
+            type == CV_16SC4 ? (ippiTranspose)ippiTranspose_16s_C4R :
+            type == CV_32SC1 ? (ippiTranspose)ippiTranspose_32s_C1R :
+            type == CV_32SC3 ? (ippiTranspose)ippiTranspose_32s_C3R :
+            type == CV_32SC4 ? (ippiTranspose)ippiTranspose_32s_C4R :
+            type == CV_32FC1 ? (ippiTranspose)ippiTranspose_32f_C1R :
+            type == CV_32FC3 ? (ippiTranspose)ippiTranspose_32f_C3R :
+            type == CV_32FC4 ? (ippiTranspose)ippiTranspose_32f_C4R : 0;
+    }

    IppiSize roiSize = { src.cols, src.rows };
    if (ippFunc != 0)
@@ -3045,6 +3073,12 @@ void cv::transpose( InputArray _src, OutputArray _dst )
            return;
        setIppErrorStatus();
    }
+    else if (ippFuncI != 0)
+    {
+        if (ippFuncI(dst.data, (int)dst.step, roiSize) >= 0)
+            return;
+        setIppErrorStatus();
+    }
 #endif

    if( dst.data == src.data )