diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp index f1bf5c67a..401f521a6 100644 --- a/3rdparty/carotene/hal/tegra_hal.hpp +++ b/3rdparty/carotene/hal/tegra_hal.hpp @@ -1433,7 +1433,7 @@ inline int TEGRA_MORPHFREE(cvhalFilter2D *context) #define TEGRA_RESIZE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation) \ ( \ - /*interpolation == CV_HAL_INTER_LINEAR ? \ + interpolation == CV_HAL_INTER_LINEAR ? \ CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeLinearOpenCVSupported(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), ((src_type >> CV_CN_SHIFT) + 1)) && \ inv_scale_x > 0 && inv_scale_y > 0 && \ (dst_width - 0.5)/inv_scale_x - 0.5 < src_width && (dst_height - 0.5)/inv_scale_y - 0.5 < src_height && \ @@ -1441,7 +1441,7 @@ inline int TEGRA_MORPHFREE(cvhalFilter2D *context) std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ CAROTENE_NS::resizeLinearOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \ - CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED :*/ \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \ interpolation == CV_HAL_INTER_AREA ? \ CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeAreaSupported(1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)) && \ std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ diff --git a/3rdparty/carotene/src/resize.cpp b/3rdparty/carotene/src/resize.cpp index 122a5f220..3a80d472d 100644 --- a/3rdparty/carotene/src/resize.cpp +++ b/3rdparty/carotene/src/resize.cpp @@ -1681,15 +1681,15 @@ void downsample_bilinear_8uc1(const Size2D &ssize, const Size2D &dsize, vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); #else /* ugly version matching to OpenCV's SSE optimization */ - int16x4_t v1Ls = vshrn_n_s32(v1L, 5); - int16x4_t v1Hs = vshrn_n_s32(v1H, 5); - int16x4_t v2Ls = vshrn_n_s32(v2L, 5); - int16x4_t v2Hs = vshrn_n_s32(v2H, 5); + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); - int16x8_t vsum = vaddq_s16(v1s, v2s); + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1)); uint8x8_t vres = vqrshrun_n_s16(vsum, 2); vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); @@ -1736,15 +1736,15 @@ void downsample_bilinear_8uc1(const Size2D &ssize, const Size2D &dsize, vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres); #else /* ugly version matching to OpenCV's SSE optimization */ - int16x4_t v1Ls = vshrn_n_s32(v1L, 5); - int16x4_t v1Hs = vshrn_n_s32(v1H, 5); - int16x4_t v2Ls = vshrn_n_s32(v2L, 5); - int16x4_t v2Hs = vshrn_n_s32(v2H, 5); + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); - int16x8_t vsum = vaddq_s16(v1s, v2s); + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1)); uint8x8_t vres = vqrshrun_n_s16(vsum, 2); vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres); @@ -1836,15 +1836,15 @@ downsample_bilinear_8uc1_col_loop8: vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); #else /* ugly version matching to OpenCV's SSE optimization */ - int16x4_t v1Ls = vshrn_n_s32(v1L, 5); - int16x4_t v1Hs = vshrn_n_s32(v1H, 5); - int16x4_t v2Ls = vshrn_n_s32(v2L, 5); - int16x4_t v2Hs = vshrn_n_s32(v2H, 5); + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); - int16x8_t vsum = vaddq_s16(v1s, v2s); + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1)); uint8x8_t vres = vqrshrun_n_s16(vsum, 2); vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres);