diff --git a/modules/gpu/src/cuda/pyr_up.cu b/modules/gpu/src/cuda/pyr_up.cu index dd91bd454..d22891bf0 100644 --- a/modules/gpu/src/cuda/pyr_up.cu +++ b/modules/gpu/src/cuda/pyr_up.cu @@ -50,73 +50,91 @@ namespace cv { namespace gpu { namespace device { namespace imgproc { - template __global__ void pyrUp(const PtrStep src, DevMem2D_ dst, const B b) + template __global__ void pyrUp(const SrcPtr src, DevMem2D_ dst) { - typedef typename TypeVec::cn>::vec_type value_type; + typedef typename SrcPtr::elem_type src_t; + typedef typename TypeVec::cn>::vec_type sum_t; const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; - __shared__ T smem1[10][10]; - __shared__ value_type smem2[20][16]; - - value_type sum; + __shared__ sum_t s_srcPatch[10][10]; + __shared__ sum_t s_dstPatch[20][16]; if (threadIdx.x < 10 && threadIdx.y < 10) - smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step); + { + const int srcx = static_cast((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1; + const int srcy = static_cast((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1; + + s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast(src(srcy, srcx)); + } __syncthreads(); + sum_t sum = VecTraits::all(0); + + const int evenFlag = static_cast((threadIdx.x & 1) == 0); + const int oddFlag = static_cast((threadIdx.x & 1) != 0); + const bool eveny = ((threadIdx.y & 1) == 0); const int tidx = threadIdx.x; - sum = VecTraits::all(0); + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)]; + } - sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)]; - sum = sum + (tidx % 2 == 0) * 0.375f * smem1[1 + threadIdx.y / 2][1 + ((tidx ) >> 1)]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)]; - sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)]; - - smem2[2 + threadIdx.y][tidx] = sum; + s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum; if (threadIdx.y < 2) { - sum = VecTraits::all(0); + sum = VecTraits::all(0); - sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx - 1) >> 1)]; - sum = sum + (tidx % 2 == 0) * 0.375f * smem1[0][1 + ((tidx ) >> 1)]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem1[0][1 + ((tidx + 1) >> 1)]; - sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)]; + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } - smem2[threadIdx.y][tidx] = sum; + s_dstPatch[threadIdx.y][threadIdx.x] = sum; } if (threadIdx.y > 13) { - sum = VecTraits::all(0); + sum = VecTraits::all(0); - sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx - 1) >> 1)]; - sum = sum + (tidx % 2 == 0) * 0.375f * smem1[9][1 + ((tidx ) >> 1)]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem1[9][1 + ((tidx + 1) >> 1)]; - sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)]; + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + } - smem2[4 + threadIdx.y][tidx] = sum; + s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum; } __syncthreads(); - sum = VecTraits::all(0); + sum = VecTraits::all(0); - sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y - 1][tidx]; - sum = sum + (tidx % 2 == 0) * 0.375f * smem2[2 + threadIdx.y ][tidx]; - sum = sum + (tidx % 2 != 0) * 0.25f * smem2[2 + threadIdx.y + 1][tidx]; - sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx]; + const int tidy = threadIdx.y; + + sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][threadIdx.x]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][threadIdx.x]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][threadIdx.x]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x]; if (x < dst.cols && y < dst.rows) - dst.ptr(y)[x] = saturate_cast(4.0f * sum); + dst(y, x) = saturate_cast(4.0f * sum); } template class B> void pyrUp_caller(const DevMem2D_& src, const DevMem2D_& dst, cudaStream_t stream) @@ -125,8 +143,9 @@ namespace cv { namespace gpu { namespace device const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); B b(src.rows, src.cols); + BorderReader< PtrStep, B > srcReader(src, b); - pyrUp<<>>(src, dst, b); + pyrUp<<>>(srcReader, dst); cudaSafeCall( cudaGetLastError() ); if (stream == 0) diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index 19552e758..41e87e2ee 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -3533,8 +3533,7 @@ PARAM_TEST_CASE(PyrUp, cv::gpu::DeviceInfo, MatType, UseRoi) cv::gpu::DeviceInfo devInfo; int type; bool useRoi; - - cv::Size size; + cv::Mat src; cv::Mat dst_gold; @@ -3549,7 +3548,7 @@ PARAM_TEST_CASE(PyrUp, cv::gpu::DeviceInfo, MatType, UseRoi) cv::RNG& rng = TS::ptr()->get_rng(); - size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200)); + cv::Size size(rng.uniform(200, 400), rng.uniform(200, 400)); src = randomMat(rng, size, type, 0.0, 255.0, false); @@ -3563,11 +3562,12 @@ TEST_P(PyrUp, Accuracy) cv::gpu::GpuMat d_dst; - cv::gpu::pyrUp(loadMat(src, useRoi), d_dst); + cv::gpu::pyrUp(loadMat(src, useRoi), d_dst, cv::BORDER_REFLECT); d_dst.download(dst); - EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0); + // results differs only on border left and top border due different border extrapolation type + EXPECT_MAT_NEAR(dst_gold(cv::Range(1, dst_gold.rows), cv::Range(1, dst_gold.cols)), dst(cv::Range(1, dst_gold.rows), cv::Range(1, dst_gold.cols)), 1.0); } INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, Combine(