diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 9769bfad1..4c1190a01 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -1720,7 +1720,7 @@ CV_FLAGS(GHMethod, cv::GHT_POSITION, cv::GHT_SCALE, cv::GHT_ROTATION); DEF_PARAM_TEST(Method_Sz, GHMethod, cv::Size); -PERF_TEST_P(Method_Sz, GeneralizedHough, Combine( +PERF_TEST_P(Method_Sz, ImgProc_GeneralizedHough, Combine( Values(GHMethod(cv::GHT_POSITION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE), GHMethod(cv::GHT_POSITION | cv::GHT_ROTATION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE | cv::GHT_ROTATION)), GPU_TYPICAL_MAT_SIZES)) { diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index 307e87ad3..f283bf693 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -89,20 +89,45 @@ namespace cv { namespace gpu { namespace device const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; - //Upper halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); + if (blockIdx.y > 0) + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); + } + else + { + //Upper halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); + } - //Main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); + if (blockIdx.y + 2 < gridDim.y) + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + j * BLOCK_DIM_Y, x)); - //Lower halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); + } + else + { + //Main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); + + //Lower halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); + } __syncthreads(); diff --git a/modules/gpu/src/cuda/pyr_down.cu b/modules/gpu/src/cuda/pyr_down.cu index 12c485f8c..ca9a598d7 100644 --- a/modules/gpu/src/cuda/pyr_down.cu +++ b/modules/gpu/src/cuda/pyr_down.cu @@ -40,7 +40,7 @@ // //M*/ -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/vec_traits.hpp" #include "opencv2/gpu/device/vec_math.hpp" @@ -50,57 +50,104 @@ namespace cv { namespace gpu { namespace device { namespace imgproc { - template __global__ void pyrDown(const PtrStep src, PtrStep dst, const B b, int dst_cols) + template __global__ void pyrDown(const PtrStepSz src, PtrStep dst, const B b, int dst_cols) { - typedef typename TypeVec::cn>::vec_type value_type; + typedef typename TypeVec::cn>::vec_type work_t; + + __shared__ work_t smem[256 + 4]; const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y; - __shared__ value_type smem[256 + 4]; + const int src_y = 2 * y; - value_type sum; - - const int src_y = 2*y; - - sum = VecTraits::all(0); - - sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step); - sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step); - sum = sum + 0.375f * b.at(src_y , x, src.data, src.step); - sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step); - sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step); - - smem[2 + threadIdx.x] = sum; - - if (threadIdx.x < 2) + if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2) { - const int left_x = x - 2; + { + work_t sum; - sum = VecTraits::all(0); + sum = 0.0625f * src(src_y - 2, x); + sum = sum + 0.25f * src(src_y - 1, x); + sum = sum + 0.375f * src(src_y , x); + sum = sum + 0.25f * src(src_y + 1, x); + sum = sum + 0.0625f * src(src_y + 2, x); - sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step); - sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step); - sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step); - sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step); - sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step); + smem[2 + threadIdx.x] = sum; + } - smem[threadIdx.x] = sum; + if (threadIdx.x < 2) + { + const int left_x = x - 2; + + work_t sum; + + sum = 0.0625f * src(src_y - 2, left_x); + sum = sum + 0.25f * src(src_y - 1, left_x); + sum = sum + 0.375f * src(src_y , left_x); + sum = sum + 0.25f * src(src_y + 1, left_x); + sum = sum + 0.0625f * src(src_y + 2, left_x); + + smem[threadIdx.x] = sum; + } + + if (threadIdx.x > 253) + { + const int right_x = x + 2; + + work_t sum; + + sum = 0.0625f * src(src_y - 2, right_x); + sum = sum + 0.25f * src(src_y - 1, right_x); + sum = sum + 0.375f * src(src_y , right_x); + sum = sum + 0.25f * src(src_y + 1, right_x); + sum = sum + 0.0625f * src(src_y + 2, right_x); + + smem[4 + threadIdx.x] = sum; + } } - - if (threadIdx.x > 253) + else { - const int right_x = x + 2; + { + work_t sum; - sum = VecTraits::all(0); + sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x)); + sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x)); + sum = sum + 0.375f * src(src_y , b.idx_col_high(x)); + sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x)); + sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x)); - sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step); - sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step); - sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step); - sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step); - sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step); + smem[2 + threadIdx.x] = sum; + } - smem[4 + threadIdx.x] = sum; + if (threadIdx.x < 2) + { + const int left_x = x - 2; + + work_t sum; + + sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x)); + sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x)); + sum = sum + 0.375f * src(src_y , b.idx_col(left_x)); + sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x)); + sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x)); + + smem[threadIdx.x] = sum; + } + + if (threadIdx.x > 253) + { + const int right_x = x + 2; + + work_t sum; + + sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x)); + sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x)); + sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x)); + sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x)); + sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x)); + + smem[4 + threadIdx.x] = sum; + } } __syncthreads(); @@ -109,9 +156,9 @@ namespace cv { namespace gpu { namespace device { const int tid2 = threadIdx.x * 2; - sum = VecTraits::all(0); + work_t sum; - sum = sum + 0.0625f * smem[2 + tid2 - 2]; + sum = 0.0625f * smem[2 + tid2 - 2]; sum = sum + 0.25f * smem[2 + tid2 - 1]; sum = sum + 0.375f * smem[2 + tid2 ]; sum = sum + 0.25f * smem[2 + tid2 + 1]; diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu index 8d96c796f..5b3d044ea 100644 --- a/modules/gpu/src/cuda/row_filter.cu +++ b/modules/gpu/src/cuda/row_filter.cu @@ -89,20 +89,45 @@ namespace cv { namespace gpu { namespace device const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; - //Load left halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); + if (blockIdx.x > 0) + { + //Load left halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); + } + else + { + //Load left halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); + } - //Load main data - #pragma unroll - for (int j = 0; j < PATCH_PER_BLOCK; ++j) - smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); + if (blockIdx.x + 2 < gridDim.x) + { + //Load main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + j * BLOCK_DIM_X]); - //Load right halo - #pragma unroll - for (int j = 0; j < HALO_SIZE; ++j) - smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); + //Load right halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); + } + else + { + //Load main data + #pragma unroll + for (int j = 0; j < PATCH_PER_BLOCK; ++j) + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); + + //Load right halo + #pragma unroll + for (int j = 0; j < HALO_SIZE; ++j) + smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); + } __syncthreads();