Fix area resize for scale factors that do not evenly divide 128
This commit is contained in:
parent
0f01d8df1c
commit
db08656a38
@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
template<typename T, typename W>
|
||||
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines)
|
||||
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
|
||||
{
|
||||
extern __shared__ W sbuf[];
|
||||
|
||||
@ -545,11 +545,14 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
// load line-block on shared memory
|
||||
int y = blockIdx.x / thred_lines;
|
||||
int input_stride = (blockIdx.x - y * thred_lines) * blockDim.x;
|
||||
int input_stride = (blockIdx.x % thred_lines) * stride;
|
||||
int x = input_stride + tid;
|
||||
|
||||
// store global data in shared memory
|
||||
sbuf[tid] = src(y, x);
|
||||
if (x < src.cols && y < src.rows)
|
||||
sbuf[tid] = src(y, x);
|
||||
else
|
||||
sbuf[tid] = 0;
|
||||
__syncthreads();
|
||||
|
||||
scan_block<inclusive, W>(sbuf);
|
||||
@ -575,7 +578,7 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
template<typename T, typename W>
|
||||
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines)
|
||||
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
|
||||
{
|
||||
extern __shared__ W sbuf[];
|
||||
|
||||
@ -584,13 +587,15 @@ namespace cv { namespace gpu { namespace device
|
||||
// load line-block on shared memory
|
||||
int x = blockIdx.x / thred_lines;
|
||||
|
||||
int global_stride = (blockIdx.x % thred_lines) * blockDim.x;
|
||||
if (!tid) printf("STRIDE : %d", global_stride);
|
||||
int global_stride = (blockIdx.x % thred_lines) * stride;
|
||||
int y = global_stride + tid;
|
||||
|
||||
// store global data in shared memory
|
||||
if (x < src.cols && y < src.rows)
|
||||
sbuf[tid] = src(y, x);
|
||||
else
|
||||
sbuf[tid] = 0;
|
||||
|
||||
sbuf[tid] = src(y, x);
|
||||
__syncthreads();
|
||||
scan_block<inclusive, W>(sbuf);
|
||||
|
||||
@ -623,28 +628,30 @@ namespace cv { namespace gpu { namespace device
|
||||
int iscale_x = round(fx);
|
||||
int iscale_y = round(fy);
|
||||
|
||||
const int warps = 4;
|
||||
int warps = 4;
|
||||
const int threads = 32 * warps;
|
||||
int input_stride = threads / iscale_x;
|
||||
|
||||
int thred_lines = divUp(src.cols, threads);
|
||||
int thred_lines = divUp(src.cols, input_stride * iscale_x);
|
||||
int blocks = src.rows * thred_lines;
|
||||
|
||||
printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",
|
||||
src.cols, warps, threads, thred_lines, blocks);
|
||||
printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d input strude %d\n",
|
||||
src.cols, warps, threads, thred_lines, blocks, input_stride * iscale_x);
|
||||
|
||||
typedef typename scan_traits<T>::scan_line_type smem_type;
|
||||
|
||||
resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
|
||||
(src, buffer, iscale_x, iscale_y, thred_lines);
|
||||
(src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
|
||||
|
||||
thred_lines = divUp(src.rows, threads);
|
||||
input_stride = threads / iscale_y;
|
||||
thred_lines = divUp(src.rows, input_stride * iscale_y);
|
||||
blocks = dst.cols * thred_lines;
|
||||
|
||||
printf("device code executed for Y coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",
|
||||
dst.rows, warps, threads, thred_lines, blocks);
|
||||
|
||||
resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
|
||||
(buffer, dst, iscale_x, iscale_y, thred_lines);
|
||||
(buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
|
@ -95,6 +95,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize,
|
||||
CV_Assert( (fx < 1.0) && (fy < 1.0));
|
||||
CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
|
||||
CV_Assert(src.cols >= 128 && src.rows >= 128);
|
||||
CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);
|
||||
|
||||
if (dsize == Size())
|
||||
dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
|
||||
|
@ -201,13 +201,13 @@ TEST_P(ResizeArea, Accuracy)
|
||||
cv::Mat gpu;
|
||||
dst.download(gpu);
|
||||
|
||||
std::cout //<< src
|
||||
// std::cout // << src
|
||||
// // << std::endl << std::endl
|
||||
// // << gpu_buff
|
||||
// // << std::endl << std::endl
|
||||
// << gpu
|
||||
// << std::endl << std::endl
|
||||
// << gpu_buff
|
||||
// << std::endl << std::endl
|
||||
<< gpu
|
||||
<< std::endl << std::endl
|
||||
<< dst_cpu<< std::endl;
|
||||
// << dst_cpu<< std::endl;
|
||||
|
||||
|
||||
EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
|
||||
@ -215,9 +215,9 @@ TEST_P(ResizeArea, Accuracy)
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
|
||||
ALL_DEVICES,
|
||||
testing::Values(cv::Size(640, 10 * 128)),//DIFFERENT_SIZES,
|
||||
testing::Values(cv::Size(640, 480)),//DIFFERENT_SIZES,
|
||||
testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
|
||||
testing::Values(0.1),
|
||||
testing::Values(0.05, 0.1),
|
||||
testing::Values(Interpolation(cv::INTER_AREA)),
|
||||
WHOLE_SUBMAT));
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user