diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index b7ef9633d..d3eeb62fe 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -1058,74 +1058,39 @@ template <> struct index_and_sizeof template void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel, int ksize, int anchor, int bordertype) { - Context *clCxt = src.clCxt; + CV_Assert(bordertype <= BORDER_REFLECT_101); + CV_Assert(ksize == (anchor << 1) + 1); int channels = src.oclchannels(); - size_t localThreads[3] = {16, 16, 1}; - string kernelName = "row_filter"; + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; - char btype[30]; - - switch (bordertype) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - sprintf(btype, "BORDER_WRAP"); - break; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - - char compile_option[128]; - sprintf(compile_option, "-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", anchor, (int)localThreads[0], (int)localThreads[1], channels, btype); - - size_t globalThreads[3]; - globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; - globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2]; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }; + std::string buildOptions = format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s", + anchor, (int)localThreads[0], (int)localThreads[1], channels, borderMap[bordertype]); if (src.depth() == CV_8U) { switch (channels) { case 1: - case 3: - globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; + globalThreads[0] = (dst.cols + 3) >> 2; break; case 2: - globalThreads[0] = ((dst.cols + 1) / 2 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; + globalThreads[0] = (dst.cols + 1) >> 1; break; case 4: - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; + globalThreads[0] = dst.cols; break; } } - else - { - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - } - //sanity checks - CV_Assert(clCxt == dst.clCxt); - CV_Assert(src.cols == dst.cols); - CV_Assert(src.oclchannels() == dst.oclchannels()); - CV_Assert(ksize == (anchor << 1) + 1); - int src_pix_per_row, dst_pix_per_row; - int src_offset_x, src_offset_y;//, dst_offset_in_pixel; - src_pix_per_row = src.step / src.elemSize(); - src_offset_x = (src.offset % src.step) / src.elemSize(); - src_offset_y = src.offset / src.step; - dst_pix_per_row = dst.step / dst.elemSize(); - //dst_offset_in_pixel = dst.offset / dst.elemSize(); + int src_pix_per_row = src.step / src.elemSize(); + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + int dst_pix_per_row = dst.step / dst.elemSize(); int ridusy = (dst.rows - src.rows) >> 1; + vector > args; args.push_back(make_pair(sizeof(cl_mem), &src.data)); args.push_back(make_pair(sizeof(cl_mem), &dst.data)); @@ -1140,7 +1105,8 @@ void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel args.push_back(make_pair(sizeof(cl_int), (void *)&ridusy)); args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); - openCLExecuteKernel(clCxt, &filter_sep_row, kernelName, globalThreads, localThreads, args, channels, src.depth(), compile_option); + openCLExecuteKernel(src.clCxt, &filter_sep_row, "row_filter", globalThreads, localThreads, + args, channels, src.depth(), buildOptions.c_str()); } Ptr cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype) diff --git a/modules/ocl/src/opencl/filter_sep_col.cl b/modules/ocl/src/opencl/filter_sep_col.cl index 60ce51360..8dd77d5a9 100644 --- a/modules/ocl/src/opencl/filter_sep_col.cl +++ b/modules/ocl/src/opencl/filter_sep_col.cl @@ -47,36 +47,6 @@ #define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0) #endif -#ifdef BORDER_CONSTANT -//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii -#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2) -#endif - -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? (l_edge) : (i) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (r_edge)-1 : (addr) -#endif - -#ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? -(i)-1 : (i) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr) -#endif - -#ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? -(i) : (i) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr) -#endif - -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i,l_edge,r_edge) (i) < (l_edge) ? (i)+(r_edge) : (i) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (i)-(r_edge) : (addr) -#endif - - /********************************************************************************** These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur. Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle @@ -107,15 +77,16 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter { int x = get_global_id(0); int y = get_global_id(1); + int l_x = get_local_id(0); int l_y = get_local_id(1); - int start_addr = mad24(y,src_step_in_pixel,x); - int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); - int i; - GENTYPE_SRC sum; - GENTYPE_SRC temp[READ_TIMES_COL]; - __local GENTYPE_SRC LDS_DAT[LSIZE1*READ_TIMES_COL][LSIZE0+1]; + int start_addr = mad24(y, src_step_in_pixel, x); + int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); + + int i; + GENTYPE_SRC sum, temp[READ_TIMES_COL]; + __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; //read pixels from src for(i = 0;i= (r_edge) ? (elem1) : (elem2) -#endif - -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? (l_edge) : (addr) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (r_edge)-1 : (addr) -#endif - +#elif defined BORDER_REPLICATE +#define EXTRAPOLATE(x, maxV) \ + { \ + x = max(min(x, maxV - 1), 0); \ + } +#elif defined BORDER_WRAP +#define EXTRAPOLATE(x, maxV) \ + { \ + if (x < 0) \ + x -= ((x - maxV + 1) / maxV) * maxV; \ + if (x >= maxV) \ + x %= maxV; \ + } +#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) +#define EXTRAPOLATE_(x, maxV, delta) \ + { \ + if (maxV == 1) \ + x = 0; \ + else \ + do \ + { \ + if ( x < 0 ) \ + x = -x - 1 + delta; \ + else \ + x = maxV - 1 - (x - maxV) - delta; \ + } \ + while (x >= maxV || x < 0); \ + } #ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? -(i)-1 : (addr) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr) +#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0) +#else +#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1) #endif - -#ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? -(i) : (addr) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr) -#endif - -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i,l_edge,r_edge,addr) (i) < (l_edge) ? (i)+(r_edge) : (addr) -#define ADDR_R(i,r_edge,addr) (i) >= (r_edge) ? (i)-(r_edge) : (addr) +#else +#error No extrapolation method #endif /********************************************************************************** @@ -96,73 +105,71 @@ The info above maybe obsolete. ***********************************************************************************/ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 -(__global const uchar * restrict src, - __global float * dst, - const int dst_cols, - const int dst_rows, - const int src_whole_cols, - const int src_whole_rows, - const int src_step_in_pixel, - const int src_offset_x, - const int src_offset_y, - const int dst_step_in_pixel, - const int radiusy, - __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) + (__global uchar * restrict src, + __global float * dst, + int dst_cols, int dst_rows, + int src_whole_cols, int src_whole_rows, + int src_step_in_pixel, + int src_offset_x, int src_offset_y, + int dst_step_in_pixel, int radiusy, + __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1))))) { int x = get_global_id(0)<<2; int y = get_global_id(1); int l_x = get_local_id(0); int l_y = get_local_id(1); - int start_x = x+src_offset_x-RADIUSX & 0xfffffffc; - int offset = src_offset_x-RADIUSX & 3; - int start_y = y+src_offset_y-radiusy; - int start_addr = mad24(start_y,src_step_in_pixel,start_x); + + int start_x = x+src_offset_x - RADIUSX & 0xfffffffc; + int offset = src_offset_x - RADIUSX & 3; + int start_y = y + src_offset_y - radiusy; + int start_addr = mad24(start_y, src_step_in_pixel, start_x); int i; float4 sum; uchar4 temp[READ_TIMES_ROW]; __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; #ifdef BORDER_CONSTANT - int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); - //read pixels from src - for(i = 0; i 0)) ? current_addr : 0; temp[i] = *(__global uchar4*)&src[current_addr]; } - //judge if read out of boundary - for(i = 0; isrc_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); int4 index[READ_TIMES_ROW]; int4 addr; int s_y; - if(not_all_in_range) + + if (not_all_in_range) { - //judge if read out of boundary - for(i = 0; i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } + //judge if read out of boundary - for(i = 0; i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } - //judge if read out of boundary - for(i = 0; i 0)) ? current_addr : 0; temp[i] = src[current_addr]; } - //judge if read out of boundary - for(i = 0; i