simplified cv::sepFilter2D OpenCL part
This commit is contained in:
		| @@ -41,6 +41,7 @@ | ||||
| //M*/ | ||||
|  | ||||
| #include "precomp.hpp" | ||||
| #define CV_OPENCL_RUN_ASSERT | ||||
| #include "opencl_kernels.hpp" | ||||
| #include <sstream> | ||||
|  | ||||
| @@ -3317,11 +3318,9 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, | ||||
|     return kernel.run(2, globalsize, localsize, true); | ||||
| } | ||||
|  | ||||
| static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync) | ||||
| static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType) | ||||
| { | ||||
|     int type = src.type(); | ||||
|     int cn = CV_MAT_CN(type); | ||||
|     int sdepth = CV_MAT_DEPTH(type); | ||||
|     int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); | ||||
|     Size bufSize = buf.size(); | ||||
|  | ||||
| #ifdef ANDROID | ||||
| @@ -3329,27 +3328,14 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, | ||||
| #else | ||||
|     size_t localsize[2] = {16, 16}; | ||||
| #endif | ||||
|  | ||||
|     size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; | ||||
|     if (CV_8U == sdepth) | ||||
|     { | ||||
|         switch (cn) | ||||
|         { | ||||
|         case 1: | ||||
|     if (type == CV_8UC1) | ||||
|         globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; | ||||
|             break; | ||||
|         case 2: | ||||
|             globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0]; | ||||
|             break; | ||||
|         case 4: | ||||
|             globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0]; | ||||
|             break; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     int radiusX = anchor; | ||||
|     int radiusY = (int)((buf.rows - src.rows) >> 1); | ||||
|     int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; | ||||
|  | ||||
|     bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; | ||||
|     bool isolated = (borderType & BORDER_ISOLATED) != 0; | ||||
|     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, | ||||
|         * const btype = borderMap[borderType & ~BORDER_ISOLATED]; | ||||
|  | ||||
| @@ -3358,49 +3344,38 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, | ||||
|     extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; | ||||
|     extra_extrapolation |= src.cols < radiusX; | ||||
|  | ||||
|     cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s", | ||||
|         radiusX, (int)localsize[0], (int)localsize[1], cn, | ||||
|         btype, | ||||
|     char cvt[40]; | ||||
|     cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" | ||||
|                                           " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s", | ||||
|                                           radiusX, (int)localsize[0], (int)localsize[1], cn, btype, | ||||
|                                           extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", | ||||
|         isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); | ||||
|                                           isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", | ||||
|                                           ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), | ||||
|                                           ocl::convertTypeStr(sdepth, CV_32F, cn, cvt), | ||||
|                                           ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F)); | ||||
|     build_options += ocl::kernelToStr(kernelX, CV_32F); | ||||
|  | ||||
|     Size srcWholeSize; Point srcOffset; | ||||
|     src.locateROI(srcWholeSize, srcOffset); | ||||
|  | ||||
|     std::stringstream strKernel; | ||||
|     strKernel << "row_filter"; | ||||
|     if (-1 != cn) | ||||
|         strKernel << "_C" << cn; | ||||
|     if (-1 != sdepth) | ||||
|         strKernel << "_D" << sdepth; | ||||
|     String kernelName("row_filter"); | ||||
|     if (type == CV_8UC1) | ||||
|         kernelName += "_C1_D0"; | ||||
|  | ||||
|     ocl::Kernel kernelRow; | ||||
|     if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, | ||||
|                           build_options)) | ||||
|     ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, | ||||
|                   build_options); | ||||
|     if (k.empty()) | ||||
|         return false; | ||||
|  | ||||
|     int idxArg = 0; | ||||
|     idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src)); | ||||
|     idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize())); | ||||
|     k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, | ||||
|            srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, | ||||
|            ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), | ||||
|            buf.cols, buf.rows, radiusY); | ||||
|  | ||||
|     idxArg = kernelRow.set(idxArg, srcOffset.x); | ||||
|     idxArg = kernelRow.set(idxArg, srcOffset.y); | ||||
|     idxArg = kernelRow.set(idxArg, src.cols); | ||||
|     idxArg = kernelRow.set(idxArg, src.rows); | ||||
|     idxArg = kernelRow.set(idxArg, srcWholeSize.width); | ||||
|     idxArg = kernelRow.set(idxArg, srcWholeSize.height); | ||||
|  | ||||
|     idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf)); | ||||
|     idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize())); | ||||
|     idxArg = kernelRow.set(idxArg, buf.cols); | ||||
|     idxArg = kernelRow.set(idxArg, buf.rows); | ||||
|     idxArg = kernelRow.set(idxArg, radiusY); | ||||
|  | ||||
|     return kernelRow.run(2, globalsize, localsize, sync); | ||||
|     return k.run(2, globalsize, localsize, false); | ||||
| } | ||||
|  | ||||
| static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync) | ||||
| static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor) | ||||
| { | ||||
| #ifdef ANDROID | ||||
|     size_t localsize[2] = {16, 10}; | ||||
| @@ -3420,28 +3395,23 @@ static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anc | ||||
|         globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; | ||||
|  | ||||
|     char cvt[40]; | ||||
|     cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", | ||||
|                     anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()), | ||||
|                                           ocl::typeToStr(dtype), ocl::convertTypeStr(CV_32F, ddepth, cn, cvt)); | ||||
|     cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" | ||||
|                                           " -D srcT=%s -D dstT=%s -D convertToDstT=%s", | ||||
|                                           anchor, (int)localsize[0], (int)localsize[1], cn, | ||||
|                                           ocl::typeToStr(buf.type()), ocl::typeToStr(dtype), | ||||
|                                           ocl::convertTypeStr(CV_32F, ddepth, cn, cvt)); | ||||
|     build_options += ocl::kernelToStr(kernelY, CV_32F); | ||||
|  | ||||
|     ocl::Kernel kernelCol; | ||||
|     if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options)) | ||||
|     ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, | ||||
|                   build_options); | ||||
|     if (k.empty()) | ||||
|         return false; | ||||
|  | ||||
|     int idxArg = 0; | ||||
|     idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf)); | ||||
|     idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize())); | ||||
|     idxArg = kernelCol.set(idxArg, buf.cols); | ||||
|     idxArg = kernelCol.set(idxArg, buf.rows); | ||||
|     k.args(ocl::KernelArg::PtrReadOnly(buf), (int)(buf.step / buf.elemSize()), buf.cols, | ||||
|            buf.rows, ocl::KernelArg::PtrWriteOnly(dst), (int)(dst.offset / dst.elemSize()), | ||||
|            (int)(dst.step / dst.elemSize()), dst.cols, dst.rows); | ||||
|  | ||||
|     idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); | ||||
|     idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize())); | ||||
|     idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize())); | ||||
|     idxArg = kernelCol.set(idxArg, dst.cols); | ||||
|     idxArg = kernelCol.set(idxArg, dst.rows); | ||||
|  | ||||
|     return kernelCol.run(2, globalsize, localsize, sync); | ||||
|     return k.run(2, globalsize, localsize, false); | ||||
| } | ||||
|  | ||||
| const int optimizedSepFilterLocalSize = 16; | ||||
| @@ -3473,12 +3443,14 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, | ||||
|  | ||||
|     String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" | ||||
|                              " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" | ||||
|                              " -D %s", (int)lt2[0], (int)lt2[1], _row_kernel.size().height / 2, _col_kernel.size().height / 2, | ||||
|                              " -D %s -D srcT1=%s -D dstT1=%s -D cn=%d", (int)lt2[0], (int)lt2[1], | ||||
|                              _row_kernel.size().height / 2, _col_kernel.size().height / 2, | ||||
|                              ocl::kernelToStr(_row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), | ||||
|                              ocl::kernelToStr(_col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), | ||||
|                              ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), | ||||
|                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), | ||||
|                              ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType]); | ||||
|                              ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], | ||||
|                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); | ||||
|  | ||||
|     ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); | ||||
|     if (k.empty()) | ||||
| @@ -3529,10 +3501,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, | ||||
|     if (ddepth < 0) | ||||
|         ddepth = sdepth; | ||||
|  | ||||
|     CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && | ||||
|         imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && | ||||
|         imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1), | ||||
|         ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true) | ||||
| //    printf("%d %d\n", imgSize.width, optimizedSepFilterLocalSize + (kernelX.rows >> 1)); | ||||
| //    printf("%d %d\n", imgSize.height, optimizedSepFilterLocalSize + (kernelY.rows >> 1)); | ||||
|  | ||||
| //    CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && | ||||
| //        imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && | ||||
| //        imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1), | ||||
| //        ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true) | ||||
|  | ||||
|     UMat src = _src.getUMat(); | ||||
|     Size srcWholeSize; Point srcOffset; | ||||
| @@ -3546,12 +3521,12 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, | ||||
|     Size srcSize = src.size(); | ||||
|     Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); | ||||
|     UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn)); | ||||
|     if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false)) | ||||
|     if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType)) | ||||
|         return false; | ||||
|  | ||||
|     _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); | ||||
|     UMat dst = _dst.getUMat(); | ||||
|     return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false); | ||||
|     return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y); | ||||
| } | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -36,16 +36,6 @@ | ||||
|  | ||||
| #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) | ||||
| #define RADIUS 1 | ||||
| #if CN ==1 | ||||
| #define ALIGN (((RADIUS)+3)>>2<<2) | ||||
| #elif CN==2 | ||||
| #define ALIGN (((RADIUS)+1)>>1<<1) | ||||
| #elif CN==3 | ||||
| #define ALIGN (((RADIUS)+3)>>2<<2) | ||||
| #elif CN==4 | ||||
| #define ALIGN (RADIUS) | ||||
| #define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0) | ||||
| #endif | ||||
|  | ||||
| #define noconvert | ||||
|  | ||||
| @@ -65,16 +55,8 @@ The info above maybe obsolete. | ||||
| #define DIG(a) a, | ||||
| __constant float mat_kernel[] = { COEFF }; | ||||
|  | ||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter | ||||
|                         (__global const GENTYPE_SRC * restrict src, | ||||
|                          const int src_step_in_pixel, | ||||
|                          const int src_whole_cols, | ||||
|                          const int src_whole_rows, | ||||
|                          __global GENTYPE_DST * dst, | ||||
|                          const int dst_offset_in_pixel, | ||||
|                          const int dst_step_in_pixel, | ||||
|                          const int dst_cols, | ||||
|                          const int dst_rows) | ||||
| __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int src_whole_cols, int src_whole_rows, | ||||
|                          __global dstT * dst, int dst_offset_in_pixel, int dst_step_in_pixel, int dst_cols, int dst_rows) | ||||
| { | ||||
|     int x = get_global_id(0); | ||||
|     int y = get_global_id(1); | ||||
| @@ -85,35 +67,35 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter | ||||
|     int start_addr = mad24(y, src_step_in_pixel, x); | ||||
|     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); | ||||
|  | ||||
|     int i; | ||||
|     GENTYPE_SRC sum, temp[READ_TIMES_COL]; | ||||
|     __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; | ||||
|     srcT sum, temp[READ_TIMES_COL]; | ||||
|     __local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; | ||||
|  | ||||
|     // read pixels from src | ||||
|     for(i = 0;i<READ_TIMES_COL;i++) | ||||
|     for (int i = 0; i < READ_TIMES_COL; ++i) | ||||
|     { | ||||
|         int current_addr = start_addr+i*LSIZE1*src_step_in_pixel; | ||||
|         int current_addr = mad24(i, LSIZE1 * src_step_in_pixel, start_addr); | ||||
|         current_addr = current_addr < end_addr ? current_addr : 0; | ||||
|         temp[i] = src[current_addr]; | ||||
|     } | ||||
|  | ||||
|     // save pixels to lds | ||||
|     for(i = 0;i<READ_TIMES_COL;i++) | ||||
|     { | ||||
|         LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i]; | ||||
|     } | ||||
|     for (int i = 0; i < READ_TIMES_COL; ++i) | ||||
|         LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i]; | ||||
|     barrier(CLK_LOCAL_MEM_FENCE); | ||||
|  | ||||
|     // read pixels from lds and calculate the result | ||||
|     sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY]; | ||||
|     for(i=1;i<=RADIUSY;i++) | ||||
|     for (int i = 1; i <= RADIUSY; ++i) | ||||
|     { | ||||
|         temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; | ||||
|         temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; | ||||
|         sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i]; | ||||
|         sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); | ||||
|     } | ||||
|  | ||||
|     // write the result to dst | ||||
|     if((x<dst_cols) & (y<dst_rows)) | ||||
|     if (x < dst_cols && y < dst_rows) | ||||
|     { | ||||
|         start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel); | ||||
|         dst[start_addr] = convert_to_DST(sum); | ||||
|         dst[start_addr] = convertToDstT(sum); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -35,18 +35,7 @@ | ||||
| // | ||||
|  | ||||
| #define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only | ||||
| #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) | ||||
| //#pragma OPENCL EXTENSION cl_amd_printf : enable | ||||
| #define RADIUS 1 | ||||
| #if CN ==1 | ||||
| #define ALIGN (((RADIUS)+3)>>2<<2) | ||||
| #elif CN==2 | ||||
| #define ALIGN (((RADIUS)+1)>>1<<1) | ||||
| #elif CN==3 | ||||
| #define ALIGN (((RADIUS)+3)>>2<<2) | ||||
| #elif CN==4 | ||||
| #define ALIGN (RADIUS) | ||||
| #endif | ||||
|  | ||||
| #ifdef BORDER_REPLICATE | ||||
| // BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh | ||||
| @@ -66,7 +55,6 @@ | ||||
| #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) | ||||
| #endif | ||||
|  | ||||
| //blur function does not support BORDER_WRAP | ||||
| #ifdef BORDER_WRAP | ||||
| // BORDER_WRAP: cdefgh|abcdefgh|abcdefg | ||||
| #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i)) | ||||
| @@ -127,35 +115,26 @@ | ||||
|     #endif //BORDER_CONSTANT | ||||
| #endif //EXTRA_EXTRAPOLATION | ||||
|  | ||||
| /********************************************************************************** | ||||
| These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur. | ||||
| Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle | ||||
| kernel must be in the center. ROI is not supported either. | ||||
| For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3, | ||||
| the kernel read 4 pixels, save them to LDS and read the data needed from LDS to | ||||
| calculate the result. | ||||
| The length of the convovle kernel supported is related to the LSIZE0 and the MAX size | ||||
| of LDS, which is HW related. | ||||
| For channels = 1,3 the RADIUS is no more than LSIZE0*2 | ||||
| For channels = 2, the RADIUS is no more than LSIZE0 | ||||
| For channels = 4, arbitary RADIUS is supported unless the LDS is not enough | ||||
| Niko | ||||
| 6/29/2011 | ||||
| The info above maybe obsolete. | ||||
| ***********************************************************************************/ | ||||
| #define noconvert | ||||
|  | ||||
| #if cn != 3 | ||||
| #define loadpix(addr) *(__global const srcT *)(addr) | ||||
| #define storepix(val, addr)  *(__global dstT *)(addr) = val | ||||
| #define SRCSIZE ((int)sizeof(srcT)) | ||||
| #define DSTSIZE ((int)sizeof(dstT)) | ||||
| #else | ||||
| #define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr)) | ||||
| #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) | ||||
| #define SRCSIZE ((int)sizeof(srcT1)*3) | ||||
| #define DSTSIZE ((int)sizeof(dstT1)*3) | ||||
| #endif | ||||
|  | ||||
| #define DIG(a) a, | ||||
| __constant float mat_kernel[] = { COEFF }; | ||||
|  | ||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 | ||||
|     (__global uchar * restrict src, | ||||
|      int src_step_in_pixel, | ||||
|      int src_offset_x, int src_offset_y, | ||||
|      int src_cols, int src_rows, | ||||
|      int src_whole_cols, int src_whole_rows, | ||||
|      __global float * dst, | ||||
|      int dst_step_in_pixel, | ||||
|      int dst_cols, int dst_rows, | ||||
| __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, | ||||
|                                int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, | ||||
|                                __global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, | ||||
|                                int radiusy) | ||||
| { | ||||
|     int x = get_global_id(0)<<2; | ||||
| @@ -167,7 +146,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|     int offset = src_offset_x - RADIUSX & 3; | ||||
|     int start_y = y + src_offset_y - radiusy; | ||||
|     int start_addr = mad24(start_y, src_step_in_pixel, start_x); | ||||
|     int i; | ||||
|  | ||||
|     float4 sum; | ||||
|     uchar4 temp[READ_TIMES_ROW]; | ||||
|  | ||||
| @@ -176,16 +155,16 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); | ||||
|  | ||||
|     // read pixels from src | ||||
|     for (i = 0; i < READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|     { | ||||
|         int current_addr = start_addr+i*LSIZE0*4; | ||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; | ||||
|         temp[i] = *(__global uchar4*)&src[current_addr]; | ||||
|         int current_addr = mad24(i, LSIZE0 << 2, start_addr); | ||||
|         current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0; | ||||
|         temp[i] = *(__global const uchar4 *)&src[current_addr]; | ||||
|     } | ||||
|  | ||||
|     // judge if read out of boundary | ||||
| #ifdef BORDER_ISOLATED | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|     { | ||||
|         temp[i].x = ELEM(start_x+i*LSIZE0*4,   src_offset_x, src_offset_x + src_cols, 0,         temp[i].x); | ||||
|         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0,         temp[i].y); | ||||
| @@ -194,7 +173,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|         temp[i]   = ELEM(start_y,              src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]); | ||||
|     } | ||||
| #else | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|     { | ||||
|         temp[i].x = ELEM(start_x+i*LSIZE0*4,   0, src_whole_cols, 0,         temp[i].x); | ||||
|         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0,         temp[i].y); | ||||
| @@ -209,16 +188,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
| #else | ||||
|     int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); | ||||
| #endif | ||||
|     int4 index[READ_TIMES_ROW]; | ||||
|     int4 addr; | ||||
|     int4 index[READ_TIMES_ROW], addr; | ||||
|     int s_y; | ||||
|  | ||||
|     if (not_all_in_range) | ||||
|     { | ||||
|         // judge if read out of boundary | ||||
|         for (i = 0; i < READ_TIMES_ROW; i++) | ||||
|         for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|         { | ||||
|             index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3); | ||||
|             index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3); | ||||
| #ifdef BORDER_ISOLATED | ||||
|             EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols); | ||||
|             EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols); | ||||
| @@ -231,6 +209,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|             EXTRAPOLATE(index[i].w, 0, src_whole_cols); | ||||
| #endif | ||||
|         } | ||||
|  | ||||
|         s_y = start_y; | ||||
| #ifdef BORDER_ISOLATED | ||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); | ||||
| @@ -239,7 +218,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
| #endif | ||||
|  | ||||
|         // read pixels from src | ||||
|         for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|         for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|         { | ||||
|             addr = mad24((int4)s_y, (int4)src_step_in_pixel, index[i]); | ||||
|             temp[i].x = src[addr.x]; | ||||
| @@ -251,23 +230,23 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|     else | ||||
|     { | ||||
|         // read pixels from src | ||||
|         for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|             temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4]; | ||||
|         for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|             temp[i] = *(__global uchar4*)&src[mad24(i, LSIZE0 << 2, start_addr)]; | ||||
|     } | ||||
| #endif //BORDER_CONSTANT | ||||
|  | ||||
|     // save pixels to lds | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|         LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i]; | ||||
|     barrier(CLK_LOCAL_MEM_FENCE); | ||||
|  | ||||
|     // read pixels from lds and calculate the result | ||||
|     sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; | ||||
|     for (i=1; i<=RADIUSX; i++) | ||||
|     for (int i = 1; i <= RADIUSX; ++i) | ||||
|     { | ||||
|         temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); | ||||
|         temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); | ||||
|         sum += convert_float4(temp[0]) * mat_kernel[RADIUSX-i] + convert_float4(temp[1]) * mat_kernel[RADIUSX+i]; | ||||
|         sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]); | ||||
|     } | ||||
|  | ||||
|     start_addr = mad24(y, dst_step_in_pixel, x); | ||||
| @@ -290,15 +269,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|         dst[start_addr] = sum.x; | ||||
| } | ||||
|  | ||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0 | ||||
|     (__global uchar4 * restrict src, | ||||
|      int src_step_in_pixel, | ||||
|      int src_offset_x, int src_offset_y, | ||||
|      int src_cols, int src_rows, | ||||
|      int src_whole_cols, int src_whole_rows, | ||||
|      __global float4 * dst, | ||||
|      int dst_step_in_pixel, | ||||
|      int dst_cols, int dst_rows, | ||||
| __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, | ||||
|                          int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, | ||||
|                          __global dstT * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, | ||||
|                          int radiusy) | ||||
| { | ||||
|     int x = get_global_id(0); | ||||
| @@ -308,34 +281,34 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|     int start_x = x + src_offset_x - RADIUSX; | ||||
|     int start_y = y + src_offset_y - radiusy; | ||||
|     int start_addr = mad24(start_y, src_step_in_pixel, start_x); | ||||
|     int i; | ||||
|     float4 sum; | ||||
|     uchar4 temp[READ_TIMES_ROW]; | ||||
|  | ||||
|     __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; | ||||
|     dstT sum; | ||||
|     srcT temp[READ_TIMES_ROW]; | ||||
|  | ||||
|     __local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; | ||||
| #ifdef BORDER_CONSTANT | ||||
|     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); | ||||
|  | ||||
|     // read pixels from src | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         int current_addr = start_addr+i*LSIZE0; | ||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; | ||||
|         int current_addr = mad24(i, LSIZE0, start_addr); | ||||
|         current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0; | ||||
|         temp[i] = src[current_addr]; | ||||
|     } | ||||
|  | ||||
|     // judge if read out of boundary | ||||
| #ifdef BORDER_ISOLATED | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|     { | ||||
|         temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (uchar4)0, temp[i]); | ||||
|         temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]); | ||||
|         temp[i] = ELEM(mad24(i, LSIZE0, start_x), src_offset_x, src_offset_x + src_cols, (srcT)(0), temp[i]); | ||||
|         temp[i] = ELEM(start_y,                   src_offset_y, src_offset_y + src_rows, (srcT)(0), temp[i]); | ||||
|     } | ||||
| #else | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|     { | ||||
|         temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (uchar4)0, temp[i]); | ||||
|         temp[i]= ELEM(start_y,          0, src_whole_rows, (uchar4)0, temp[i]); | ||||
|         temp[i] = ELEM(mad24(i, LSIZE0, start_x), 0, src_whole_cols, (srcT)(0), temp[i]); | ||||
|         temp[i] = ELEM(start_y,                   0, src_whole_rows, (srcT)(0), temp[i]); | ||||
|     } | ||||
| #endif | ||||
| #else | ||||
| @@ -343,10 +316,11 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|     int s_x, s_y; | ||||
|  | ||||
|     // judge if read out of boundary | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|     { | ||||
|         s_x = start_x+i*LSIZE0; | ||||
|         s_x = mad24(i, LSIZE0, start_x); | ||||
|         s_y = start_y; | ||||
|  | ||||
| #ifdef BORDER_ISOLATED | ||||
|         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols); | ||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); | ||||
| @@ -358,22 +332,22 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|     } | ||||
|  | ||||
|     // read pixels from src | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|         temp[i] = src[index[i]]; | ||||
| #endif // BORDER_CONSTANT | ||||
|  | ||||
|     // save pixels to lds | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; | ||||
|     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||
|         LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i]; | ||||
|     barrier(CLK_LOCAL_MEM_FENCE); | ||||
|  | ||||
|     // read pixels from lds and calculate the result | ||||
|     sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX]; | ||||
|     for (i=1; i<=RADIUSX; i++) | ||||
|     sum = convertToDstT(LDS_DAT[l_y][l_x + RADIUSX]) * mat_kernel[RADIUSX]; | ||||
|     for (int i = 1; i <= RADIUSX; ++i) | ||||
|     { | ||||
|         temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i]; | ||||
|         temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; | ||||
|         sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i]; | ||||
|         sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); | ||||
|     } | ||||
|     // write the result to dst | ||||
|     if (x < dst_cols && y < dst_rows) | ||||
| @@ -382,188 +356,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | ||||
|         dst[start_addr] = sum; | ||||
|     } | ||||
| } | ||||
|  | ||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5 | ||||
|     (__global float * restrict src, | ||||
|      int src_step_in_pixel, | ||||
|      int src_offset_x, int src_offset_y, | ||||
|      int src_cols, int src_rows, | ||||
|      int src_whole_cols, int src_whole_rows, | ||||
|      __global float * dst, | ||||
|      int dst_step_in_pixel, | ||||
|      int dst_cols, int dst_rows, | ||||
|      int radiusy) | ||||
| { | ||||
|     int x = get_global_id(0); | ||||
|     int y = get_global_id(1); | ||||
|     int l_x = get_local_id(0); | ||||
|     int l_y = get_local_id(1); | ||||
|     int start_x = x+src_offset_x-RADIUSX; | ||||
|     int start_y = y+src_offset_y-radiusy; | ||||
|     int start_addr = mad24(start_y,src_step_in_pixel,start_x); | ||||
|     int i; | ||||
|     float sum; | ||||
|     float temp[READ_TIMES_ROW]; | ||||
|  | ||||
|     __local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; | ||||
| #ifdef BORDER_CONSTANT | ||||
|     int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); | ||||
|  | ||||
|     // read pixels from src | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         int current_addr = start_addr+i*LSIZE0; | ||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; | ||||
|         temp[i] = src[current_addr]; | ||||
|     } | ||||
|  | ||||
|     // judge if read out of boundary | ||||
| #ifdef BORDER_ISOLATED | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float)0,temp[i]); | ||||
|         temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float)0,temp[i]); | ||||
|     } | ||||
| #else | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float)0,temp[i]); | ||||
|         temp[i]= ELEM(start_y,          0, src_whole_rows, (float)0,temp[i]); | ||||
|     } | ||||
| #endif | ||||
| #else // BORDER_CONSTANT | ||||
|     int index[READ_TIMES_ROW]; | ||||
|     int s_x,s_y; | ||||
|     // judge if read out of boundary | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         s_x = start_x + i*LSIZE0, s_y = start_y; | ||||
| #ifdef BORDER_ISOLATED | ||||
|         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols); | ||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); | ||||
| #else | ||||
|         EXTRAPOLATE(s_x, 0, src_whole_cols); | ||||
|         EXTRAPOLATE(s_y, 0, src_whole_rows); | ||||
| #endif | ||||
|  | ||||
|         index[i]=mad24(s_y, src_step_in_pixel, s_x); | ||||
|     } | ||||
|     // read pixels from src | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|         temp[i] = src[index[i]]; | ||||
| #endif// BORDER_CONSTANT | ||||
|  | ||||
|     //save pixels to lds | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; | ||||
|     barrier(CLK_LOCAL_MEM_FENCE); | ||||
|  | ||||
|     // read pixels from lds and calculate the result | ||||
|     sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX]; | ||||
|     for (i=1; i<=RADIUSX; i++) | ||||
|     { | ||||
|         temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; | ||||
|         temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; | ||||
|         sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i]; | ||||
|     } | ||||
|  | ||||
|     // write the result to dst | ||||
|     if (x<dst_cols && y<dst_rows) | ||||
|     { | ||||
|         start_addr = mad24(y,dst_step_in_pixel,x); | ||||
|         dst[start_addr] = sum; | ||||
|     } | ||||
| } | ||||
|  | ||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5 | ||||
|     (__global float4 * restrict src, | ||||
|      int src_step_in_pixel, | ||||
|      int src_offset_x, int src_offset_y, | ||||
|      int src_cols, int src_rows, | ||||
|      int src_whole_cols, int src_whole_rows, | ||||
|      __global float4 * dst, | ||||
|      int dst_step_in_pixel, | ||||
|      int dst_cols, int dst_rows, | ||||
|      int radiusy) | ||||
| { | ||||
|     int x = get_global_id(0); | ||||
|     int y = get_global_id(1); | ||||
|     int l_x = get_local_id(0); | ||||
|     int l_y = get_local_id(1); | ||||
|     int start_x = x+src_offset_x-RADIUSX; | ||||
|     int start_y = y+src_offset_y-radiusy; | ||||
|     int start_addr = mad24(start_y,src_step_in_pixel,start_x); | ||||
|     int i; | ||||
|     float4 sum; | ||||
|     float4 temp[READ_TIMES_ROW]; | ||||
|  | ||||
|     __local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; | ||||
| #ifdef BORDER_CONSTANT | ||||
|     int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); | ||||
|  | ||||
|     // read pixels from src | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         int current_addr = start_addr+i*LSIZE0; | ||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; | ||||
|         temp[i] = src[current_addr]; | ||||
|     } | ||||
|  | ||||
|     // judge if read out of boundary | ||||
| #ifdef BORDER_ISOLATED | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float4)0,temp[i]); | ||||
|         temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float4)0,temp[i]); | ||||
|     } | ||||
| #else | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float4)0,temp[i]); | ||||
|         temp[i]= ELEM(start_y,          0, src_whole_rows, (float4)0,temp[i]); | ||||
|     } | ||||
| #endif | ||||
| #else | ||||
|     int index[READ_TIMES_ROW]; | ||||
|     int s_x,s_y; | ||||
|  | ||||
|     // judge if read out of boundary | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|     { | ||||
|         s_x = start_x + i*LSIZE0, s_y = start_y; | ||||
| #ifdef BORDER_ISOLATED | ||||
|         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols); | ||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); | ||||
| #else | ||||
|         EXTRAPOLATE(s_x, 0, src_whole_cols); | ||||
|         EXTRAPOLATE(s_y, 0, src_whole_rows); | ||||
| #endif | ||||
|  | ||||
|         index[i]=mad24(s_y,src_step_in_pixel,s_x); | ||||
|     } | ||||
|     // read pixels from src | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|         temp[i] = src[index[i]]; | ||||
| #endif | ||||
|  | ||||
|     // save pixels to lds | ||||
|     for (i = 0; i<READ_TIMES_ROW; i++) | ||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; | ||||
|     barrier(CLK_LOCAL_MEM_FENCE); | ||||
|  | ||||
|     // read pixels from lds and calculate the result | ||||
|     sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX]; | ||||
|     for (i=1; i<=RADIUSX; i++) | ||||
|     { | ||||
|         temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; | ||||
|         temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; | ||||
|         sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i]; | ||||
|     } | ||||
|  | ||||
|     // write the result to dst | ||||
|     if (x<dst_cols && y<dst_rows) | ||||
|     { | ||||
|         start_addr = mad24(y,dst_step_in_pixel,x); | ||||
|         dst[start_addr] = sum; | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -75,6 +75,7 @@ | ||||
| #endif | ||||
|  | ||||
| #define SRC(_x,_y) convertToWT(((global srcT*)(Src+(_y)*src_step))[_x]) | ||||
| #define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x]) | ||||
|  | ||||
| #ifdef BORDER_CONSTANT | ||||
| // CCCCCC|abcdefgh|CCCCCCC | ||||
| @@ -83,8 +84,6 @@ | ||||
| #define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) | ||||
| #endif | ||||
|  | ||||
| #define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x]) | ||||
|  | ||||
| #define noconvert | ||||
|  | ||||
| // horizontal and vertical filter kernels | ||||
| @@ -108,8 +107,8 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int | ||||
|     int lix = get_local_id(0); | ||||
|     int liy = get_local_id(1); | ||||
|  | ||||
|     int x = (int)get_global_id(0); | ||||
|     int y = (int)get_global_id(1); | ||||
|     int x = get_global_id(0); | ||||
|     int y = get_global_id(1); | ||||
|  | ||||
|     // calculate pixel position in source image taking image offset into account | ||||
|     int srcX = x + srcOffsetX - RADIUSX; | ||||
|   | ||||
| @@ -79,12 +79,14 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) | ||||
|             ksize.width++; | ||||
|         if (1 != (ksize.height % 2)) | ||||
|             ksize.height++; | ||||
|  | ||||
|         Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); | ||||
|         cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1); | ||||
|         temp = randomMat(Size(1, ksize.height),  CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); | ||||
|         cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1); | ||||
|  | ||||
|         Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE); | ||||
|         Size roiSize = randomSize(ksize.width + 16, MAX_VALUE, ksize.height + 20, MAX_VALUE); | ||||
|         std::cout << roiSize << std::endl; | ||||
|         int rest = roiSize.width % 4; | ||||
|         if (0 != rest) | ||||
|             roiSize.width += (4 - rest); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Ilya Lavrenov
					Ilya Lavrenov