simplified cv::sepFilter2D OpenCL part
This commit is contained in:
		| @@ -41,6 +41,7 @@ | |||||||
| //M*/ | //M*/ | ||||||
|  |  | ||||||
| #include "precomp.hpp" | #include "precomp.hpp" | ||||||
|  | #define CV_OPENCL_RUN_ASSERT | ||||||
| #include "opencl_kernels.hpp" | #include "opencl_kernels.hpp" | ||||||
| #include <sstream> | #include <sstream> | ||||||
|  |  | ||||||
| @@ -3317,11 +3318,9 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, | |||||||
|     return kernel.run(2, globalsize, localsize, true); |     return kernel.run(2, globalsize, localsize, true); | ||||||
| } | } | ||||||
|  |  | ||||||
| static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync) | static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType) | ||||||
| { | { | ||||||
|     int type = src.type(); |     int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); | ||||||
|     int cn = CV_MAT_CN(type); |  | ||||||
|     int sdepth = CV_MAT_DEPTH(type); |  | ||||||
|     Size bufSize = buf.size(); |     Size bufSize = buf.size(); | ||||||
|  |  | ||||||
| #ifdef ANDROID | #ifdef ANDROID | ||||||
| @@ -3329,27 +3328,14 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, | |||||||
| #else | #else | ||||||
|     size_t localsize[2] = {16, 16}; |     size_t localsize[2] = {16, 16}; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|     size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; |     size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; | ||||||
|     if (CV_8U == sdepth) |     if (type == CV_8UC1) | ||||||
|     { |         globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; | ||||||
|         switch (cn) |  | ||||||
|         { |  | ||||||
|         case 1: |  | ||||||
|             globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; |  | ||||||
|             break; |  | ||||||
|         case 2: |  | ||||||
|             globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0]; |  | ||||||
|             break; |  | ||||||
|         case 4: |  | ||||||
|             globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0]; |  | ||||||
|             break; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     int radiusX = anchor; |     int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; | ||||||
|     int radiusY = (int)((buf.rows - src.rows) >> 1); |  | ||||||
|  |  | ||||||
|     bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; |     bool isolated = (borderType & BORDER_ISOLATED) != 0; | ||||||
|     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, |     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, | ||||||
|         * const btype = borderMap[borderType & ~BORDER_ISOLATED]; |         * const btype = borderMap[borderType & ~BORDER_ISOLATED]; | ||||||
|  |  | ||||||
| @@ -3358,49 +3344,38 @@ static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, | |||||||
|     extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; |     extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; | ||||||
|     extra_extrapolation |= src.cols < radiusX; |     extra_extrapolation |= src.cols < radiusX; | ||||||
|  |  | ||||||
|     cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s", |     char cvt[40]; | ||||||
|         radiusX, (int)localsize[0], (int)localsize[1], cn, |     cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" | ||||||
|         btype, |                                           " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s", | ||||||
|         extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", |                                           radiusX, (int)localsize[0], (int)localsize[1], cn, btype, | ||||||
|         isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); |                                           extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", | ||||||
|  |                                           isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", | ||||||
|  |                                           ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), | ||||||
|  |                                           ocl::convertTypeStr(sdepth, CV_32F, cn, cvt), | ||||||
|  |                                           ocl::typeToStr(sdepth), ocl::typeToStr(CV_32F)); | ||||||
|     build_options += ocl::kernelToStr(kernelX, CV_32F); |     build_options += ocl::kernelToStr(kernelX, CV_32F); | ||||||
|  |  | ||||||
|     Size srcWholeSize; Point srcOffset; |     Size srcWholeSize; Point srcOffset; | ||||||
|     src.locateROI(srcWholeSize, srcOffset); |     src.locateROI(srcWholeSize, srcOffset); | ||||||
|  |  | ||||||
|     std::stringstream strKernel; |     String kernelName("row_filter"); | ||||||
|     strKernel << "row_filter"; |     if (type == CV_8UC1) | ||||||
|     if (-1 != cn) |         kernelName += "_C1_D0"; | ||||||
|         strKernel << "_C" << cn; |  | ||||||
|     if (-1 != sdepth) |  | ||||||
|         strKernel << "_D" << sdepth; |  | ||||||
|  |  | ||||||
|     ocl::Kernel kernelRow; |     ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, | ||||||
|     if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, |                   build_options); | ||||||
|                           build_options)) |     if (k.empty()) | ||||||
|         return false; |         return false; | ||||||
|  |  | ||||||
|     int idxArg = 0; |     k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, | ||||||
|     idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src)); |            srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, | ||||||
|     idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize())); |            ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), | ||||||
|  |            buf.cols, buf.rows, radiusY); | ||||||
|  |  | ||||||
|     idxArg = kernelRow.set(idxArg, srcOffset.x); |     return k.run(2, globalsize, localsize, false); | ||||||
|     idxArg = kernelRow.set(idxArg, srcOffset.y); |  | ||||||
|     idxArg = kernelRow.set(idxArg, src.cols); |  | ||||||
|     idxArg = kernelRow.set(idxArg, src.rows); |  | ||||||
|     idxArg = kernelRow.set(idxArg, srcWholeSize.width); |  | ||||||
|     idxArg = kernelRow.set(idxArg, srcWholeSize.height); |  | ||||||
|  |  | ||||||
|     idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf)); |  | ||||||
|     idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize())); |  | ||||||
|     idxArg = kernelRow.set(idxArg, buf.cols); |  | ||||||
|     idxArg = kernelRow.set(idxArg, buf.rows); |  | ||||||
|     idxArg = kernelRow.set(idxArg, radiusY); |  | ||||||
|  |  | ||||||
|     return kernelRow.run(2, globalsize, localsize, sync); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync) | static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anchor) | ||||||
| { | { | ||||||
| #ifdef ANDROID | #ifdef ANDROID | ||||||
|     size_t localsize[2] = {16, 10}; |     size_t localsize[2] = {16, 10}; | ||||||
| @@ -3420,28 +3395,23 @@ static bool ocl_sepColFilter2D(const UMat &buf, UMat &dst, Mat &kernelY, int anc | |||||||
|         globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; |         globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; | ||||||
|  |  | ||||||
|     char cvt[40]; |     char cvt[40]; | ||||||
|     cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s", |     cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" | ||||||
|                     anchor, (int)localsize[0], (int)localsize[1], cn, ocl::typeToStr(buf.type()), |                                           " -D srcT=%s -D dstT=%s -D convertToDstT=%s", | ||||||
|                                           ocl::typeToStr(dtype), ocl::convertTypeStr(CV_32F, ddepth, cn, cvt)); |                                           anchor, (int)localsize[0], (int)localsize[1], cn, | ||||||
|  |                                           ocl::typeToStr(buf.type()), ocl::typeToStr(dtype), | ||||||
|  |                                           ocl::convertTypeStr(CV_32F, ddepth, cn, cvt)); | ||||||
|     build_options += ocl::kernelToStr(kernelY, CV_32F); |     build_options += ocl::kernelToStr(kernelY, CV_32F); | ||||||
|  |  | ||||||
|     ocl::Kernel kernelCol; |     ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, | ||||||
|     if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options)) |                   build_options); | ||||||
|  |     if (k.empty()) | ||||||
|         return false; |         return false; | ||||||
|  |  | ||||||
|     int idxArg = 0; |     k.args(ocl::KernelArg::PtrReadOnly(buf), (int)(buf.step / buf.elemSize()), buf.cols, | ||||||
|     idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf)); |            buf.rows, ocl::KernelArg::PtrWriteOnly(dst), (int)(dst.offset / dst.elemSize()), | ||||||
|     idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize())); |            (int)(dst.step / dst.elemSize()), dst.cols, dst.rows); | ||||||
|     idxArg = kernelCol.set(idxArg, buf.cols); |  | ||||||
|     idxArg = kernelCol.set(idxArg, buf.rows); |  | ||||||
|  |  | ||||||
|     idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); |     return k.run(2, globalsize, localsize, false); | ||||||
|     idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize())); |  | ||||||
|     idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize())); |  | ||||||
|     idxArg = kernelCol.set(idxArg, dst.cols); |  | ||||||
|     idxArg = kernelCol.set(idxArg, dst.rows); |  | ||||||
|  |  | ||||||
|     return kernelCol.run(2, globalsize, localsize, sync); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| const int optimizedSepFilterLocalSize = 16; | const int optimizedSepFilterLocalSize = 16; | ||||||
| @@ -3473,12 +3443,14 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, | |||||||
|  |  | ||||||
|     String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" |     String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" | ||||||
|                              " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" |                              " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" | ||||||
|                              " -D %s", (int)lt2[0], (int)lt2[1], _row_kernel.size().height / 2, _col_kernel.size().height / 2, |                              " -D %s -D srcT1=%s -D dstT1=%s -D cn=%d", (int)lt2[0], (int)lt2[1], | ||||||
|  |                              _row_kernel.size().height / 2, _col_kernel.size().height / 2, | ||||||
|                              ocl::kernelToStr(_row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), |                              ocl::kernelToStr(_row_kernel, CV_32F, "KERNEL_MATRIX_X").c_str(), | ||||||
|                              ocl::kernelToStr(_col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), |                              ocl::kernelToStr(_col_kernel, CV_32F, "KERNEL_MATRIX_Y").c_str(), | ||||||
|                              ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), |                              ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), | ||||||
|                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), |                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), | ||||||
|                              ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType]); |                              ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], | ||||||
|  |                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); | ||||||
|  |  | ||||||
|     ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); |     ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); | ||||||
|     if (k.empty()) |     if (k.empty()) | ||||||
| @@ -3529,10 +3501,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, | |||||||
|     if (ddepth < 0) |     if (ddepth < 0) | ||||||
|         ddepth = sdepth; |         ddepth = sdepth; | ||||||
|  |  | ||||||
|     CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && | //    printf("%d %d\n", imgSize.width, optimizedSepFilterLocalSize + (kernelX.rows >> 1)); | ||||||
|         imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && | //    printf("%d %d\n", imgSize.height, optimizedSepFilterLocalSize + (kernelY.rows >> 1)); | ||||||
|         imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1), |  | ||||||
|         ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true) | //    CV_OCL_RUN_(kernelY.rows <= 21 && kernelX.rows <= 21 && | ||||||
|  | //        imgSize.width > optimizedSepFilterLocalSize + (kernelX.rows >> 1) && | ||||||
|  | //        imgSize.height > optimizedSepFilterLocalSize + (kernelY.rows >> 1), | ||||||
|  | //        ocl_sepFilter2D_SinglePass(_src, _dst, _kernelX, _kernelY, borderType, ddepth), true) | ||||||
|  |  | ||||||
|     UMat src = _src.getUMat(); |     UMat src = _src.getUMat(); | ||||||
|     Size srcWholeSize; Point srcOffset; |     Size srcWholeSize; Point srcOffset; | ||||||
| @@ -3546,12 +3521,12 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, | |||||||
|     Size srcSize = src.size(); |     Size srcSize = src.size(); | ||||||
|     Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); |     Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); | ||||||
|     UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn)); |     UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn)); | ||||||
|     if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false)) |     if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType)) | ||||||
|         return false; |         return false; | ||||||
|  |  | ||||||
|     _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); |     _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); | ||||||
|     UMat dst = _dst.getUMat(); |     UMat dst = _dst.getUMat(); | ||||||
|     return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false); |     return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y); | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -36,16 +36,6 @@ | |||||||
|  |  | ||||||
| #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) | #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) | ||||||
| #define RADIUS 1 | #define RADIUS 1 | ||||||
| #if CN ==1 |  | ||||||
| #define ALIGN (((RADIUS)+3)>>2<<2) |  | ||||||
| #elif CN==2 |  | ||||||
| #define ALIGN (((RADIUS)+1)>>1<<1) |  | ||||||
| #elif CN==3 |  | ||||||
| #define ALIGN (((RADIUS)+3)>>2<<2) |  | ||||||
| #elif CN==4 |  | ||||||
| #define ALIGN (RADIUS) |  | ||||||
| #define READ_TIMES_ROW ((2*(RADIUS+LSIZE0)-1)/LSIZE0) |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #define noconvert | #define noconvert | ||||||
|  |  | ||||||
| @@ -65,16 +55,8 @@ The info above maybe obsolete. | |||||||
| #define DIG(a) a, | #define DIG(a) a, | ||||||
| __constant float mat_kernel[] = { COEFF }; | __constant float mat_kernel[] = { COEFF }; | ||||||
|  |  | ||||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter | __kernel void col_filter(__global const srcT * src, int src_step_in_pixel, int src_whole_cols, int src_whole_rows, | ||||||
|                         (__global const GENTYPE_SRC * restrict src, |                          __global dstT * dst, int dst_offset_in_pixel, int dst_step_in_pixel, int dst_cols, int dst_rows) | ||||||
|                          const int src_step_in_pixel, |  | ||||||
|                          const int src_whole_cols, |  | ||||||
|                          const int src_whole_rows, |  | ||||||
|                          __global GENTYPE_DST * dst, |  | ||||||
|                          const int dst_offset_in_pixel, |  | ||||||
|                          const int dst_step_in_pixel, |  | ||||||
|                          const int dst_cols, |  | ||||||
|                          const int dst_rows) |  | ||||||
| { | { | ||||||
|     int x = get_global_id(0); |     int x = get_global_id(0); | ||||||
|     int y = get_global_id(1); |     int y = get_global_id(1); | ||||||
| @@ -85,35 +67,35 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter | |||||||
|     int start_addr = mad24(y, src_step_in_pixel, x); |     int start_addr = mad24(y, src_step_in_pixel, x); | ||||||
|     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); |     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); | ||||||
|  |  | ||||||
|     int i; |     srcT sum, temp[READ_TIMES_COL]; | ||||||
|     GENTYPE_SRC sum, temp[READ_TIMES_COL]; |     __local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; | ||||||
|     __local GENTYPE_SRC LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1]; |  | ||||||
|  |  | ||||||
|     //read pixels from src |     // read pixels from src | ||||||
|     for(i = 0;i<READ_TIMES_COL;i++) |     for (int i = 0; i < READ_TIMES_COL; ++i) | ||||||
|     { |     { | ||||||
|         int current_addr = start_addr+i*LSIZE1*src_step_in_pixel; |         int current_addr = mad24(i, LSIZE1 * src_step_in_pixel, start_addr); | ||||||
|         current_addr = current_addr < end_addr ? current_addr : 0; |         current_addr = current_addr < end_addr ? current_addr : 0; | ||||||
|         temp[i] = src[current_addr]; |         temp[i] = src[current_addr]; | ||||||
|     } |     } | ||||||
|     //save pixels to lds |  | ||||||
|     for(i = 0;i<READ_TIMES_COL;i++) |     // save pixels to lds | ||||||
|     { |     for (int i = 0; i < READ_TIMES_COL; ++i) | ||||||
|         LDS_DAT[l_y+i*LSIZE1][l_x] = temp[i]; |         LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i]; | ||||||
|     } |  | ||||||
|     barrier(CLK_LOCAL_MEM_FENCE); |     barrier(CLK_LOCAL_MEM_FENCE); | ||||||
|     //read pixels from lds and calculate the result |  | ||||||
|     sum = LDS_DAT[l_y+RADIUSY][l_x]*mat_kernel[RADIUSY]; |     // read pixels from lds and calculate the result | ||||||
|     for(i=1;i<=RADIUSY;i++) |     sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY]; | ||||||
|  |     for (int i = 1; i <= RADIUSY; ++i) | ||||||
|     { |     { | ||||||
|         temp[0]=LDS_DAT[l_y+RADIUSY-i][l_x]; |         temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; | ||||||
|         temp[1]=LDS_DAT[l_y+RADIUSY+i][l_x]; |         temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; | ||||||
|         sum += temp[0] * mat_kernel[RADIUSY-i]+temp[1] * mat_kernel[RADIUSY+i]; |         sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); | ||||||
|     } |     } | ||||||
|     //write the result to dst |  | ||||||
|     if((x<dst_cols) & (y<dst_rows)) |     // write the result to dst | ||||||
|  |     if (x < dst_cols && y < dst_rows) | ||||||
|     { |     { | ||||||
|         start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel); |         start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel); | ||||||
|         dst[start_addr] = convert_to_DST(sum); |         dst[start_addr] = convertToDstT(sum); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -35,40 +35,28 @@ | |||||||
| // | // | ||||||
|  |  | ||||||
| #define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only | #define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0) //for c4 only | ||||||
| #define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1) |  | ||||||
| //#pragma OPENCL EXTENSION cl_amd_printf : enable |  | ||||||
| #define RADIUS 1 | #define RADIUS 1 | ||||||
| #if CN ==1 |  | ||||||
| #define ALIGN (((RADIUS)+3)>>2<<2) |  | ||||||
| #elif CN==2 |  | ||||||
| #define ALIGN (((RADIUS)+1)>>1<<1) |  | ||||||
| #elif CN==3 |  | ||||||
| #define ALIGN (((RADIUS)+3)>>2<<2) |  | ||||||
| #elif CN==4 |  | ||||||
| #define ALIGN (RADIUS) |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #ifdef BORDER_REPLICATE | #ifdef BORDER_REPLICATE | ||||||
| //BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh | // BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh | ||||||
| #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i)) | #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i)) | ||||||
| #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr)) | #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr)) | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #ifdef BORDER_REFLECT | #ifdef BORDER_REFLECT | ||||||
| //BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb | // BORDER_REFLECT: fedcba|abcdefgh|hgfedcb | ||||||
| #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i)) | #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i)) | ||||||
| #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) | #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #ifdef BORDER_REFLECT_101 | #ifdef BORDER_REFLECT_101 | ||||||
| //BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba | // BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba | ||||||
| #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i)) | #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i)) | ||||||
| #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) | #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| //blur function does not support BORDER_WRAP |  | ||||||
| #ifdef BORDER_WRAP | #ifdef BORDER_WRAP | ||||||
| //BORDER_WRAP:          cdefgh|abcdefgh|abcdefg | // BORDER_WRAP: cdefgh|abcdefgh|abcdefg | ||||||
| #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i)) | #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i)) | ||||||
| #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) | #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) | ||||||
| #endif | #endif | ||||||
| @@ -127,65 +115,56 @@ | |||||||
|     #endif //BORDER_CONSTANT |     #endif //BORDER_CONSTANT | ||||||
| #endif //EXTRA_EXTRAPOLATION | #endif //EXTRA_EXTRAPOLATION | ||||||
|  |  | ||||||
| /********************************************************************************** | #define noconvert | ||||||
| These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur. |  | ||||||
| Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle | #if cn != 3 | ||||||
| kernel must be in the center. ROI is not supported either. | #define loadpix(addr) *(__global const srcT *)(addr) | ||||||
| For channels =1,2,4, each kernels read 4 elements(not 4 pixels), and for channels =3, | #define storepix(val, addr)  *(__global dstT *)(addr) = val | ||||||
| the kernel read 4 pixels, save them to LDS and read the data needed from LDS to | #define SRCSIZE ((int)sizeof(srcT)) | ||||||
| calculate the result. | #define DSTSIZE ((int)sizeof(dstT)) | ||||||
| The length of the convovle kernel supported is related to the LSIZE0 and the MAX size | #else | ||||||
| of LDS, which is HW related. | #define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr)) | ||||||
| For channels = 1,3 the RADIUS is no more than LSIZE0*2 | #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) | ||||||
| For channels = 2, the RADIUS is no more than LSIZE0 | #define SRCSIZE ((int)sizeof(srcT1)*3) | ||||||
| For channels = 4, arbitary RADIUS is supported unless the LDS is not enough | #define DSTSIZE ((int)sizeof(dstT1)*3) | ||||||
| Niko | #endif | ||||||
| 6/29/2011 |  | ||||||
| The info above maybe obsolete. |  | ||||||
| ***********************************************************************************/ |  | ||||||
|  |  | ||||||
| #define DIG(a) a, | #define DIG(a) a, | ||||||
| __constant float mat_kernel[] = { COEFF }; | __constant float mat_kernel[] = { COEFF }; | ||||||
|  |  | ||||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0 | __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, | ||||||
|     (__global uchar * restrict src, |                                int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, | ||||||
|      int src_step_in_pixel, |                                __global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, | ||||||
|      int src_offset_x, int src_offset_y, |                                int radiusy) | ||||||
|      int src_cols, int src_rows, |  | ||||||
|      int src_whole_cols, int src_whole_rows, |  | ||||||
|      __global float * dst, |  | ||||||
|      int dst_step_in_pixel, |  | ||||||
|      int dst_cols, int dst_rows, |  | ||||||
|      int radiusy) |  | ||||||
| { | { | ||||||
|     int x = get_global_id(0)<<2; |     int x = get_global_id(0)<<2; | ||||||
|     int y = get_global_id(1); |     int y = get_global_id(1); | ||||||
|     int l_x = get_local_id(0); |     int l_x = get_local_id(0); | ||||||
|     int l_y = get_local_id(1); |     int l_y = get_local_id(1); | ||||||
|  |  | ||||||
|     int start_x = x+src_offset_x - RADIUSX & 0xfffffffc; |     int start_x = x + src_offset_x - RADIUSX & 0xfffffffc; | ||||||
|     int offset = src_offset_x - RADIUSX & 3; |     int offset = src_offset_x - RADIUSX & 3; | ||||||
|     int start_y = y + src_offset_y - radiusy; |     int start_y = y + src_offset_y - radiusy; | ||||||
|     int start_addr = mad24(start_y, src_step_in_pixel, start_x); |     int start_addr = mad24(start_y, src_step_in_pixel, start_x); | ||||||
|     int i; |  | ||||||
|     float4 sum; |     float4 sum; | ||||||
|     uchar4 temp[READ_TIMES_ROW]; |     uchar4 temp[READ_TIMES_ROW]; | ||||||
|  |  | ||||||
|     __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; |     __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; | ||||||
| #ifdef BORDER_CONSTANT | #ifdef BORDER_CONSTANT | ||||||
|     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); |     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); | ||||||
|  |  | ||||||
|     // read pixels from src |     // read pixels from src | ||||||
|     for (i = 0; i < READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|     { |     { | ||||||
|         int current_addr = start_addr+i*LSIZE0*4; |         int current_addr = mad24(i, LSIZE0 << 2, start_addr); | ||||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; |         current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0; | ||||||
|         temp[i] = *(__global uchar4*)&src[current_addr]; |         temp[i] = *(__global const uchar4 *)&src[current_addr]; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // judge if read out of boundary |     // judge if read out of boundary | ||||||
| #ifdef BORDER_ISOLATED | #ifdef BORDER_ISOLATED | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|     { |     { | ||||||
|         temp[i].x = ELEM(start_x+i*LSIZE0*4,   src_offset_x, src_offset_x + src_cols, 0,         temp[i].x); |         temp[i].x = ELEM(start_x+i*LSIZE0*4,   src_offset_x, src_offset_x + src_cols, 0,         temp[i].x); | ||||||
|         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0,         temp[i].y); |         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0,         temp[i].y); | ||||||
| @@ -194,7 +173,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | |||||||
|         temp[i]   = ELEM(start_y,              src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]); |         temp[i]   = ELEM(start_y,              src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]); | ||||||
|     } |     } | ||||||
| #else | #else | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|     { |     { | ||||||
|         temp[i].x = ELEM(start_x+i*LSIZE0*4,   0, src_whole_cols, 0,         temp[i].x); |         temp[i].x = ELEM(start_x+i*LSIZE0*4,   0, src_whole_cols, 0,         temp[i].x); | ||||||
|         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0,         temp[i].y); |         temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0,         temp[i].y); | ||||||
| @@ -209,16 +188,15 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | |||||||
| #else | #else | ||||||
|     int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); |     int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows); | ||||||
| #endif | #endif | ||||||
|     int4 index[READ_TIMES_ROW]; |     int4 index[READ_TIMES_ROW], addr; | ||||||
|     int4 addr; |  | ||||||
|     int s_y; |     int s_y; | ||||||
|  |  | ||||||
|     if (not_all_in_range) |     if (not_all_in_range) | ||||||
|     { |     { | ||||||
|         // judge if read out of boundary |         // judge if read out of boundary | ||||||
|         for (i = 0; i < READ_TIMES_ROW; i++) |         for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|         { |         { | ||||||
|             index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3); |             index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3); | ||||||
| #ifdef BORDER_ISOLATED | #ifdef BORDER_ISOLATED | ||||||
|             EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols); |             EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols); | ||||||
|             EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols); |             EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols); | ||||||
| @@ -231,6 +209,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | |||||||
|             EXTRAPOLATE(index[i].w, 0, src_whole_cols); |             EXTRAPOLATE(index[i].w, 0, src_whole_cols); | ||||||
| #endif | #endif | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         s_y = start_y; |         s_y = start_y; | ||||||
| #ifdef BORDER_ISOLATED | #ifdef BORDER_ISOLATED | ||||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); |         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); | ||||||
| @@ -239,9 +218,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
|         // read pixels from src |         // read pixels from src | ||||||
|         for (i = 0; i<READ_TIMES_ROW; i++) |         for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|         { |         { | ||||||
|             addr = mad24((int4)s_y,(int4)src_step_in_pixel,index[i]); |             addr = mad24((int4)s_y, (int4)src_step_in_pixel, index[i]); | ||||||
|             temp[i].x = src[addr.x]; |             temp[i].x = src[addr.x]; | ||||||
|             temp[i].y = src[addr.y]; |             temp[i].y = src[addr.y]; | ||||||
|             temp[i].z = src[addr.z]; |             temp[i].z = src[addr.z]; | ||||||
| @@ -251,26 +230,26 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | |||||||
|     else |     else | ||||||
|     { |     { | ||||||
|         // read pixels from src |         // read pixels from src | ||||||
|         for (i = 0; i<READ_TIMES_ROW; i++) |         for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|             temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4]; |             temp[i] = *(__global uchar4*)&src[mad24(i, LSIZE0 << 2, start_addr)]; | ||||||
|     } |     } | ||||||
| #endif //BORDER_CONSTANT | #endif //BORDER_CONSTANT | ||||||
|  |  | ||||||
|     // save pixels to lds |     // save pixels to lds | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; |         LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i]; | ||||||
|     barrier(CLK_LOCAL_MEM_FENCE); |     barrier(CLK_LOCAL_MEM_FENCE); | ||||||
|  |  | ||||||
|     // read pixels from lds and calculate the result |     // read pixels from lds and calculate the result | ||||||
|     sum =convert_float4(vload4(0,(__local uchar*)&LDS_DAT[l_y][l_x]+RADIUSX+offset))*mat_kernel[RADIUSX]; |     sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX]; | ||||||
|     for (i=1; i<=RADIUSX; i++) |     for (int i = 1; i <= RADIUSX; ++i) | ||||||
|     { |     { | ||||||
|         temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); |         temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); | ||||||
|         temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); |         temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); | ||||||
|         sum += convert_float4(temp[0]) * mat_kernel[RADIUSX-i] + convert_float4(temp[1]) * mat_kernel[RADIUSX+i]; |         sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     start_addr = mad24(y,dst_step_in_pixel,x); |     start_addr = mad24(y, dst_step_in_pixel, x); | ||||||
|  |  | ||||||
|     // write the result to dst |     // write the result to dst | ||||||
|     if ((x+3<dst_cols) & (y<dst_rows)) |     if ((x+3<dst_cols) & (y<dst_rows)) | ||||||
| @@ -290,63 +269,58 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | |||||||
|         dst[start_addr] = sum.x; |         dst[start_addr] = sum.x; | ||||||
| } | } | ||||||
|  |  | ||||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0 | __kernel void row_filter(__global const srcT * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, | ||||||
|     (__global uchar4 * restrict src, |                          int src_cols, int src_rows, int src_whole_cols, int src_whole_rows, | ||||||
|      int src_step_in_pixel, |                          __global dstT * dst, int dst_step_in_pixel, int dst_cols, int dst_rows, | ||||||
|      int src_offset_x, int src_offset_y, |                          int radiusy) | ||||||
|      int src_cols, int src_rows, |  | ||||||
|      int src_whole_cols, int src_whole_rows, |  | ||||||
|      __global float4 * dst, |  | ||||||
|      int dst_step_in_pixel, |  | ||||||
|      int dst_cols, int dst_rows, |  | ||||||
|      int radiusy) |  | ||||||
| { | { | ||||||
|     int x = get_global_id(0); |     int x = get_global_id(0); | ||||||
|     int y = get_global_id(1); |     int y = get_global_id(1); | ||||||
|     int l_x = get_local_id(0); |     int l_x = get_local_id(0); | ||||||
|     int l_y = get_local_id(1); |     int l_y = get_local_id(1); | ||||||
|     int start_x = x+src_offset_x-RADIUSX; |     int start_x = x + src_offset_x - RADIUSX; | ||||||
|     int start_y = y+src_offset_y-radiusy; |     int start_y = y + src_offset_y - radiusy; | ||||||
|     int start_addr = mad24(start_y,src_step_in_pixel,start_x); |     int start_addr = mad24(start_y, src_step_in_pixel, start_x); | ||||||
|     int i; |  | ||||||
|     float4 sum; |  | ||||||
|     uchar4 temp[READ_TIMES_ROW]; |  | ||||||
|  |  | ||||||
|     __local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; |     dstT sum; | ||||||
|  |     srcT temp[READ_TIMES_ROW]; | ||||||
|  |  | ||||||
|  |     __local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1]; | ||||||
| #ifdef BORDER_CONSTANT | #ifdef BORDER_CONSTANT | ||||||
|     int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); |     int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols); | ||||||
|  |  | ||||||
|     // read pixels from src |     // read pixels from src | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; i++) | ||||||
|     { |     { | ||||||
|         int current_addr = start_addr+i*LSIZE0; |         int current_addr = mad24(i, LSIZE0, start_addr); | ||||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; |         current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0; | ||||||
|         temp[i] = src[current_addr]; |         temp[i] = src[current_addr]; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     //judge if read out of boundary |     // judge if read out of boundary | ||||||
| #ifdef BORDER_ISOLATED | #ifdef BORDER_ISOLATED | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|     { |     { | ||||||
|         temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (uchar4)0, temp[i]); |         temp[i] = ELEM(mad24(i, LSIZE0, start_x), src_offset_x, src_offset_x + src_cols, (srcT)(0), temp[i]); | ||||||
|         temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]); |         temp[i] = ELEM(start_y,                   src_offset_y, src_offset_y + src_rows, (srcT)(0), temp[i]); | ||||||
|     } |     } | ||||||
| #else | #else | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|     { |     { | ||||||
|         temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (uchar4)0, temp[i]); |         temp[i] = ELEM(mad24(i, LSIZE0, start_x), 0, src_whole_cols, (srcT)(0), temp[i]); | ||||||
|         temp[i]= ELEM(start_y,          0, src_whole_rows, (uchar4)0, temp[i]); |         temp[i] = ELEM(start_y,                   0, src_whole_rows, (srcT)(0), temp[i]); | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
| #else | #else | ||||||
|     int index[READ_TIMES_ROW]; |     int index[READ_TIMES_ROW]; | ||||||
|     int s_x,s_y; |     int s_x, s_y; | ||||||
|  |  | ||||||
|     // judge if read out of boundary |     // judge if read out of boundary | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|     { |     { | ||||||
|         s_x = start_x+i*LSIZE0; |         s_x = mad24(i, LSIZE0, start_x); | ||||||
|         s_y = start_y; |         s_y = start_y; | ||||||
|  |  | ||||||
| #ifdef BORDER_ISOLATED | #ifdef BORDER_ISOLATED | ||||||
|         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols); |         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols); | ||||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); |         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); | ||||||
| @@ -354,216 +328,31 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ | |||||||
|         EXTRAPOLATE(s_x, 0, src_whole_cols); |         EXTRAPOLATE(s_x, 0, src_whole_cols); | ||||||
|         EXTRAPOLATE(s_y, 0, src_whole_rows); |         EXTRAPOLATE(s_y, 0, src_whole_rows); | ||||||
| #endif | #endif | ||||||
|         index[i]=mad24(s_y, src_step_in_pixel, s_x); |         index[i] = mad24(s_y, src_step_in_pixel, s_x); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     //read pixels from src |     // read pixels from src | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|         temp[i] = src[index[i]]; |         temp[i] = src[index[i]]; | ||||||
| #endif //BORDER_CONSTANT | #endif // BORDER_CONSTANT | ||||||
|  |  | ||||||
|     //save pixels to lds |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; |  | ||||||
|     barrier(CLK_LOCAL_MEM_FENCE); |  | ||||||
|  |  | ||||||
|     //read pixels from lds and calculate the result |  | ||||||
|     sum =convert_float4(LDS_DAT[l_y][l_x+RADIUSX])*mat_kernel[RADIUSX]; |  | ||||||
|     for (i=1; i<=RADIUSX; i++) |  | ||||||
|     { |  | ||||||
|         temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; |  | ||||||
|         temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; |  | ||||||
|         sum += convert_float4(temp[0])*mat_kernel[RADIUSX-i]+convert_float4(temp[1])*mat_kernel[RADIUSX+i]; |  | ||||||
|     } |  | ||||||
|     //write the result to dst |  | ||||||
|     if (x<dst_cols && y<dst_rows) |  | ||||||
|     { |  | ||||||
|         start_addr = mad24(y,dst_step_in_pixel,x); |  | ||||||
|         dst[start_addr] = sum; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5 |  | ||||||
|     (__global float * restrict src, |  | ||||||
|      int src_step_in_pixel, |  | ||||||
|      int src_offset_x, int src_offset_y, |  | ||||||
|      int src_cols, int src_rows, |  | ||||||
|      int src_whole_cols, int src_whole_rows, |  | ||||||
|      __global float * dst, |  | ||||||
|      int dst_step_in_pixel, |  | ||||||
|      int dst_cols, int dst_rows, |  | ||||||
|      int radiusy) |  | ||||||
| { |  | ||||||
|     int x = get_global_id(0); |  | ||||||
|     int y = get_global_id(1); |  | ||||||
|     int l_x = get_local_id(0); |  | ||||||
|     int l_y = get_local_id(1); |  | ||||||
|     int start_x = x+src_offset_x-RADIUSX; |  | ||||||
|     int start_y = y+src_offset_y-radiusy; |  | ||||||
|     int start_addr = mad24(start_y,src_step_in_pixel,start_x); |  | ||||||
|     int i; |  | ||||||
|     float sum; |  | ||||||
|     float temp[READ_TIMES_ROW]; |  | ||||||
|  |  | ||||||
|     __local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; |  | ||||||
| #ifdef BORDER_CONSTANT |  | ||||||
|     int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); |  | ||||||
|  |  | ||||||
|     // read pixels from src |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         int current_addr = start_addr+i*LSIZE0; |  | ||||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; |  | ||||||
|         temp[i] = src[current_addr]; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // judge if read out of boundary |  | ||||||
| #ifdef BORDER_ISOLATED |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float)0,temp[i]); |  | ||||||
|         temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float)0,temp[i]); |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float)0,temp[i]); |  | ||||||
|         temp[i]= ELEM(start_y,          0, src_whole_rows, (float)0,temp[i]); |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
| #else // BORDER_CONSTANT |  | ||||||
|     int index[READ_TIMES_ROW]; |  | ||||||
|     int s_x,s_y; |  | ||||||
|     // judge if read out of boundary |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         s_x = start_x + i*LSIZE0, s_y = start_y; |  | ||||||
| #ifdef BORDER_ISOLATED |  | ||||||
|         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols); |  | ||||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); |  | ||||||
| #else |  | ||||||
|         EXTRAPOLATE(s_x, 0, src_whole_cols); |  | ||||||
|         EXTRAPOLATE(s_y, 0, src_whole_rows); |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|         index[i]=mad24(s_y, src_step_in_pixel, s_x); |  | ||||||
|     } |  | ||||||
|     // read pixels from src |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|         temp[i] = src[index[i]]; |  | ||||||
| #endif// BORDER_CONSTANT |  | ||||||
|  |  | ||||||
|     //save pixels to lds |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; |  | ||||||
|     barrier(CLK_LOCAL_MEM_FENCE); |  | ||||||
|  |  | ||||||
|     // read pixels from lds and calculate the result |  | ||||||
|     sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX]; |  | ||||||
|     for (i=1; i<=RADIUSX; i++) |  | ||||||
|     { |  | ||||||
|         temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; |  | ||||||
|         temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; |  | ||||||
|         sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i]; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // write the result to dst |  | ||||||
|     if (x<dst_cols && y<dst_rows) |  | ||||||
|     { |  | ||||||
|         start_addr = mad24(y,dst_step_in_pixel,x); |  | ||||||
|         dst[start_addr] = sum; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5 |  | ||||||
|     (__global float4 * restrict src, |  | ||||||
|      int src_step_in_pixel, |  | ||||||
|      int src_offset_x, int src_offset_y, |  | ||||||
|      int src_cols, int src_rows, |  | ||||||
|      int src_whole_cols, int src_whole_rows, |  | ||||||
|      __global float4 * dst, |  | ||||||
|      int dst_step_in_pixel, |  | ||||||
|      int dst_cols, int dst_rows, |  | ||||||
|      int radiusy) |  | ||||||
| { |  | ||||||
|     int x = get_global_id(0); |  | ||||||
|     int y = get_global_id(1); |  | ||||||
|     int l_x = get_local_id(0); |  | ||||||
|     int l_y = get_local_id(1); |  | ||||||
|     int start_x = x+src_offset_x-RADIUSX; |  | ||||||
|     int start_y = y+src_offset_y-radiusy; |  | ||||||
|     int start_addr = mad24(start_y,src_step_in_pixel,start_x); |  | ||||||
|     int i; |  | ||||||
|     float4 sum; |  | ||||||
|     float4 temp[READ_TIMES_ROW]; |  | ||||||
|  |  | ||||||
|     __local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1]; |  | ||||||
| #ifdef BORDER_CONSTANT |  | ||||||
|     int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); |  | ||||||
|  |  | ||||||
|     // read pixels from src |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         int current_addr = start_addr+i*LSIZE0; |  | ||||||
|         current_addr = ((current_addr < end_addr) && (current_addr > 0)) ? current_addr : 0; |  | ||||||
|         temp[i] = src[current_addr]; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // judge if read out of boundary |  | ||||||
| #ifdef BORDER_ISOLATED |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float4)0,temp[i]); |  | ||||||
|         temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float4)0,temp[i]); |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float4)0,temp[i]); |  | ||||||
|         temp[i]= ELEM(start_y,          0, src_whole_rows, (float4)0,temp[i]); |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
| #else |  | ||||||
|     int index[READ_TIMES_ROW]; |  | ||||||
|     int s_x,s_y; |  | ||||||
|  |  | ||||||
|     // judge if read out of boundary |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|     { |  | ||||||
|         s_x = start_x + i*LSIZE0, s_y = start_y; |  | ||||||
| #ifdef BORDER_ISOLATED |  | ||||||
|         EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols); |  | ||||||
|         EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows); |  | ||||||
| #else |  | ||||||
|         EXTRAPOLATE(s_x, 0, src_whole_cols); |  | ||||||
|         EXTRAPOLATE(s_y, 0, src_whole_rows); |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|         index[i]=mad24(s_y,src_step_in_pixel,s_x); |  | ||||||
|     } |  | ||||||
|     // read pixels from src |  | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |  | ||||||
|         temp[i] = src[index[i]]; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|     // save pixels to lds |     // save pixels to lds | ||||||
|     for (i = 0; i<READ_TIMES_ROW; i++) |     for (int i = 0; i < READ_TIMES_ROW; ++i) | ||||||
|         LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i]; |         LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i]; | ||||||
|     barrier(CLK_LOCAL_MEM_FENCE); |     barrier(CLK_LOCAL_MEM_FENCE); | ||||||
|  |  | ||||||
|     // read pixels from lds and calculate the result |     // read pixels from lds and calculate the result | ||||||
|     sum =LDS_DAT[l_y][l_x+RADIUSX]*mat_kernel[RADIUSX]; |     sum = convertToDstT(LDS_DAT[l_y][l_x + RADIUSX]) * mat_kernel[RADIUSX]; | ||||||
|     for (i=1; i<=RADIUSX; i++) |     for (int i = 1; i <= RADIUSX; ++i) | ||||||
|     { |     { | ||||||
|         temp[0]=LDS_DAT[l_y][l_x+RADIUSX-i]; |         temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i]; | ||||||
|         temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i]; |         temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; | ||||||
|         sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i]; |         sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // write the result to dst |     // write the result to dst | ||||||
|     if (x<dst_cols && y<dst_rows) |     if (x < dst_cols && y < dst_rows) | ||||||
|     { |     { | ||||||
|         start_addr = mad24(y,dst_step_in_pixel,x); |         start_addr = mad24(y, dst_step_in_pixel, x); | ||||||
|         dst[start_addr] = sum; |         dst[start_addr] = sum; | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -75,6 +75,7 @@ | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #define SRC(_x,_y) convertToWT(((global srcT*)(Src+(_y)*src_step))[_x]) | #define SRC(_x,_y) convertToWT(((global srcT*)(Src+(_y)*src_step))[_x]) | ||||||
|  | #define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x]) | ||||||
|  |  | ||||||
| #ifdef BORDER_CONSTANT | #ifdef BORDER_CONSTANT | ||||||
| // CCCCCC|abcdefgh|CCCCCCC | // CCCCCC|abcdefgh|CCCCCCC | ||||||
| @@ -83,8 +84,6 @@ | |||||||
| #define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) | #define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #define DST(_x,_y) (((global dstT*)(Dst+dst_offset+(_y)*dst_step))[_x]) |  | ||||||
|  |  | ||||||
| #define noconvert | #define noconvert | ||||||
|  |  | ||||||
| // horizontal and vertical filter kernels | // horizontal and vertical filter kernels | ||||||
| @@ -101,15 +100,15 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int | |||||||
|     // all these should be defined on host during compile time |     // all these should be defined on host during compile time | ||||||
|     // first lsmem array for source pixels used in first pass, |     // first lsmem array for source pixels used in first pass, | ||||||
|     // second lsmemDy for storing first pass results |     // second lsmemDy for storing first pass results | ||||||
|     __local WT lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX]; |     __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX]; | ||||||
|     __local WT lsmemDy[BLK_Y][BLK_X+2*RADIUSX]; |     __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX]; | ||||||
|  |  | ||||||
|     // get local and global ids - used as image and local memory array indexes |     // get local and global ids - used as image and local memory array indexes | ||||||
|     int lix = get_local_id(0); |     int lix = get_local_id(0); | ||||||
|     int liy = get_local_id(1); |     int liy = get_local_id(1); | ||||||
|  |  | ||||||
|     int x = (int)get_global_id(0); |     int x = get_global_id(0); | ||||||
|     int y = (int)get_global_id(1); |     int y = get_global_id(1); | ||||||
|  |  | ||||||
|     // calculate pixel position in source image taking image offset into account |     // calculate pixel position in source image taking image offset into account | ||||||
|     int srcX = x + srcOffsetX - RADIUSX; |     int srcX = x + srcOffsetX - RADIUSX; | ||||||
|   | |||||||
| @@ -79,12 +79,14 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool) | |||||||
|             ksize.width++; |             ksize.width++; | ||||||
|         if (1 != (ksize.height % 2)) |         if (1 != (ksize.height % 2)) | ||||||
|             ksize.height++; |             ksize.height++; | ||||||
|  |  | ||||||
|         Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); |         Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); | ||||||
|         cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1); |         cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1); | ||||||
|         temp = randomMat(Size(1, ksize.height),  CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); |         temp = randomMat(Size(1, ksize.height),  CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE); | ||||||
|         cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1); |         cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1); | ||||||
|  |  | ||||||
|         Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE); |         Size roiSize = randomSize(ksize.width + 16, MAX_VALUE, ksize.height + 20, MAX_VALUE); | ||||||
|  |         std::cout << roiSize << std::endl; | ||||||
|         int rest = roiSize.width % 4; |         int rest = roiSize.width % 4; | ||||||
|         if (0 != rest) |         if (0 != rest) | ||||||
|             roiSize.width += (4 - rest); |             roiSize.width += (4 - rest); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Ilya Lavrenov
					Ilya Lavrenov