From 730ead44fedeabb6f30f31f2219dc30007d7c6e2 Mon Sep 17 00:00:00 2001 From: vbystricky Date: Thu, 26 Jun 2014 12:46:03 +0400 Subject: [PATCH 1/2] Optimize OpenCL version of sepFilter2D --- modules/imgproc/src/filter.cpp | 2 +- .../src/opencl/filterSep_singlePass.cl | 90 ++++++++++++------- 2 files changed, 59 insertions(+), 33 deletions(-) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index e51986c39..d23de91ea 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3492,7 +3492,7 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, return false; size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize }; - size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) }; + size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), optimizedSepFilterLocalSize}; char cvt[2][40]; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 3952577d7..6c3bbdc16 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -119,17 +119,15 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int int liy = get_local_id(1); int x = get_global_id(0); - int y = get_global_id(1); // calculate pixel position in source image taking image offset into account int srcX = x + srcOffsetX - RADIUSX; - int srcY = y + srcOffsetY - RADIUSY; // extrapolate coordinates, if needed // and read my own source pixel into local memory // with account for extra border pixels, which will be read by starting workitems int clocY = liy; - int cSrcY = srcY; + int cSrcY = liy + srcOffsetY - RADIUSY; do { int yb = cSrcY; @@ -154,48 +152,76 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int while (clocY < BLK_Y+(RADIUSY*2)); barrier(CLK_LOCAL_MEM_FENCE); - // do vertical filter pass - // and store intermediate results to second local memory array - int i, clocX = lix; - WT sum = (WT) 0; - do + for (int y = 0; y < dst_rows; y+=BLK_Y) { - sum = (WT) 0; - for (i=0; i<=2*RADIUSY; i++) + // do vertical filter pass + // and store intermediate results to second local memory array + int i, clocX = lix; + WT sum = (WT) 0; + do + { + sum = (WT) 0; + for (i=0; i<=2*RADIUSY; i++) #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) - sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum); + sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum); #else - sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum); + sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum); #endif - lsmemDy[liy][clocX] = sum; - clocX += BLK_X; - } - while(clocX < BLK_X+(RADIUSX*2)); - barrier(CLK_LOCAL_MEM_FENCE); + lsmemDy[liy][clocX] = sum; + clocX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); - // if this pixel happened to be out of image borders because of global size rounding, - // then just return - if( x >= dst_cols || y >=dst_rows ) - return; - - // do second horizontal filter pass - // and calculate final result - sum = 0.0f; - for (i=0; i<=2*RADIUSX; i++) + // if this pixel happened to be out of image borders because of global size rounding, + // then just return + if ((x < dst_cols) && (y + liy < dst_rows)) + { + // do second horizontal filter pass + // and calculate final result + sum = 0.0f; + for (i=0; i<=2*RADIUSX; i++) #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) - sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); + sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #else - sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); + sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); #endif #ifdef INTEGER_ARITHMETIC #ifdef INTEL_DEVICE - sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); + sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); #else - sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; + sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; #endif #endif + // store result into destination image + storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset))); + } + + for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y) + { + int clocX = i % (BLK_X+(RADIUSX*2)); + int clocY = i / (BLK_X+(RADIUSX*2)); + lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int cSrcY = y + BLK_Y + liy + srcOffsetY + RADIUSY; + EXTRAPOLATE(cSrcY, (height)); + + clocX = lix; + int cSrcX = x + srcOffsetX - RADIUSX; + do + { + int xb = cSrcX; + EXTRAPOLATE(xb,(width)); + lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, cSrcY, (width), (height), 0 ); + + clocX += BLK_X; + cSrcX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); + } - // store result into destination image - storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset))); } From 1a73aa1f6a3e25cdbe749e4ceaf1a75ece0fb6fb Mon Sep 17 00:00:00 2001 From: vbystricky Date: Thu, 26 Jun 2014 15:43:40 +0400 Subject: [PATCH 2/2] Change local size --- modules/imgproc/src/filter.cpp | 11 ++++++----- modules/imgproc/src/opencl/filterSep_singlePass.cl | 10 ++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index d23de91ea..6c0da79cc 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -3471,7 +3471,8 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY return k.run(2, globalsize, localsize, false); } -const int optimizedSepFilterLocalSize = 16; +const int optimizedSepFilterLocalWidth = 16; +const int optimizedSepFilterLocalHeight = 8; static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, Mat row_kernel, Mat col_kernel, @@ -3491,8 +3492,8 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, borderType == BORDER_REFLECT_101)) return false; - size_t lt2[2] = { optimizedSepFilterLocalSize, optimizedSepFilterLocalSize }; - size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), optimizedSepFilterLocalSize}; + size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; + size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; char cvt[2][40]; const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", @@ -3584,8 +3585,8 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, } CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalSize + anchor.x && - imgSize.height > optimizedSepFilterLocalSize + anchor.y && + imgSize.width > optimizedSepFilterLocalWidth + anchor.x && + imgSize.height > optimizedSepFilterLocalHeight + anchor.y && (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), diff --git a/modules/imgproc/src/opencl/filterSep_singlePass.cl b/modules/imgproc/src/opencl/filterSep_singlePass.cl index 6c3bbdc16..8c14f2d77 100644 --- a/modules/imgproc/src/opencl/filterSep_singlePass.cl +++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl @@ -127,10 +127,9 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int // and read my own source pixel into local memory // with account for extra border pixels, which will be read by starting workitems int clocY = liy; - int cSrcY = liy + srcOffsetY - RADIUSY; do { - int yb = cSrcY; + int yb = clocY + srcOffsetY - RADIUSY; EXTRAPOLATE(yb, (height)); int clocX = lix; @@ -147,7 +146,6 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int while(clocX < BLK_X+(RADIUSX*2)); clocY += BLK_Y; - cSrcY += BLK_Y; } while (clocY < BLK_Y+(RADIUSY*2)); barrier(CLK_LOCAL_MEM_FENCE); @@ -206,8 +204,8 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int } barrier(CLK_LOCAL_MEM_FENCE); - int cSrcY = y + BLK_Y + liy + srcOffsetY + RADIUSY; - EXTRAPOLATE(cSrcY, (height)); + int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY; + EXTRAPOLATE(yb, (height)); clocX = lix; int cSrcX = x + srcOffsetX - RADIUSX; @@ -215,7 +213,7 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int { int xb = cSrcX; EXTRAPOLATE(xb,(width)); - lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, cSrcY, (width), (height), 0 ); + lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 ); clocX += BLK_X; cSrcX += BLK_X;