From 168c0b038549153d45f67d46faf22bf637f745fb Mon Sep 17 00:00:00 2001 From: peng xiao Date: Thu, 2 May 2013 16:14:28 +0800 Subject: [PATCH 1/3] Optimize ocl::stereobm. 1. Use macro defines for some parameters(radius). 2. Reduce local memory usage. 3. Fix accuracy problem on Intel GPU. --- modules/ocl/src/opencl/stereobm.cl | 207 +++++++++-------------------- modules/ocl/src/stereobm.cpp | 98 ++++++-------- modules/ocl/test/test_calib3d.cpp | 16 +-- 3 files changed, 109 insertions(+), 212 deletions(-) diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index 196a786d5..ea983df01 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -16,6 +16,8 @@ // // @Authors // Jia Haipeng, jiahaipeng95@gmail.com +// Sen Liu, swjtuls1987@126.com +// Peng Xiao, pengxiao@outlook.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -50,55 +52,33 @@ #define STEREO_MIND 0 // The minimum d range to check #define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing -int SQ(int a) +#ifndef radius +#define radius 64 +#endif + +unsigned int CalcSSD(__local unsigned int *col_ssd) { - return a * a; + unsigned int cache = col_ssd[0]; + + for(int i = 1, j = radius + 1; i <= radius; i++, j++) + cache += col_ssd[i] + col_ssd[j]; + + return cache; } -unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache, - volatile __local unsigned int *col_ssd, int radius) -{ - unsigned int cache = 0; - unsigned int cache2 = 0; - - for(int i = 1; i <= radius; i++) - cache += col_ssd[i]; - - col_ssd_cache[0] = cache; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < BLOCK_W - radius) - cache2 = col_ssd_cache[radius]; - else - for(int i = radius + 1; i < (2 * radius + 1); i++) - cache2 += col_ssd[i]; - - return col_ssd[0] + cache + cache2; -} - -uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, - volatile __local unsigned int *col_ssd, int radius) +uint2 MinSSD(__local unsigned int *col_ssd) { unsigned int ssd[N_DISPARITIES]; //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); - ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * radius), radius); - barrier(CLK_LOCAL_MEM_FENCE); + ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + 2 * radius)); + ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + 2 * radius)); + ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + 2 * radius)); + ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + 2 * radius)); + ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + 2 * radius)); + ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + 2 * radius)); + ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + 2 * radius)); + ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + 2 * radius)); unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7]))); @@ -113,124 +93,67 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, } void StepDown(int idx1, int idx2, __global unsigned char* imageL, - __global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius) + __global unsigned char* imageR, int d, __local unsigned int *col_ssd) { - unsigned char leftPixel1; - unsigned char leftPixel2; - unsigned char rightPixel1[8]; - unsigned char rightPixel2[8]; - unsigned int diff1, diff2; - - leftPixel1 = imageL[idx1]; - leftPixel2 = imageL[idx2]; - - idx1 = idx1 - d; - idx2 = idx2 - d; - - rightPixel1[7] = imageR[idx1 - 7]; - rightPixel1[0] = imageR[idx1 - 0]; - rightPixel1[1] = imageR[idx1 - 1]; - rightPixel1[2] = imageR[idx1 - 2]; - rightPixel1[3] = imageR[idx1 - 3]; - rightPixel1[4] = imageR[idx1 - 4]; - rightPixel1[5] = imageR[idx1 - 5]; - rightPixel1[6] = imageR[idx1 - 6]; - - rightPixel2[7] = imageR[idx2 - 7]; - rightPixel2[0] = imageR[idx2 - 0]; - rightPixel2[1] = imageR[idx2 - 1]; - rightPixel2[2] = imageR[idx2 - 2]; - rightPixel2[3] = imageR[idx2 - 3]; - rightPixel2[4] = imageR[idx2 - 4]; - rightPixel2[5] = imageR[idx2 - 5]; - rightPixel2[6] = imageR[idx2 - 6]; - - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - diff1 = leftPixel1 - rightPixel1[0]; - diff2 = leftPixel2 - rightPixel2[0]; - col_ssd[0 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[1]; - diff2 = leftPixel2 - rightPixel2[1]; - col_ssd[1 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[2]; - diff2 = leftPixel2 - rightPixel2[2]; - col_ssd[2 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[3]; - diff2 = leftPixel2 - rightPixel2[3]; - col_ssd[3 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[4]; - diff2 = leftPixel2 - rightPixel2[4]; - col_ssd[4 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[5]; - diff2 = leftPixel2 - rightPixel2[5]; - col_ssd[5 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[6]; - diff2 = leftPixel2 - rightPixel2[6]; - col_ssd[6 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); - - diff1 = leftPixel1 - rightPixel1[7]; - diff2 = leftPixel2 - rightPixel2[7]; - col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); + uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7))); + uint8 imgR2 = convert_uint8(vload8(0, imageR + (idx2 - d - 7))); + uint8 diff1 = (uint8)(imageL[idx1]) - imgR1; + uint8 diff2 = (uint8)(imageL[idx2]) - imgR2; + uint8 res = diff2 * diff2 - diff1 * diff1; + col_ssd[0 * (BLOCK_W + 2 * radius)] += res.s7; + col_ssd[1 * (BLOCK_W + 2 * radius)] += res.s6; + col_ssd[2 * (BLOCK_W + 2 * radius)] += res.s5; + col_ssd[3 * (BLOCK_W + 2 * radius)] += res.s4; + col_ssd[4 * (BLOCK_W + 2 * radius)] += res.s3; + col_ssd[5 * (BLOCK_W + 2 * radius)] += res.s2; + col_ssd[6 * (BLOCK_W + 2 * radius)] += res.s1; + col_ssd[7 * (BLOCK_W + 2 * radius)] += res.s0; } void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, __global unsigned char* imageR, int d, - volatile __local unsigned int *col_ssd, int radius) + __local unsigned int *col_ssd) { - unsigned char leftPixel1; + uint8 leftPixel1; int idx; - unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0}; + uint8 diffa = 0; for(int i = 0; i < (2 * radius + 1); i++) { idx = y_tex * im_pitch + x_tex; - leftPixel1 = imageL[idx]; - idx = idx - d; - - diffa[0] += SQ(leftPixel1 - imageR[idx - 0]); - diffa[1] += SQ(leftPixel1 - imageR[idx - 1]); - diffa[2] += SQ(leftPixel1 - imageR[idx - 2]); - diffa[3] += SQ(leftPixel1 - imageR[idx - 3]); - diffa[4] += SQ(leftPixel1 - imageR[idx - 4]); - diffa[5] += SQ(leftPixel1 - imageR[idx - 5]); - diffa[6] += SQ(leftPixel1 - imageR[idx - 6]); - diffa[7] += SQ(leftPixel1 - imageR[idx - 7]); + leftPixel1 = (uint8)(imageL[idx]); + uint8 imgR = convert_uint8(vload8(0, imageR + (idx - d - 7))); + uint8 res = leftPixel1 - imgR; + diffa += res * res; y_tex += 1; } //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa[0]; - col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa[1]; - col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa[2]; - col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa[3]; - col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa[4]; - col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa[5]; - col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa[6]; - col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7]; + col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa.s7; + col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa.s6; + col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa.s5; + col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa.s4; + col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa.s3; + col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa.s2; + col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa.s1; + col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa.s0; } __kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right, __global unsigned int *cminSSDImage, int cminSSD_step, __global unsigned char *disp, int disp_step,int cwidth, int cheight, - int img_step, int maxdisp, int radius, + int img_step, int maxdisp, __local unsigned int *col_ssd_cache) { - - volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0); - volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; + __local unsigned int *col_ssd = col_ssd_cache + get_local_id(0); + __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; // int Y = get_group_id(1) * ROWSperTHREAD + radius; #define Y (get_group_id(1) * ROWSperTHREAD + radius) - volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; + __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y); @@ -244,14 +167,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char { y_tex = Y - radius; - InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd, radius); + InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd); if (col_ssd_extra > 0) if (x_tex + BLOCK_W < cwidth) - InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra, radius); + InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra); barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); + uint2 minSSD = MinSSD(col_ssd); if (X < cwidth - radius && Y < cheight - radius) { if (minSSD.x < minSSDImage[0]) @@ -266,19 +189,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char int idx1 = y_tex * img_step + x_tex; int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex; - barrier(CLK_GLOBAL_MEM_FENCE); - barrier(CLK_LOCAL_MEM_FENCE); - - StepDown(idx1, idx2, left, right, d, col_ssd, radius); + StepDown(idx1, idx2, left, right, d, col_ssd); if (col_ssd_extra > 0) if (x_tex + BLOCK_W < cwidth) - StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra, radius); - - y_tex += 1; + StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra); barrier(CLK_LOCAL_MEM_FENCE); - uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius); + uint2 minSSD = MinSSD(col_ssd); if (X < cwidth - radius && row < cheight - radius - Y) { int idx = row * cminSSD_step; @@ -288,10 +206,11 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char minSSDImage[idx] = minSSD.x; } } + + y_tex++; } // for row loop } // for d loop } - ////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////// Sobel Prefiler (signal channel)////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/stereobm.cpp b/modules/ocl/src/stereobm.cpp index fe3b2557d..e947e2e9b 100644 --- a/modules/ocl/src/stereobm.cpp +++ b/modules/ocl/src/stereobm.cpp @@ -74,28 +74,21 @@ namespace stereoBM //////////////////////////////////////////////////////////////////////// static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap) { - Context *clCxt = input.clCxt; - string kernelName = "prefilter_xsobel"; - cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName); size_t blockSize = 1; size_t globalThreads[3] = { input.cols, input.rows, 1 }; size_t localThreads[3] = { blockSize, blockSize, 1 }; - openCLVerifyKernel(clCxt, kernel, localThreads); - openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input.data)); - openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&output.data)); - openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&input.rows)); - openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&input.cols)); - openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&prefilterCap)); - - openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL, - globalThreads, localThreads, 0, NULL, NULL)); - - clFinish((cl_command_queue)clCxt->oclCommandQueue()); - openCLSafeCall(clReleaseKernel(kernel)); + std::vector> args; + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&prefilterCap)); + openCLExecuteKernel(Context::getContext(), &stereobm, kernelName, + globalThreads, localThreads, args, -1, -1); } ////////////////////////////////////////////////////////////////////////// //////////////////////////////common//////////////////////////////////// @@ -115,19 +108,13 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, { int winsz2 = winSize >> 1; - //if(winsz2 == 0 || winsz2 >= calles_num) - //cv::ocl:error("Unsupported window size", __FILE__, __LINE__, __FUNCTION__); - - Context *clCxt = left.clCxt; - string kernelName = "stereoKernel"; - cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName); disp.setTo(Scalar_::all(0)); minSSD_buf.setTo(Scalar_::all(0xFFFFFFFF)); size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize(); - size_t local_mem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * winsz2)) * + size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) * sizeof(cl_uint); //size_t blockSize = 1; size_t localThreads[] = { BLOCK_W, 1,1}; @@ -136,26 +123,23 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, 1 }; - openCLVerifyKernel(clCxt, kernel, localThreads); - openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&left.data)); - openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&right.data)); - openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&minSSD_buf.data)); - openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&minssd_step)); - openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&disp.data)); - openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&disp.step)); - openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); - openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.rows)); - openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&left.step)); - openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&maxdisp)); - openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&winsz2)); - openCLSafeCall(clSetKernelArg(kernel, 11, local_mem_size, (void *)NULL)); + std::vector> args; + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&minssd_step)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disp.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disp.step)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.step)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&maxdisp)); + args.push_back(std::make_pair(local_mem_size, (void *)NULL)); - openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL, - globalThreads, localThreads, 0, NULL, NULL)); - - - clFinish((cl_command_queue)clCxt->oclCommandQueue()); - openCLSafeCall(clReleaseKernel(kernel)); + char opt [128]; + sprintf(opt, "-D radius=%d", winsz2); + openCLExecuteKernel(Context::getContext(), &stereobm, kernelName, + globalThreads, localThreads, args, -1, -1, opt); } //////////////////////////////////////////////////////////////////////////// ///////////////////////////////postfilter_textureness/////////////////////// @@ -163,10 +147,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, static void postfilter_textureness(oclMat &left, int winSize, float avergeTexThreshold, oclMat &disparity) { - Context *clCxt = left.clCxt; - string kernelName = "textureness_kernel"; - cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName); size_t blockSize = 1; size_t localThreads[] = { BLOCK_W, blockSize ,1}; @@ -177,22 +158,19 @@ static void postfilter_textureness(oclMat &left, int winSize, size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float); - openCLVerifyKernel(clCxt, kernel, localThreads); - openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disparity.data)); - openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&disparity.rows)); - openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&disparity.cols)); - openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&disparity.step)); - openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&left.data)); - openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows)); - openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols)); - openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&winSize)); - openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&avergeTexThreshold)); - openCLSafeCall(clSetKernelArg(kernel, 9, local_mem_size, NULL)); - openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL, - globalThreads, localThreads, 0, NULL, NULL)); - - clFinish((cl_command_queue)clCxt->oclCommandQueue()); - openCLSafeCall(clReleaseKernel(kernel)); + std::vector> args; + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.step)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&winSize)); + args.push_back(std::make_pair(sizeof(cl_float), (void *)&avergeTexThreshold)); + args.push_back(std::make_pair(local_mem_size, (void*)NULL)); + openCLExecuteKernel(Context::getContext(), &stereobm, kernelName, + globalThreads, localThreads, args, -1, -1); } ////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////operator///////////////////////////////// diff --git a/modules/ocl/test/test_calib3d.cpp b/modules/ocl/test/test_calib3d.cpp index 179829e0e..b556e5a3c 100644 --- a/modules/ocl/test/test_calib3d.cpp +++ b/modules/ocl/test/test_calib3d.cpp @@ -59,7 +59,7 @@ PARAM_TEST_CASE(StereoMatchBM, int, int) virtual void SetUp() { n_disp = GET_PARAM(0); - winSize = GET_PARAM(1); + winSize = GET_PARAM(1); } }; @@ -69,27 +69,27 @@ TEST_P(StereoMatchBM, Regression) Mat left_image = readImage("stereobm/aloe-L.png", IMREAD_GRAYSCALE); Mat right_image = readImage("stereobm/aloe-R.png", IMREAD_GRAYSCALE); Mat disp_gold = readImage("stereobm/aloe-disp.png", IMREAD_GRAYSCALE); - ocl::oclMat d_left, d_right; - ocl::oclMat d_disp(left_image.size(), CV_8U); - Mat disp; + ocl::oclMat d_left, d_right; + ocl::oclMat d_disp(left_image.size(), CV_8U); + Mat disp; ASSERT_FALSE(left_image.empty()); ASSERT_FALSE(right_image.empty()); ASSERT_FALSE(disp_gold.empty()); - d_left.upload(left_image); - d_right.upload(right_image); + d_left.upload(left_image); + d_right.upload(right_image); ocl::StereoBM_OCL bm(0, n_disp, winSize); bm(d_left, d_right, d_disp); - d_disp.download(disp); + d_disp.download(disp); EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-3); } INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBM, testing::Combine(testing::Values(128), - testing::Values(19))); + testing::Values(19))); PARAM_TEST_CASE(StereoMatchBP, int, int, int, float, float, float, float) { From ed2199a4970f643702d0e01eca27069d8bdd4fb5 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Fri, 3 May 2013 09:54:11 +0800 Subject: [PATCH 2/3] Fix build --- modules/ocl/src/stereobm.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/ocl/src/stereobm.cpp b/modules/ocl/src/stereobm.cpp index e947e2e9b..e620c2a6d 100644 --- a/modules/ocl/src/stereobm.cpp +++ b/modules/ocl/src/stereobm.cpp @@ -80,7 +80,7 @@ static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterC size_t globalThreads[3] = { input.cols, input.rows, 1 }; size_t localThreads[3] = { blockSize, blockSize, 1 }; - std::vector> args; + std::vector< std::pair > args; args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows)); @@ -123,7 +123,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, 1 }; - std::vector> args; + std::vector< std::pair > args; args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data)); args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data)); @@ -158,7 +158,7 @@ static void postfilter_textureness(oclMat &left, int winSize, size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float); - std::vector> args; + std::vector< std::pair > args; args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols)); From 69e6d0016e5f98f43445b647605943cf020543d5 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Wed, 8 May 2013 17:29:24 +0800 Subject: [PATCH 3/3] Optimize stereobm a bit. Speedup about 30% on 6730M GPU. --- modules/ocl/src/opencl/stereobm.cl | 80 ++++++++++++++++-------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl index ea983df01..bd86a7f3f 100644 --- a/modules/ocl/src/opencl/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -60,8 +60,9 @@ unsigned int CalcSSD(__local unsigned int *col_ssd) { unsigned int cache = col_ssd[0]; - for(int i = 1, j = radius + 1; i <= radius; i++, j++) - cache += col_ssd[i] + col_ssd[j]; +#pragma unroll + for(int i = 1; i <= (radius << 1); i++) + cache += col_ssd[i]; return cache; } @@ -69,20 +70,22 @@ unsigned int CalcSSD(__local unsigned int *col_ssd) uint2 MinSSD(__local unsigned int *col_ssd) { unsigned int ssd[N_DISPARITIES]; + const int win_size = (radius << 1); - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + 2 * radius)); - ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + 2 * radius)); - ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + 2 * radius)); - ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + 2 * radius)); - ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + 2 * radius)); - ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + 2 * radius)); - ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + 2 * radius)); - ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + 2 * radius)); + //See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE) + ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + win_size)); + ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + win_size)); + ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + win_size)); + ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + win_size)); + ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + win_size)); + ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + win_size)); + ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + win_size)); + ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + win_size)); unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7]))); int bestIdx = 0; + for (int i = 0; i < N_DISPARITIES; i++) { if (mssd == ssd[i]) @@ -100,14 +103,15 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL, uint8 diff1 = (uint8)(imageL[idx1]) - imgR1; uint8 diff2 = (uint8)(imageL[idx2]) - imgR2; uint8 res = diff2 * diff2 - diff1 * diff1; - col_ssd[0 * (BLOCK_W + 2 * radius)] += res.s7; - col_ssd[1 * (BLOCK_W + 2 * radius)] += res.s6; - col_ssd[2 * (BLOCK_W + 2 * radius)] += res.s5; - col_ssd[3 * (BLOCK_W + 2 * radius)] += res.s4; - col_ssd[4 * (BLOCK_W + 2 * radius)] += res.s3; - col_ssd[5 * (BLOCK_W + 2 * radius)] += res.s2; - col_ssd[6 * (BLOCK_W + 2 * radius)] += res.s1; - col_ssd[7 * (BLOCK_W + 2 * radius)] += res.s0; + const int win_size = (radius << 1); + col_ssd[0 * (BLOCK_W + win_size)] += res.s7; + col_ssd[1 * (BLOCK_W + win_size)] += res.s6; + col_ssd[2 * (BLOCK_W + win_size)] += res.s5; + col_ssd[3 * (BLOCK_W + win_size)] += res.s4; + col_ssd[4 * (BLOCK_W + win_size)] += res.s3; + col_ssd[5 * (BLOCK_W + win_size)] += res.s2; + col_ssd[6 * (BLOCK_W + win_size)] += res.s1; + col_ssd[7 * (BLOCK_W + win_size)] += res.s0; } void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, @@ -115,28 +119,27 @@ void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imag __local unsigned int *col_ssd) { uint8 leftPixel1; - int idx; uint8 diffa = 0; - - for(int i = 0; i < (2 * radius + 1); i++) + int idx = y_tex * im_pitch + x_tex; + const int win_size = (radius << 1); + for(int i = 0; i < (win_size + 1); i++) { - idx = y_tex * im_pitch + x_tex; leftPixel1 = (uint8)(imageL[idx]); uint8 imgR = convert_uint8(vload8(0, imageR + (idx - d - 7))); uint8 res = leftPixel1 - imgR; diffa += res * res; - y_tex += 1; + idx += im_pitch; } - //See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius) - col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa.s7; - col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa.s6; - col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa.s5; - col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa.s4; - col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa.s3; - col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa.s2; - col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa.s1; - col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa.s0; + //See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE) + col_ssd[0 * (BLOCK_W + win_size)] = diffa.s7; + col_ssd[1 * (BLOCK_W + win_size)] = diffa.s6; + col_ssd[2 * (BLOCK_W + win_size)] = diffa.s5; + col_ssd[3 * (BLOCK_W + win_size)] = diffa.s4; + col_ssd[4 * (BLOCK_W + win_size)] = diffa.s3; + col_ssd[5 * (BLOCK_W + win_size)] = diffa.s2; + col_ssd[6 * (BLOCK_W + win_size)] = diffa.s1; + col_ssd[7 * (BLOCK_W + win_size)] = diffa.s0; } __kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right, @@ -145,15 +148,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char int img_step, int maxdisp, __local unsigned int *col_ssd_cache) { - __local unsigned int *col_ssd = col_ssd_cache + get_local_id(0); - __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; + __local unsigned int *col_ssd = col_ssd_cache + get_local_id(0); + __local unsigned int *col_ssd_extra = get_local_id(0) < (radius << 1) ? col_ssd + BLOCK_W : 0; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; - // int Y = get_group_id(1) * ROWSperTHREAD + radius; #define Y (get_group_id(1) * ROWSperTHREAD + radius) - __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; + __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step; __global unsigned char* disparImage = disp + X + Y * disp_step; int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y); @@ -187,7 +189,9 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char for(int row = 1; row < end_row; row++) { int idx1 = y_tex * img_step + x_tex; - int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex; + int idx2 = min(y_tex + ((radius << 1) + 1), cheight - 1) * img_step + x_tex; + + barrier(CLK_LOCAL_MEM_FENCE); StepDown(idx1, idx2, left, right, d, col_ssd); if (col_ssd_extra > 0)