From 7285341083a1210db1485eee1075540fa5ebc884 Mon Sep 17 00:00:00 2001 From: peng xiao Date: Mon, 21 Oct 2013 10:21:37 +0800 Subject: [PATCH] 1. Let btvRegWeights to be constant per `process` call. 2. Let Farneback to be the default optical flow method. 3. Fix a timing method bug for ocl path. 4. Remove useless finish operation in farneback 5. Refactor buffer usage. --- modules/ocl/src/optical_flow_farneback.cpp | 2 - modules/superres/src/btv_l1_ocl.cpp | 78 +++------ modules/superres/src/opencl/superres_btvl1.cl | 163 ++++++------------ samples/gpu/super_resolution.cpp | 6 +- 4 files changed, 84 insertions(+), 165 deletions(-) diff --git a/modules/ocl/src/optical_flow_farneback.cpp b/modules/ocl/src/optical_flow_farneback.cpp index 05a850bd1..050375820 100644 --- a/modules/ocl/src/optical_flow_farneback.cpp +++ b/modules/ocl/src/optical_flow_farneback.cpp @@ -336,8 +336,6 @@ void cv::ocl::FarnebackOpticalFlow::updateFlow_boxFilter( swap(M, bufM); - finish(); - optflow_farneback::updateFlowOcl(M, flowx, flowy); if (updateMatrices) diff --git a/modules/superres/src/btv_l1_ocl.cpp b/modules/superres/src/btv_l1_ocl.cpp index 432d2368a..b4f4acdaf 100644 --- a/modules/superres/src/btv_l1_ocl.cpp +++ b/modules/superres/src/btv_l1_ocl.cpp @@ -70,6 +70,7 @@ namespace cv { float* btvWeights_ = NULL; size_t btvWeights_size = 0; + oclMat c_btvRegWeights; } } @@ -82,10 +83,6 @@ namespace btv_l1_device_ocl void upscale(const oclMat& src, oclMat& dst, int scale); - float diffSign(float a, float b); - - Point3f diffSign(Point3f a, Point3f b); - void diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst); void calcBtvRegularization(const oclMat& src, oclMat& dst, int ksize); @@ -165,20 +162,6 @@ void btv_l1_device_ocl::upscale(const oclMat& src, oclMat& dst, int scale) } -float btv_l1_device_ocl::diffSign(float a, float b) -{ - return a > b ? 1.0f : a < b ? -1.0f : 0.0f; -} - -Point3f btv_l1_device_ocl::diffSign(Point3f a, Point3f b) -{ - return Point3f( - a.x > b.x ? 1.0f : a.x < b.x ? -1.0f : 0.0f, - a.y > b.y ? 1.0f : a.y < b.y ? -1.0f : 0.0f, - a.z > b.z ? 1.0f : a.z < b.z ? -1.0f : 0.0f - ); -} - void btv_l1_device_ocl::diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst) { Context* clCxt = Context::getContext(); @@ -228,12 +211,6 @@ void btv_l1_device_ocl::calcBtvRegularization(const oclMat& src, oclMat& dst, in int cn = src.oclchannels(); - cl_mem c_btvRegWeights; - size_t count = btvWeights_size * sizeof(float); - c_btvRegWeights = openCLCreateBuffer(clCxt, CL_MEM_READ_ONLY, count); - int cl_safe_check = clEnqueueWriteBuffer(getClCommandQueue(clCxt), c_btvRegWeights, 1, 0, count, btvWeights_, 0, NULL, NULL); - CV_Assert(cl_safe_check == CL_SUCCESS); - args.push_back(make_pair(sizeof(cl_mem), (void*)&src_.data)); args.push_back(make_pair(sizeof(cl_mem), (void*)&dst_.data)); args.push_back(make_pair(sizeof(cl_int), (void*)&src_step)); @@ -242,11 +219,9 @@ void btv_l1_device_ocl::calcBtvRegularization(const oclMat& src, oclMat& dst, in args.push_back(make_pair(sizeof(cl_int), (void*)&src.cols)); args.push_back(make_pair(sizeof(cl_int), (void*)&ksize)); args.push_back(make_pair(sizeof(cl_int), (void*)&cn)); - args.push_back(make_pair(sizeof(cl_mem), (void*)&c_btvRegWeights)); + args.push_back(make_pair(sizeof(cl_mem), (void*)&c_btvRegWeights.data)); openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1); - cl_safe_check = clReleaseMemObject(c_btvRegWeights); - CV_Assert(cl_safe_check == CL_SUCCESS); } namespace @@ -321,9 +296,6 @@ namespace { CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); - dst.create(src.rows * scale, src.cols * scale, src.type()); - dst.setTo(Scalar::all(0)); - btv_l1_device_ocl::upscale(src, dst, scale); } @@ -351,12 +323,13 @@ namespace btvWeights_ = &btvWeights[0]; btvWeights_size = size; + Mat btvWeights_mheader(1, static_cast(size), CV_32FC1, btvWeights_); + c_btvRegWeights = btvWeights_mheader; } void calcBtvRegularization(const oclMat& src, oclMat& dst, int btvKernelSize) { dst.create(src.size(), src.type()); - dst.setTo(Scalar::all(0)); const int ksize = (btvKernelSize - 1) / 2; @@ -407,7 +380,7 @@ namespace oclMat highRes_; vector diffTerms_; - vector a_, b_, c_; + oclMat a_, b_, c_, d_; oclMat regTerm_; }; @@ -421,7 +394,7 @@ namespace btvKernelSize_ = 7; blurKernelSize_ = 5; blurSigma_ = 0.0; - opticalFlow_ = createOptFlow_DualTVL1_OCL(); + opticalFlow_ = createOptFlow_Farneback_OCL(); curBlurKernelSize_ = -1; curBlurSigma_ = -1.0; @@ -487,34 +460,36 @@ namespace // iterations diffTerms_.resize(src.size()); - a_.resize(src.size()); - b_.resize(src.size()); - c_.resize(src.size()); - + bool d_inited = false; + a_.create(highRes_.size(), highRes_.type()); + b_.create(highRes_.size(), highRes_.type()); + c_.create(lowResSize, highRes_.type()); + d_.create(highRes_.rows, highRes_.cols, highRes_.type()); for (int i = 0; i < iterations_; ++i) { + if(!d_inited) + { + d_.setTo(0); + d_inited = true; + } for (size_t k = 0; k < src.size(); ++k) { diffTerms_[k].create(highRes_.size(), highRes_.type()); - a_[k].create(highRes_.size(), highRes_.type()); - b_[k].create(highRes_.size(), highRes_.type()); - c_[k].create(lowResSize, highRes_.type()); - // a = M * Ih - ocl::remap(highRes_, a_[k], backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar()); + ocl::remap(highRes_, a_, backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar()); // b = HM * Ih - filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1)); + filters_[k]->apply(a_, b_, Rect(0,0,-1,-1)); // c = DHF * Ih - ocl::resize(b_[k], c_[k], lowResSize, 0, 0, INTER_NEAREST); + ocl::resize(b_, c_, lowResSize, 0, 0, INTER_NEAREST); - diffSign(src[k], c_[k], c_[k]); + diffSign(src[k], c_, c_); // a = Dt * diff - upscale(c_[k], a_[k], scale_); + upscale(c_, d_, scale_); // b = HtDt * diff - filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1)); + filters_[k]->apply(d_, b_, Rect(0,0,-1,-1)); // diffTerm = MtHtDt * diff - ocl::remap(b_[k], diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar()); + ocl::remap(b_, diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar()); } if (lambda_ > 0) @@ -549,10 +524,11 @@ namespace highRes_.release(); diffTerms_.clear(); - a_.clear(); - b_.clear(); - c_.clear(); + a_.release(); + b_.release(); + c_.release(); regTerm_.release(); + c_btvRegWeights.release(); } //////////////////////////////////////////////////////////// diff --git a/modules/superres/src/opencl/superres_btvl1.cl b/modules/superres/src/opencl/superres_btvl1.cl index 472062323..a08227491 100644 --- a/modules/superres/src/opencl/superres_btvl1.cl +++ b/modules/superres/src/opencl/superres_btvl1.cl @@ -44,24 +44,24 @@ //M*/ __kernel void buildMotionMapsKernel(__global float* forwardMotionX, - __global float* forwardMotionY, - __global float* backwardMotionX, - __global float* backwardMotionY, - __global float* forwardMapX, - __global float* forwardMapY, - __global float* backwardMapX, - __global float* backwardMapY, - int forwardMotionX_row, - int forwardMotionX_col, - int forwardMotionX_step, - int forwardMotionY_step, - int backwardMotionX_step, - int backwardMotionY_step, - int forwardMapX_step, - int forwardMapY_step, - int backwardMapX_step, - int backwardMapY_step - ) + __global float* forwardMotionY, + __global float* backwardMotionX, + __global float* backwardMotionY, + __global float* forwardMapX, + __global float* forwardMapY, + __global float* backwardMapX, + __global float* backwardMapY, + int forwardMotionX_row, + int forwardMotionX_col, + int forwardMotionX_step, + int forwardMotionY_step, + int backwardMotionX_step, + int backwardMotionY_step, + int forwardMapX_step, + int forwardMapY_step, + int backwardMapX_step, + int backwardMapY_step + ) { int x = get_global_id(0); int y = get_global_id(1); @@ -83,14 +83,14 @@ __kernel void buildMotionMapsKernel(__global float* forwardMotionX, } __kernel void upscaleKernel(__global float* src, - __global float* dst, - int src_step, - int dst_step, - int src_row, - int src_col, - int scale, - int channels - ) + __global float* dst, + int src_step, + int dst_step, + int src_row, + int src_col, + int scale, + int channels + ) { int x = get_global_id(0); int y = get_global_id(1); @@ -100,17 +100,10 @@ __kernel void upscaleKernel(__global float* src, if(channels == 1) { dst[y * scale * dst_step + x * scale] = src[y * src_step + x]; - }else if(channels == 3) + } + else { - dst[y * channels * scale * dst_step + 3 * x * scale + 0] = src[y * channels * src_step + 3 * x + 0]; - dst[y * channels * scale * dst_step + 3 * x * scale + 1] = src[y * channels * src_step + 3 * x + 1]; - dst[y * channels * scale * dst_step + 3 * x * scale + 2] = src[y * channels * src_step + 3 * x + 2]; - }else - { - dst[y * channels * scale * dst_step + 4 * x * scale + 0] = src[y * channels * src_step + 4 * x + 0]; - dst[y * channels * scale * dst_step + 4 * x * scale + 1] = src[y * channels * src_step + 4 * x + 1]; - dst[y * channels * scale * dst_step + 4 * x * scale + 2] = src[y * channels * src_step + 4 * x + 2]; - dst[y * channels * scale * dst_step + 4 * x * scale + 3] = src[y * channels * src_step + 4 * x + 3]; + vstore4(vload4(0, src + y * channels * src_step + 4 * x), 0, dst + y * channels * scale * dst_step + 4 * x * scale); } } } @@ -121,15 +114,6 @@ float diffSign(float a, float b) return a > b ? 1.0f : a < b ? -1.0f : 0.0f; } -float3 diffSign3(float3 a, float3 b) -{ - float3 pos; - pos.x = a.x > b.x ? 1.0f : a.x < b.x ? -1.0f : 0.0f; - pos.y = a.y > b.y ? 1.0f : a.y < b.y ? -1.0f : 0.0f; - pos.z = a.z > b.z ? 1.0f : a.z < b.z ? -1.0f : 0.0f; - return pos; -} - float4 diffSign4(float4 a, float4 b) { float4 pos; @@ -141,13 +125,13 @@ float4 diffSign4(float4 a, float4 b) } __kernel void diffSignKernel(__global float* src1, - __global float* src2, - __global float* dst, - int src1_row, - int src1_col, - int dst_step, - int src1_step, - int src2_step) + __global float* src2, + __global float* dst, + int src1_row, + int src1_col, + int dst_step, + int src1_step, + int src2_step) { int x = get_global_id(0); int y = get_global_id(1); @@ -156,19 +140,18 @@ __kernel void diffSignKernel(__global float* src1, { dst[y * dst_step + x] = diffSign(src1[y * src1_step + x], src2[y * src2_step + x]); } - barrier(CLK_LOCAL_MEM_FENCE); } __kernel void calcBtvRegularizationKernel(__global float* src, - __global float* dst, - int src_step, - int dst_step, - int src_row, - int src_col, - int ksize, - int channels, - __global float* c_btvRegWeights - ) + __global float* dst, + int src_step, + int dst_step, + int src_row, + int src_col, + int ksize, + int channels, + __constant float* c_btvRegWeights + ) { int x = get_global_id(0) + ksize; int y = get_global_id(1) + ksize; @@ -180,57 +163,19 @@ __kernel void calcBtvRegularizationKernel(__global float* src, const float srcVal = src[y * src_step + x]; float dstVal = 0.0f; - for (int m = 0, count = 0; m <= ksize; ++m) - { - for (int l = ksize; l + m >= 0; --l, ++count) - dstVal = dstVal + c_btvRegWeights[count] * (diffSign(srcVal, src[(y + m) * src_step + (x + l)]) - diffSign(src[(y - m) * src_step + (x - l)], srcVal)); - } - dst[y * dst_step + x] = dstVal; - }else if(channels == 3) - { - float3 srcVal; - srcVal.x = src[y * src_step + 3 * x + 0]; - srcVal.y = src[y * src_step + 3 * x + 1]; - srcVal.z = src[y * src_step + 3 * x + 2]; - - float3 dstVal; - dstVal.x = 0.0f; - dstVal.y = 0.0f; - dstVal.z = 0.0f; - for (int m = 0, count = 0; m <= ksize; ++m) { for (int l = ksize; l + m >= 0; --l, ++count) { - float3 src1; - src1.x = src[(y + m) * src_step + 3 * (x + l) + 0]; - src1.y = src[(y + m) * src_step + 3 * (x + l) + 1]; - src1.z = src[(y + m) * src_step + 3 * (x + l) + 2]; - - float3 src2; - src2.x = src[(y - m) * src_step + 3 * (x - l) + 0]; - src2.y = src[(y - m) * src_step + 3 * (x - l) + 1]; - src2.z = src[(y - m) * src_step + 3 * (x - l) + 2]; - - dstVal = dstVal + c_btvRegWeights[count] * (diffSign3(srcVal, src1) - diffSign3(src2, srcVal)); + dstVal = dstVal + c_btvRegWeights[count] * (diffSign(srcVal, src[(y + m) * src_step + (x + l)]) - diffSign(src[(y - m) * src_step + (x - l)], srcVal)); } } - dst[y * dst_step + 3 * x + 0] = dstVal.x; - dst[y * dst_step + 3 * x + 1] = dstVal.y; - dst[y * dst_step + 3 * x + 2] = dstVal.z; - }else + dst[y * dst_step + x] = dstVal; + } + else { - float4 srcVal; - srcVal.x = src[y * src_step + 4 * x + 0];//r type =float - srcVal.y = src[y * src_step + 4 * x + 1];//g - srcVal.z = src[y * src_step + 4 * x + 2];//b - srcVal.w = src[y * src_step + 4 * x + 3];//a - - float4 dstVal; - dstVal.x = 0.0f; - dstVal.y = 0.0f; - dstVal.z = 0.0f; - dstVal.w = 0.0f; + float4 srcVal = vload4(0, src + y * src_step + 4 * x); + float4 dstVal = 0.f; for (int m = 0, count = 0; m <= ksize; ++m) { @@ -249,13 +194,9 @@ __kernel void calcBtvRegularizationKernel(__global float* src, src2.w = src[(y - m) * src_step + 4 * (x - l) + 3]; dstVal = dstVal + c_btvRegWeights[count] * (diffSign4(srcVal, src1) - diffSign4(src2, srcVal)); - } } - dst[y * dst_step + 4 * x + 0] = dstVal.x; - dst[y * dst_step + 4 * x + 1] = dstVal.y; - dst[y * dst_step + 4 * x + 2] = dstVal.z; - dst[y * dst_step + 4 * x + 3] = dstVal.w; + vstore4(dstVal, 0, dst + y * dst_step + 4 * x); } } } diff --git a/samples/gpu/super_resolution.cpp b/samples/gpu/super_resolution.cpp index 435e711a1..6efd24144 100644 --- a/samples/gpu/super_resolution.cpp +++ b/samples/gpu/super_resolution.cpp @@ -221,7 +221,11 @@ int main(int argc, const char* argv[]) if(useOcl) { - MEASURE_TIME(superRes->nextFrame(result_)); + MEASURE_TIME( + { + superRes->nextFrame(result_); + ocl::finish(); + }); } else #endif