diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index de7cac2fd..f8c1ad729 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -82,12 +82,6 @@ namespace cv } } - -static inline size_t divUp(size_t total, size_t grain) -{ - return (total + grain - 1) / grain; -} - static inline int calcSize(int octave, int layer) { /* Wavelet size at first layer of first octave. */ diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index c117d0b2f..42ac75840 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -1887,6 +1887,11 @@ namespace cv oclMat temp4; oclMat temp5; }; + + static inline size_t divUp(size_t total, size_t grain) + { + return (total + grain - 1) / grain; + } } } #if defined _MSC_VER && _MSC_VER >= 1200 diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp index 819c01390..0cc803d19 100644 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@ -108,13 +108,6 @@ namespace cv } } -////////////////////////////////////////////////////////////////////////// -//////////////////common///////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////// -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} ////////////////////////////////////////////////////////////////////////////// /////////////////////// add subtract multiply divide ///////////////////////// ////////////////////////////////////////////////////////////////////////////// @@ -150,10 +143,7 @@ void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -226,10 +216,7 @@ static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cols = divUp(dst.cols + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -338,10 +325,7 @@ void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, int cols = divUp(dst.cols + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -397,10 +381,7 @@ static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelN int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -515,10 +496,8 @@ static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, str int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); int cols = divUp(dst.cols + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; + int dst_step1 = dst.cols * dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); @@ -945,10 +924,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern int rows = divUp(dst.rows, 2); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -993,10 +969,7 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern int rows = isVertical ? divUp(dst.rows, 2) : dst.rows; size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -1156,10 +1129,7 @@ static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernel int depth = dst.depth(); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(dst.cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; vector > args; args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); @@ -1201,13 +1171,9 @@ static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src size_t vector_length = 1; int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1); int cols = divUp(dst.cols * channels + offset_cols, vector_length); - int rows = dst.rows; size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); @@ -1252,13 +1218,9 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat size_t vector_length = 1; int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1); int cols = divUp(dst.cols * channels + offset_cols, vector_length); - int rows = dst.rows; size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -1283,15 +1245,9 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle Angle.create(x.size(), x.type()); string kernelName = angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians"; if(angleInDegrees) - { arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase); - //cout<<"1"< > args; @@ -1333,7 +1285,7 @@ static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, o args.push_back( make_pair( sizeof(cl_mem), (void *)&dst_cart.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_cart.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_cart.offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&tmp )); @@ -1369,10 +1321,7 @@ static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &d int rows = src2.rows; size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, rows, 1 }; int tmp = angleInDegrees ? 1 : 0; vector > args; @@ -1632,10 +1581,7 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, string kernelName, cons int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -1678,10 +1624,7 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string ker int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -1739,10 +1682,7 @@ static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cols = divUp(dst.cols + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -1800,10 +1740,7 @@ void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, int cols = divUp(dst.cols + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -2096,10 +2033,7 @@ static void transpose_run(const oclMat &src, oclMat &dst, string kernelName) int cols = divUp(src.cols + offset_cols, vector_length); size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 }; - size_t globalThreads[3] = { divUp(cols, TILE_DIM) *localThreads[0], - divUp(src.rows, TILE_DIM) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, src.rows, 1 }; vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); @@ -2154,10 +2088,7 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1}; int dst_step1 = dst.cols * dst.elemSize(); int src1_step = (int) src1.step; @@ -2220,10 +2151,7 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst) int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -2268,10 +2196,7 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst) int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; @@ -2303,10 +2228,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string int rows = dst.rows; size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, rows, 1 }; int dst_step1 = dst.cols * dst.elemSize(); vector > args; diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp index 82bb01bfd..4c7b988f6 100644 --- a/modules/ocl/src/canny.cpp +++ b/modules/ocl/src/canny.cpp @@ -360,14 +360,13 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi vector< pair > args; size_t localThreads[3] = {128, 1, 1}; -#define DIVUP(a, b) ((a)+(b)-1)/(b) int count_i[1] = {0}; while(count > 0) { openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); args.clear(); - size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1}; + size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1}; args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data)); @@ -382,7 +381,6 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); std::swap(st1, st2); } -#undef DIVUP } void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols) diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index c0557980b..e252d852c 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -68,22 +68,12 @@ extern const char *filtering_adaptive_bilateral; } } -namespace -{ -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} -} - namespace { inline void normalizeAnchor(int &anchor, int ksize) { if (anchor < 0) - { anchor = ksize >> 1; - } CV_Assert(0 <= anchor && anchor < ksize); } @@ -97,9 +87,7 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize) inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size) { if (roi == Rect(0, 0, -1, -1)) - { roi = Rect(0, 0, src_size.width, src_size.height); - } CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); @@ -112,10 +100,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8 int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1; if (nDivisor) - { *nDivisor = scale; - } - Mat temp(kernel.size(), type); kernel.convertTo(temp, type, scale); Mat cont_krnl = temp.reshape(1, 1); @@ -125,9 +110,7 @@ inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8 int count = cont_krnl.cols >> 1; for (int i = 0; i < count; ++i) - { std::swap(cont_krnl.at(0, i), cont_krnl.at(0, cont_krnl.cols - 1 - i)); - } } gpu_krnl.upload(cont_krnl); @@ -627,8 +610,6 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel int localWidth = localThreads[0] + paddingPixels; int localHeight = localThreads[1] + paddingPixels; - // 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4 - // 6 = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2) size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize(); int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4}, @@ -1713,4 +1694,4 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize openCLExecuteKernel(Context::getContext(), &filtering_adaptive_bilateral, kernelName, globalThreads, localThreads, args, cn, depth, build_options); -} \ No newline at end of file +} diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 4aafb47d9..55872829a 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -124,11 +124,6 @@ namespace cv using namespace ::cv::ocl::device; -static inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} - cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_, int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_) @@ -1671,7 +1666,8 @@ void cv::ocl::device::hog::compute_hists(int nbins, { openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1, "-D CPU"); - }else + } + else { cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName); int wave_size = queryDeviceInfo(kernel); diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index ff509fb11..2ed786fe4 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -1518,11 +1518,6 @@ namespace cv // CLAHE namespace clahe { - inline int divUp(int total, int grain) - { - return (total + grain - 1) / grain * grain; - } - static void calcLut(const oclMat &src, oclMat &dst, const int tilesX, const int tilesY, const cv::Size tileSize, const int clipLimit, const float lutScale) @@ -1546,9 +1541,7 @@ namespace cv size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 }; bool is_cpu = queryDeviceInfo(); if (is_cpu) - { openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU"); - } else { cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName); @@ -1583,7 +1576,7 @@ namespace cv String kernelName = "transform"; size_t localThreads[3] = { 32, 8, 1 }; - size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 }; + size_t globalThreads[3] = { src.cols, src.rows, 1 }; openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1); } @@ -1801,10 +1794,7 @@ namespace cv } } //////////////////////////////////convolve//////////////////////////////////////////////////// -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} + static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const char **kernelString) { CV_Assert(src.depth() == CV_32FC1); @@ -1826,10 +1816,7 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, st int rows = dst.rows; size_t localThreads[3] = { 16, 16, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, rows, 1 }; vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index 564b40357..8f5fae3f8 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -285,11 +285,6 @@ namespace cv return 0; } - inline int divUp(int total, int grain) - { - return (total + grain - 1) / grain; - } - int getDevice(std::vector &oclinfo, int devicetype) { //TODO: cache oclinfo vector @@ -707,11 +702,10 @@ namespace cv if ( localThreads != NULL) { - globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0]; - globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; - globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; + globalThreads[0] = alignSize(globalThreads[0], localThreads[0]); + globalThreads[1] = alignSize(globalThreads[1], localThreads[1]); + globalThreads[2] = alignSize(globalThreads[2], localThreads[2]); - //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads); } for(size_t i = 0; i < args.size(); i ++) @@ -742,10 +736,6 @@ namespace cv execute_time = (double)(end_time - start_time) / (1000 * 1000); total_time = (double)(end_time - queue_time) / (1000 * 1000); - // cout << setiosflags(ios::left) << setw(15) << execute_time; - // cout << setiosflags(ios::left) << setw(15) << total_time - execute_time; - // cout << setiosflags(ios::left) << setw(15) << total_time << endl; - total_execute_time += execute_time; total_kernel_time += total_time; clReleaseEvent(event); diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index cd09f4402..61b6df896 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -307,11 +307,6 @@ void cv::ocl::oclMat::download(cv::Mat &m) const m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols); } -/////////////////////common////////////////////////////////////// -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} /////////////////////////////////////////////////////////////////////////// ////////////////////////////////// CopyTo ///////////////////////////////// /////////////////////////////////////////////////////////////////////////// @@ -331,11 +326,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask char compile_option[32]; sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3]; - - globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0]; - globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1]; - globalThreads[2] = 1; + size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp index 2966d53db..fc94e2f3d 100644 --- a/modules/ocl/src/mcwutil.cpp +++ b/modules/ocl/src/mcwutil.cpp @@ -71,12 +71,6 @@ namespace cv { namespace ocl { - - inline int divUp(int total, int grain) - { - return (total + grain - 1) / grain; - } - // provide additional methods for the user to interact with the command queue after a task is fired static void openCLExecuteKernel_2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3], size_t localThreads[3], vector< pair > &args, int channels, diff --git a/modules/ocl/src/optical_flow_farneback.cpp b/modules/ocl/src/optical_flow_farneback.cpp index e622446bb..618a637f0 100644 --- a/modules/ocl/src/optical_flow_farneback.cpp +++ b/modules/ocl/src/optical_flow_farneback.cpp @@ -73,11 +73,6 @@ oclMat gKer; float ig[4]; -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} - inline void setGaussianBlurKernel(const float *c_gKer, int ksizeHalf) { cv::Mat t_gKer(1, ksizeHalf + 1, CV_32FC1, const_cast(c_gKer)); @@ -88,7 +83,7 @@ static void gaussianBlurOcl(const oclMat &src, int ksizeHalf, oclMat &dst) { string kernelName("gaussianBlur"); size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { divUp(src.cols, localThreads[0]) * localThreads[0], src.rows, 1 }; + size_t globalThreads[3] = { src.cols, src.rows, 1 }; int smem_size = (localThreads[0] + 2*ksizeHalf) * sizeof(float); CV_Assert(dst.size() == src.size()); @@ -138,10 +133,7 @@ static void updateMatricesOcl(const oclMat &flowx, const oclMat &flowy, const oc { string kernelName("updateMatrices"); size_t localThreads[3] = { 32, 8, 1 }; - size_t globalThreads[3] = { divUp(flowx.cols, localThreads[0]) * localThreads[0], - divUp(flowx.rows, localThreads[1]) * localThreads[1], - 1 - }; + size_t globalThreads[3] = { flowx.cols, flowx.rows, 1 }; std::vector< std::pair > args; args.push_back(std::make_pair(sizeof(cl_mem), (void *)&M.data)); @@ -166,7 +158,7 @@ static void boxFilter5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst) string kernelName("boxFilter5"); int height = src.rows / 5; size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { divUp(src.cols, localThreads[0]) * localThreads[0], height, 1 }; + size_t globalThreads[3] = { src.cols, height, 1 }; int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float); std::vector< std::pair > args; @@ -188,10 +180,7 @@ static void updateFlowOcl(const oclMat &M, oclMat &flowx, oclMat &flowy) string kernelName("updateFlow"); int cols = divUp(flowx.cols, 4); size_t localThreads[3] = { 32, 8, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - divUp(flowx.rows, localThreads[1]) * localThreads[0], - 1 - }; + size_t globalThreads[3] = { cols, flowx.rows, 1 }; std::vector< std::pair > args; args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowx.data)); @@ -211,9 +200,8 @@ static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst) { string kernelName("gaussianBlur5"); int height = src.rows / 5; - int width = src.cols; size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { divUp(width, localThreads[0]) * localThreads[0], height, 1 }; + size_t globalThreads[3] = { src.cols, height, 1 }; int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float); std::vector< std::pair > args; @@ -222,7 +210,7 @@ static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst) args.push_back(std::make_pair(sizeof(cl_mem), (void *)&gKer.data)); args.push_back(std::make_pair(smem_size, (void *)NULL)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&height)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&width)); + args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); args.push_back(std::make_pair(sizeof(cl_int), (void *)&ksizeHalf)); diff --git a/modules/ocl/src/split_merge.cpp b/modules/ocl/src/split_merge.cpp index de3d2700a..79bd0f0e2 100644 --- a/modules/ocl/src/split_merge.cpp +++ b/modules/ocl/src/split_merge.cpp @@ -73,61 +73,6 @@ namespace cv { namespace split_merge { - /////////////////////////////////////////////////////////// - ///////////////common///////////////////////////////////// - ///////////////////////////////////////////////////////// - inline int divUp(int total, int grain) - { - return (total + grain - 1) / grain; - } - //////////////////////////////////////////////////////////////////////////// - ////////////////////merge////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - // static void merge_vector_run_no_roi(const oclMat *mat_src, size_t n, oclMat &mat_dst) - // { - // Context *clCxt = mat_dst.clCxt; - // int channels = mat_dst.oclchannels(); - // int depth = mat_dst.depth(); - - // string kernelName = "merge_vector"; - - // int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0}, - // {4, 4, 2, 2, 1, 1, 1}, - // {4, 4, 2, 2 , 1, 1, 1}, - // {4, 4, 2, 2, 1, 1, 1} - // }; - - // size_t index = indexes[channels - 1][mat_dst.depth()]; - // int cols = divUp(mat_dst.cols, index); - // size_t localThreads[3] = { 64, 4, 1 }; - // size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - // divUp(mat_dst.rows, localThreads[1]) *localThreads[1], - // 1 - // }; - - // vector > args; - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.rows)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst.data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst.step)); - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[0].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[0].step)); - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[1].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[1].step)); - // if(n >= 3) - // { - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step)); - // } - // if(n >= 4) - // { - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[3].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[3].step)); - // } - - // openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth); - // } - static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst) { if(!mat_dst.clCxt->supportsFeature(Context::CL_DOUBLE) && mat_dst.type() == CV_64F) @@ -153,10 +98,7 @@ namespace cv int cols = divUp(mat_dst.cols + offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(mat_dst.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[3] = { cols, mat_dst.rows, 1 }; int dst_step1 = mat_dst.cols * mat_dst.elemSize(); vector > args; @@ -176,10 +118,6 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].step)); args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src[2].offset)); - // if channel == 3, then the matrix will convert to channel =4 - //if(n == 3) - // args.push_back( make_pair( sizeof(cl_int), (void *)&offset_cols)); - if(n == 3) { args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src[2].data)); @@ -229,53 +167,6 @@ namespace cv mat_dst.create(size, CV_MAKETYPE(depth, total_channels)); merge_vector_run(mat_src, n, mat_dst); } - //////////////////////////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////split///////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////////// - // static void split_vector_run_no_roi(const oclMat &mat_src, oclMat *mat_dst) - // { - // Context *clCxt = mat_src.clCxt; - // int channels = mat_src.oclchannels(); - // int depth = mat_src.depth(); - - // string kernelName = "split_vector"; - - // int indexes[4][7] = {{0, 0, 0, 0, 0, 0, 0}, - // {8, 8, 8, 8, 4, 4, 2}, - // {8, 8, 8, 8 , 4, 4, 4}, - // {4, 4, 2, 2, 1, 1, 1} - // }; - - // size_t index = indexes[channels - 1][mat_dst[0].depth()]; - // int cols = divUp(mat_src.cols, index); - // size_t localThreads[3] = { 64, 4, 1 }; - // size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - // divUp(mat_src.rows, localThreads[1]) *localThreads[1], - // 1 - // }; - - // vector > args; - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[0].step)); - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[1].step)); - // if(channels >= 3) - // { - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[2].step)); - // } - // if(channels >= 4) - // { - // args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data)); - // args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[3].step)); - // } - - // openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth); - // } static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst) { @@ -311,9 +202,7 @@ namespace cv : divUp(mat_src.cols + max_offset_cols, vector_length); size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0], - divUp(mat_src.rows, localThreads[1]) *localThreads[1], 1 - }; + size_t globalThreads[3] = { cols, mat_src.rows, 1 }; int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize(); vector > args; diff --git a/modules/ocl/src/stereo_csbp.cpp b/modules/ocl/src/stereo_csbp.cpp index 1ae70c07d..b119eadf9 100644 --- a/modules/ocl/src/stereo_csbp.cpp +++ b/modules/ocl/src/stereo_csbp.cpp @@ -96,13 +96,6 @@ namespace cv { namespace stereoCSBP { - ////////////////////////////////////////////////////////////////////////// - //////////////////////////////common//////////////////////////////////// - //////////////////////////////////////////////////////////////////////// - static inline int divUp(int total, int grain) - { - return (total + grain - 1) / grain; - } static string get_kernel_name(string kernel_name, int data_type) { stringstream idxStr; @@ -132,10 +125,7 @@ namespace cv //size_t blockSize = 256; size_t localThreads[] = {32, 8 ,1}; - size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], - divUp(h, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[] = { w, h, 1 }; int cdisp_step1 = msg_step * h; openCLVerifyKernel(clCxt, kernel, localThreads); @@ -177,7 +167,7 @@ namespace cv const int threadsNum = 256; //size_t blockSize = threadsNum; size_t localThreads[3] = {win_size, 1, threadsNum / win_size}; - size_t globalThreads[3] = {w *localThreads[0], + size_t globalThreads[3] = { w *localThreads[0], h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2] }; @@ -222,10 +212,7 @@ namespace cv //size_t blockSize = 256; size_t localThreads[] = {32, 8 ,1}; - size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], - divUp(h, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[] = { w, h, 1 }; int disp_step = msg_step * h; openCLVerifyKernel(clCxt, kernel, localThreads); @@ -257,10 +244,7 @@ namespace cv //size_t blockSize = 256; size_t localThreads[] = {32, 8, 1}; - size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], - divUp(h, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[] = { w, h, 1 }; int disp_step = msg_step * h; openCLVerifyKernel(clCxt, kernel, localThreads); @@ -291,14 +275,10 @@ namespace cv init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level); if(rthis.use_local_init_data_cost == true) - { get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step); - } else - { get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step); - } } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -317,12 +297,8 @@ namespace cv cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); - //size_t blockSize = 256; - size_t localThreads[] = {32, 8, 1}; - size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], - divUp(h, localThreads[1]) *localThreads[1], - 1 - }; + size_t localThreads[] = { 32, 8, 1 }; + size_t globalThreads[] = { w, h, 1 }; int disp_step1 = msg_step1 * h; int disp_step2 = msg_step2 * h2; @@ -366,8 +342,8 @@ namespace cv const size_t threadsNum = 256; //size_t blockSize = threadsNum; - size_t localThreads[3] = {win_size, 1, threadsNum / win_size}; - size_t globalThreads[3] = {w *localThreads[0], + size_t localThreads[3] = { win_size, 1, threadsNum / win_size }; + size_t globalThreads[3] = { w *localThreads[0], h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2] }; @@ -431,10 +407,7 @@ namespace cv //size_t blockSize = 256; size_t localThreads[] = {32, 8, 1}; - size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], - divUp(h, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[] = { w, h, 1 }; int disp_step1 = msg_step1 * h; int disp_step2 = msg_step2 * h2; @@ -535,10 +508,7 @@ namespace cv //size_t blockSize = 256; size_t localThreads[] = {32, 8, 1}; - size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) *localThreads[0], - divUp(disp.rows, localThreads[1]) *localThreads[1], - 1 - }; + size_t globalThreads[] = { disp.cols, disp.rows, 1 }; int step_size = disp.step / disp.elemSize(); int disp_step = disp.rows * msg_step; diff --git a/modules/ocl/src/stereobm.cpp b/modules/ocl/src/stereobm.cpp index 151a7eae2..8195346c0 100644 --- a/modules/ocl/src/stereobm.cpp +++ b/modules/ocl/src/stereobm.cpp @@ -96,10 +96,7 @@ static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterC #define N_DISPARITIES 8 #define ROWSperTHREAD 21 #define BLOCK_W 128 -static inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} + //////////////////////////////////////////////////////////////////////////// ///////////////////////////////stereoBM_GPU//////////////////////////////// //////////////////////////////////////////////////////////////////////////// @@ -117,11 +114,10 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp, size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) * sizeof(cl_uint); //size_t blockSize = 1; - size_t localThreads[] = { BLOCK_W, 1,1}; - size_t globalThreads[] = { divUp(left.cols - maxdisp - 2 * winsz2, BLOCK_W) *BLOCK_W, + size_t localThreads[] = { BLOCK_W, 1, 1 }; + size_t globalThreads[] = { left.cols - maxdisp - 2 * winsz2, divUp(left.rows - 2 * winsz2, ROWSperTHREAD), - 1 - }; + 1 }; std::vector< std::pair > args; args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data)); @@ -151,10 +147,9 @@ static void postfilter_textureness(oclMat &left, int winSize, size_t blockSize = 1; size_t localThreads[] = { BLOCK_W, blockSize ,1}; - size_t globalThreads[] = { divUp(left.cols, BLOCK_W) *BLOCK_W, + size_t globalThreads[] = { left.cols, divUp(left.rows, 2 * ROWSperTHREAD), - 1 - }; + 1 }; size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float); diff --git a/modules/ocl/src/stereobp.cpp b/modules/ocl/src/stereobp.cpp index 4a326b814..fe9136057 100644 --- a/modules/ocl/src/stereobp.cpp +++ b/modules/ocl/src/stereobp.cpp @@ -104,10 +104,7 @@ namespace cv { openCLFree(cl_con_struct); } - static inline int divUp(int total, int grain) - { - return (total + grain - 1) / grain; - } + ///////////////////////////////////////////////////////////////////////////// ///////////////////////////comp data//////////////////////////////////////// /////////////////////////////////////////////////////////////////////////