From 6da5d2133180e3a819e64dcec37692db2309d757 Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Sun, 1 Dec 2013 03:12:19 +0400 Subject: [PATCH] fixed many bugs related to Mat::getUMat(), asynchronous kernel execution etc. Also, played a bit with ocl::cvtColor vs cv::cvtColor performance --- modules/core/include/opencv2/core/mat.hpp | 19 +- modules/core/include/opencv2/core/mat.inl.hpp | 8 - modules/core/src/matrix.cpp | 251 ++++++++++-------- modules/core/src/ocl.cpp | 117 +++++--- modules/core/src/umatrix.cpp | 45 +++- modules/core/test/test_umat.cpp | 37 +++ modules/imgproc/src/color.cpp | 7 +- modules/imgproc/src/opencl/cvtcolor.cl | 24 ++ modules/ocl/test/test_api.cpp | 59 ++++ modules/python/src2/cv2.cpp | 42 +-- 10 files changed, 401 insertions(+), 208 deletions(-) diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 2f38f8bbb..2cf1e35dc 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -279,21 +279,22 @@ public: //virtual void allocate(int dims, const int* sizes, int type, int*& refcount, // uchar*& datastart, uchar*& data, size_t* step) = 0; //virtual void deallocate(int* refcount, uchar* datastart, uchar* data) = 0; - virtual UMatData* allocate(int dims, const int* sizes, - int type, size_t* step) const = 0; + virtual UMatData* allocate(int dims, const int* sizes, int type, + void* data, size_t* step, int flags) const = 0; virtual bool allocate(UMatData* data, int accessflags) const = 0; virtual void deallocate(UMatData* data) const = 0; - virtual void map(UMatData* data, int accessflags) const = 0; - virtual void unmap(UMatData* data) const = 0; + virtual void sync(UMatData* u) const; + virtual void map(UMatData* data, int accessflags) const; + virtual void unmap(UMatData* data) const; virtual void download(UMatData* data, void* dst, int dims, const size_t sz[], const size_t srcofs[], const size_t srcstep[], - const size_t dststep[]) const = 0; + const size_t dststep[]) const; virtual void upload(UMatData* data, const void* src, int dims, const size_t sz[], const size_t dstofs[], const size_t dststep[], - const size_t srcstep[]) const = 0; + const size_t srcstep[]) const; virtual void copy(UMatData* srcdata, UMatData* dstdata, int dims, const size_t sz[], const size_t srcofs[], const size_t srcstep[], - const size_t dstofs[], const size_t dststep[], bool sync) const = 0; + const size_t dstofs[], const size_t dststep[], bool sync) const; }; @@ -335,8 +336,10 @@ protected: struct CV_EXPORTS UMatData { enum { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2, - DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24 }; + DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24, + USER_ALLOCATED=32 }; UMatData(const MatAllocator* allocator); + ~UMatData(); // provide atomic access to the structure void lock(); diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index 84f1cc4a6..8a0a6a6be 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -3131,14 +3131,6 @@ cols(1), allocator(0), u(0), offset(0), size(&rows) } -inline -UMat::~UMat() -{ - release(); - if( step.p != step.buf ) - fastFree(step.p); -} - inline UMat& UMat::operator = (const UMat& m) { diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index cb5d7e4cb..995c10b5b 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -48,147 +48,166 @@ namespace cv { +void MatAllocator::sync(UMatData*) const +{ +} + +void MatAllocator::map(UMatData*, int) const +{ +} + +void MatAllocator::unmap(UMatData* u) const +{ + if(u->urefcount == 0 && u->refcount == 0) + deallocate(u); +} + +void MatAllocator::download(UMatData* u, void* dstptr, + int dims, const size_t sz[], + const size_t srcofs[], const size_t srcstep[], + const size_t dststep[]) const +{ + if(!u) + return; + int isz[CV_MAX_DIM]; + uchar* srcptr = u->data; + for( int i = 0; i < dims; i++ ) + { + CV_Assert( sz[i] <= (size_t)INT_MAX ); + if( sz[i] == 0 ) + return; + if( srcofs ) + srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1); + isz[i] = (int)sz[i]; + } + + Mat src(dims, isz, CV_8U, srcptr, srcstep); + Mat dst(dims, isz, CV_8U, dstptr, dststep); + + const Mat* arrays[] = { &src, &dst }; + uchar* ptrs[2]; + NAryMatIterator it(arrays, ptrs, 2); + size_t j, planesz = it.size; + + for( j = 0; j < it.nplanes; j++, ++it ) + memcpy(ptrs[1], ptrs[0], planesz); +} + + +void MatAllocator::upload(UMatData* u, const void* srcptr, int dims, const size_t sz[], + const size_t dstofs[], const size_t dststep[], + const size_t srcstep[]) const +{ + if(!u) + return; + int isz[CV_MAX_DIM]; + uchar* dstptr = u->data; + for( int i = 0; i < dims; i++ ) + { + CV_Assert( sz[i] <= (size_t)INT_MAX ); + if( sz[i] == 0 ) + return; + if( dstofs ) + dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1); + isz[i] = (int)sz[i]; + } + + Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep); + Mat dst(dims, isz, CV_8U, dstptr, dststep); + + const Mat* arrays[] = { &src, &dst }; + uchar* ptrs[2]; + NAryMatIterator it(arrays, ptrs, 2); + size_t j, planesz = it.size; + + for( j = 0; j < it.nplanes; j++, ++it ) + memcpy(ptrs[1], ptrs[0], planesz); +} + +void MatAllocator::copy(UMatData* usrc, UMatData* udst, int dims, const size_t sz[], + const size_t srcofs[], const size_t srcstep[], + const size_t dstofs[], const size_t dststep[], bool sync) const +{ + if(!usrc || !udst) + return; + int isz[CV_MAX_DIM]; + uchar* srcptr = usrc->data; + uchar* dstptr = udst->data; + for( int i = 0; i < dims; i++ ) + { + CV_Assert( sz[i] <= (size_t)INT_MAX ); + if( sz[i] == 0 ) + return; + if( srcofs ) + srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1); + if( dstofs ) + dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1); + isz[i] = (int)sz[i]; + } + + Mat src(dims, isz, CV_8U, srcptr, srcstep); + Mat dst(dims, isz, CV_8U, dstptr, dststep); + + const Mat* arrays[] = { &src, &dst }; + uchar* ptrs[2]; + NAryMatIterator it(arrays, ptrs, 2); + size_t j, planesz = it.size; + + for( j = 0; j < it.nplanes; j++, ++it ) + memcpy(ptrs[1], ptrs[0], planesz); +} + class StdMatAllocator : public MatAllocator { public: - UMatData* allocate(int dims, const int* sizes, int type, size_t* step) const + UMatData* allocate(int dims, const int* sizes, int type, + void* data0, size_t* step, int /*flags*/) const { size_t total = CV_ELEM_SIZE(type); for( int i = dims-1; i >= 0; i-- ) { if( step ) - step[i] = total; + { + if( data0 && step[i] != CV_AUTOSTEP ) + { + CV_Assert(total <= step[i]); + total = step[i]; + } + else + step[i] = total; + } total *= sizes[i]; } - uchar* data = (uchar*)fastMalloc(total); + uchar* data = data0 ? (uchar*)data0 : (uchar*)fastMalloc(total); UMatData* u = new UMatData(this); u->data = u->origdata = data; u->size = total; - u->refcount = 1; + u->refcount = data0 == 0; + if(data0) + u->flags |= UMatData::USER_ALLOCATED; return u; } - bool allocate(UMatData* u, int accessFlags) const + bool allocate(UMatData* u, int /*accessFlags*/) const { if(!u) return false; - if(u->handle != 0) - return true; - return UMat::getStdAllocator()->allocate(u, accessFlags); + CV_XADD(&u->urefcount, 1); + return true; } void deallocate(UMatData* u) const { - if(u) + if(u && u->refcount == 0) { - fastFree(u->origdata); + if( !(u->flags & UMatData::USER_ALLOCATED) ) + { + fastFree(u->origdata); + u->origdata = 0; + } delete u; } } - - void map(UMatData*, int) const - { - } - - void unmap(UMatData* u) const - { - if(u->urefcount == 0) - deallocate(u); - } - - void download(UMatData* u, void* dstptr, - int dims, const size_t sz[], - const size_t srcofs[], const size_t srcstep[], - const size_t dststep[]) const - { - if(!u) - return; - int isz[CV_MAX_DIM]; - uchar* srcptr = u->data; - for( int i = 0; i < dims; i++ ) - { - CV_Assert( sz[i] <= (size_t)INT_MAX ); - if( sz[i] == 0 ) - return; - if( srcofs ) - srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1); - isz[i] = (int)sz[i]; - } - - Mat src(dims, isz, CV_8U, srcptr, srcstep); - Mat dst(dims, isz, CV_8U, dstptr, dststep); - - const Mat* arrays[] = { &src, &dst }; - uchar* ptrs[2]; - NAryMatIterator it(arrays, ptrs, 2); - size_t j, planesz = it.size; - - for( j = 0; j < it.nplanes; j++, ++it ) - memcpy(ptrs[1], ptrs[0], planesz); - } - - void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[], - const size_t dstofs[], const size_t dststep[], - const size_t srcstep[]) const - { - if(!u) - return; - int isz[CV_MAX_DIM]; - uchar* dstptr = u->data; - for( int i = 0; i < dims; i++ ) - { - CV_Assert( sz[i] <= (size_t)INT_MAX ); - if( sz[i] == 0 ) - return; - if( dstofs ) - dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1); - isz[i] = (int)sz[i]; - } - - Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep); - Mat dst(dims, isz, CV_8U, dstptr, dststep); - - const Mat* arrays[] = { &src, &dst }; - uchar* ptrs[2]; - NAryMatIterator it(arrays, ptrs, 2); - size_t j, planesz = it.size; - - for( j = 0; j < it.nplanes; j++, ++it ) - memcpy(ptrs[1], ptrs[0], planesz); - } - - void copy(UMatData* usrc, UMatData* udst, int dims, const size_t sz[], - const size_t srcofs[], const size_t srcstep[], - const size_t dstofs[], const size_t dststep[], bool) const - { - if(!usrc || !udst) - return; - int isz[CV_MAX_DIM]; - uchar* srcptr = usrc->data; - uchar* dstptr = udst->data; - for( int i = 0; i < dims; i++ ) - { - CV_Assert( sz[i] <= (size_t)INT_MAX ); - if( sz[i] == 0 ) - return; - if( srcofs ) - srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1); - if( dstofs ) - dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1); - isz[i] = (int)sz[i]; - } - - Mat src(dims, isz, CV_8U, srcptr, srcstep); - Mat dst(dims, isz, CV_8U, dstptr, dststep); - - const Mat* arrays[] = { &src, &dst }; - uchar* ptrs[2]; - NAryMatIterator it(arrays, ptrs, 2); - size_t j, planesz = it.size; - - for( j = 0; j < it.nplanes; j++, ++it ) - memcpy(ptrs[1], ptrs[0], planesz); - } }; @@ -364,13 +383,13 @@ void Mat::create(int d, const int* _sizes, int _type) a = a0; try { - u = a->allocate(dims, size, _type, step.p); + u = a->allocate(dims, size, _type, 0, step.p, 0); CV_Assert(u != 0); } catch(...) { if(a != a0) - u = a0->allocate(dims, size, _type, step.p); + u = a0->allocate(dims, size, _type, 0, step.p, 0); CV_Assert(u != 0); } CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) ); diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 1d4c41939..22d802243 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -612,7 +612,7 @@ static void* initOpenCLAndLoad(const char* funcname) return 0; } - return funcname ? dlsym(handle, funcname) : 0; + return funcname && handle ? dlsym(handle, funcname) : 0; } #elif defined WIN32 || defined _WIN32 @@ -2002,7 +2002,7 @@ void* Queue::ptr() const Queue& Queue::getDefault() { Queue& q = TLSData::get()->oclQueue; - if( !q.p ) + if( !q.p && haveOpenCL() ) q.create(Context2::getDefault()); return q; } @@ -2251,22 +2251,32 @@ int Kernel::set(int i, const KernelArg& arg) } -bool Kernel::run(int dims, size_t globalsize[], size_t localsize[], +bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], bool sync, const Queue& q) { if(!p || !p->handle || p->e != 0) return false; - AutoBuffer _globalSize(dims); - size_t * globalSizePtr = (size_t *)_globalSize; - for (int i = 0; i < dims; ++i) - globalSizePtr[i] = localsize == NULL ? globalsize[i] : - ((globalsize[i] + localsize[i] - 1) / localsize[i]) * localsize[i]; - cl_command_queue qq = getQueue(q); - size_t offset[CV_MAX_DIM] = {0}; + size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1}, localsize[CV_MAX_DIM] = {1,1,1}; + size_t total = 1; + for (int i = 0; i < dims; i++) + { + size_t val0 = _localsize ? _localsize[i] : + dims == 1 ? 64 : dims == 2 ? 16>>i : dims == 3 ? 8>>(i>0) : 1; + size_t val = 1; + while( val*2 < val0 ) + val *= 2; + if( _localsize ) + localsize[i] = val; + CV_Assert(_globalsize && _globalsize[i] >= 0); + total *= _globalsize[i]; + globalsize[i] = ((_globalsize[i] + val - 1)/val)*val; + } + if( total == 0 ) + return true; cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims, - offset, globalSizePtr, localsize, 0, 0, + offset, globalsize, _localsize ? localsize : 0, 0, 0, sync ? 0 : &p->e); if( sync || retval < 0 ) { @@ -2361,14 +2371,23 @@ struct Program::Impl retval = clBuildProgram(handle, n, (const cl_device_id*)deviceList, buildflags.c_str(), 0, 0); - if( retval == CL_BUILD_PROGRAM_FAILURE ) + if( retval < 0 ) { - char buf[1<<16]; size_t retsz = 0; - clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG, - sizeof(buf)-16, buf, &retsz); - errmsg = String(buf); - CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str())); + retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], + CL_PROGRAM_BUILD_LOG, 0, 0, &retsz); + if( retval >= 0 && retsz > 0 ) + { + AutoBuffer bufbuf(retsz + 16); + char* buf = bufbuf; + retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], + CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz); + if( retval >= 0 ) + { + errmsg = String(buf); + CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str())); + } + } } CV_Assert(retval >= 0); } @@ -2608,17 +2627,17 @@ ProgramSource2::hash_t ProgramSource2::hash() const class OpenCLAllocator : public MatAllocator { public: - OpenCLAllocator() {} + OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); } - UMatData* defaultAllocate(int dims, const int* sizes, int type, size_t* step) const + UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step, int flags) const { - UMatData* u = Mat::getStdAllocator()->allocate(dims, sizes, type, step); + UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags); u->urefcount = 1; u->refcount = 0; return u; } - void getBestFlags(const Context2& ctx, int& createFlags, int& flags0) const + void getBestFlags(const Context2& ctx, int /*flags*/, int& createFlags, int& flags0) const { const Device& dev = ctx.device(0); createFlags = CL_MEM_READ_WRITE; @@ -2629,10 +2648,12 @@ public: flags0 = UMatData::COPY_ON_MAP; } - UMatData* allocate(int dims, const int* sizes, int type, size_t* step) const + UMatData* allocate(int dims, const int* sizes, int type, + void* data, size_t* step, int flags) const { if(!useOpenCL()) - return defaultAllocate(dims, sizes, type, step); + return defaultAllocate(dims, sizes, type, data, step, flags); + CV_Assert(data == 0); size_t total = CV_ELEM_SIZE(type); for( int i = dims-1; i >= 0; i-- ) { @@ -2643,13 +2664,13 @@ public: Context2& ctx = Context2::getDefault(); int createFlags = 0, flags0 = 0; - getBestFlags(ctx, createFlags, flags0); + getBestFlags(ctx, flags, createFlags, flags0); cl_int retval = 0; void* handle = clCreateBuffer((cl_context)ctx.ptr(), createFlags, total, 0, &retval); if( !handle || retval < 0 ) - return defaultAllocate(dims, sizes, type, step); + return defaultAllocate(dims, sizes, type, data, step, flags); UMatData* u = new UMatData(this); u->data = 0; u->size = total; @@ -2672,7 +2693,7 @@ public: CV_Assert(u->origdata != 0); Context2& ctx = Context2::getDefault(); int createFlags = 0, flags0 = 0; - getBestFlags(ctx, createFlags, flags0); + getBestFlags(ctx, accessFlags, createFlags, flags0); cl_context ctx_handle = (cl_context)ctx.ptr(); cl_int retval = 0; @@ -2697,19 +2718,41 @@ public: return true; } + void sync(UMatData* u) const + { + cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); + clFinish(q); + + if( u->hostCopyObsolete() && u->handle && + u->tempCopiedUMat() && u->refcount > 0 && u->origdata) + { + UMatDataAutoLock lock(u); + clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, u->origdata, 0, 0, 0); + u->markHostCopyObsolete(false); + } + else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data ) + { + UMatDataAutoLock lock(u); + clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, u->data, 0, 0, 0); + } + } + void deallocate(UMatData* u) const { if(!u) return; + UMatDataAutoLock lock(u); // TODO: !!! when we add Shared Virtual Memory Support, - // this function (as well as the others should be corrected) + // this function (as well as the others) should be corrected CV_Assert(u->handle != 0 && u->urefcount == 0); if(u->tempUMat()) { if( u->hostCopyObsolete() && u->refcount > 0 && u->tempCopiedUMat() ) { - clEnqueueWriteBuffer((cl_command_queue)Queue::getDefault().ptr(), + clEnqueueReadBuffer((cl_command_queue)Queue::getDefault().ptr(), (cl_mem)u->handle, CL_TRUE, 0, u->size, u->origdata, 0, 0, 0); } @@ -2717,7 +2760,7 @@ public: clReleaseMemObject((cl_mem)u->handle); u->handle = 0; u->currAllocator = u->prevAllocator; - if(u->data && u->copyOnMap()) + if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED)) fastFree(u->data); u->data = u->origdata; if(u->refcount == 0) @@ -2725,8 +2768,11 @@ public: } else { - if(u->data && u->copyOnMap()) + if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED)) + { fastFree(u->data); + u->data = 0; + } clReleaseMemObject((cl_mem)u->handle); u->handle = 0; delete u; @@ -2793,15 +2839,18 @@ public: UMatDataAutoLock autolock(u); cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); + cl_int retval = 0; if( !u->copyOnMap() && u->data ) { - CV_Assert( clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0) >= 0 ); + CV_Assert( (retval = clEnqueueUnmapMemObject(q, + (cl_mem)u->handle, u->data, 0, 0, 0)) >= 0 ); + clFinish(q); u->data = 0; } else if( u->copyOnMap() && u->deviceCopyObsolete() ) { - CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, - u->size, u->data, 0, 0, 0) >= 0 ); + CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, u->data, 0, 0, 0)) >= 0 ); } u->markDeviceCopyObsolete(false); u->markHostCopyObsolete(false); @@ -3033,6 +3082,8 @@ public: if( sync ) clFinish(q); } + + MatAllocator* matStdAllocator; }; MatAllocator* getOpenCLAllocator() diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp index 2b659fb0a..33c193d2e 100644 --- a/modules/core/src/umatrix.cpp +++ b/modules/core/src/umatrix.cpp @@ -62,6 +62,17 @@ UMatData::UMatData(const MatAllocator* allocator) userdata = 0; } +UMatData::~UMatData() +{ + prevAllocator = currAllocator = 0; + urefcount = refcount = 0; + data = origdata = 0; + size = 0; + flags = 0; + handle = 0; + userdata = 0; +} + void UMatData::lock() { umatLocks[(size_t)(void*)this % UMAT_NLOCKS].lock(); @@ -75,7 +86,9 @@ void UMatData::unlock() MatAllocator* UMat::getStdAllocator() { - return ocl::getOpenCLAllocator(); + if( ocl::haveOpenCL() ) + return ocl::getOpenCLAllocator(); + return Mat::getStdAllocator(); } void swap( UMat& a, UMat& b ) @@ -195,13 +208,21 @@ static void finalizeHdr(UMat& m) UMat Mat::getUMat(int accessFlags) const { UMat hdr; - if(!u) + if(!data) return hdr; - UMat::getStdAllocator()->allocate(u, accessFlags); + UMatData* temp_u = u; + if(!temp_u) + { + MatAllocator *a = allocator, *a0 = getStdAllocator(); + if(!a) + a = a0; + temp_u = a->allocate(dims, size.p, type(), data, step.p, accessFlags); + } + UMat::getStdAllocator()->allocate(temp_u, accessFlags); hdr.flags = flags; setSize(hdr, dims, size.p, step.p); finalizeHdr(hdr); - hdr.u = u; + hdr.u = temp_u; hdr.offset = data - datastart; return hdr; } @@ -237,13 +258,13 @@ void UMat::create(int d, const int* _sizes, int _type) a = a0; try { - u = a->allocate(dims, size, _type, step.p); + u = a->allocate(dims, size, _type, 0, step.p, 0); CV_Assert(u != 0); } catch(...) { if(a != a0) - u = a0->allocate(dims, size, _type, step.p); + u = a0->allocate(dims, size, _type, 0, step.p, 0); CV_Assert(u != 0); } CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) ); @@ -262,6 +283,16 @@ void UMat::copySize(const UMat& m) } } + +UMat::~UMat() +{ + if( u && u->refcount > 0 ) + u->currAllocator->sync(u); + release(); + if( step.p != step.buf ) + fastFree(step.p); +} + void UMat::deallocate() { u->currAllocator->deallocate(u); @@ -546,7 +577,7 @@ Mat UMat::getMat(int accessFlags) const { if(!u) return Mat(); - u->currAllocator->map(u, accessFlags); + u->currAllocator->map(u, accessFlags | ACCESS_READ); CV_Assert(u->data != 0); Mat hdr(dims, size.p, type(), u->data + offset, step.p); hdr.u = u; diff --git a/modules/core/test/test_umat.cpp b/modules/core/test/test_umat.cpp index d7efaa0a7..54df893bf 100644 --- a/modules/core/test/test_umat.cpp +++ b/modules/core/test/test_umat.cpp @@ -200,3 +200,40 @@ void CV_UMatTest::run( int /* start_from */) } TEST(Core_UMat, base) { CV_UMatTest test; test.safe_run(); } + +TEST(Core_UMat, simple) +{ + { + int a[3] = { 1, 2, 3 }; + Mat m = Mat(1, 1, CV_32SC3, a); + UMat u = m.getUMat(ACCESS_READ); + EXPECT_NE((void*)NULL, u.u); + } + + { + Mat m(10, 10, CV_8UC1), ref; + for (int y = 0; y < m.rows; ++y) + { + uchar * const ptr = m.ptr(y); + for (int x = 0; x < m.cols; ++x) + ptr[x] = x + y * 2; + } + + ref = m.clone(); + Rect r(1, 1, 8, 8); + ref(r).setTo(17); + + { + UMat um = m(r).getUMat(ACCESS_WRITE); + um.setTo(17); + } + + double err = norm(m, ref, NORM_INF); + if(err > 0) + { + std::cout << "m: " << m << std::endl; + std::cout << "ref: " << ref << std::endl; + } + EXPECT_EQ(err, 0.); + } +} diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index fb2627a53..4bc3ffb88 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2695,6 +2695,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) UMat src = _src.getUMat(), dst; Size sz = src.size(), dstSz = sz; int scn = src.channels(), depth = src.depth(), bidx; + int dims = 2, stripeSize = 32; size_t globalsize[] = { src.cols, src.rows }; ocl::Kernel k; @@ -2765,7 +2766,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2; dcn = 1; k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d", depth, scn, bidx)); + format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", + depth, scn, bidx, stripeSize)); + globalsize[0] = (src.cols + stripeSize-1)/stripeSize; break; } case COLOR_GRAY2BGR: @@ -3027,7 +3030,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(dstSz, CV_MAKETYPE(depth, dcn)); dst = _dst.getUMat(); k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)); - ok = k.run(2, globalsize, 0, false); + ok = k.run(dims, globalsize, 0, false); } return ok; } diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl index ca696290d..85c0ca6e3 100644 --- a/modules/imgproc/src/opencl/cvtcolor.cl +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -75,6 +75,10 @@ #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)" #endif +#ifndef STRIPE_SIZE +#define STRIPE_SIZE 1 +#endif + #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) enum @@ -105,6 +109,7 @@ __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, __global uchar* dstptr, int dststep, int dstoffset, int rows, int cols) { +#if 0 const int x = get_global_id(0); const int y = get_global_id(1); @@ -118,6 +123,25 @@ __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift); #endif } +#else + const int x0 = get_global_id(0)*STRIPE_SIZE; + const int x1 = min(x0 + STRIPE_SIZE, cols); + const int y = get_global_id(1); + + if( y < rows ) + { + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset)) + x0*scn; + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset)); + int x; + for( x = x0; x < x1; x++, src += scn ) +#ifdef DEPTH_5 + dst[x] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f; +#else + dst[x] = (DATA_TYPE)(mad24(src[bidx], B2Y, mad24(src[1], G2Y, + mad24(src[(bidx^2)], R2Y, 1 << (yuv_shift-1)))) >> yuv_shift); +#endif + } +#endif } __kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, diff --git a/modules/ocl/test/test_api.cpp b/modules/ocl/test/test_api.cpp index 36eb8bde1..a95e60e65 100644 --- a/modules/ocl/test/test_api.cpp +++ b/modules/ocl/test/test_api.cpp @@ -41,6 +41,7 @@ #include "test_precomp.hpp" #include "opencv2/ocl/cl_runtime/cl_runtime.hpp" // for OpenCL types: cl_mem +#include "opencv2/core/ocl.hpp" TEST(TestAPI, openCLExecuteKernelInterop) { @@ -78,3 +79,61 @@ TEST(TestAPI, openCLExecuteKernelInterop) EXPECT_LE(checkNorm(cpuMat, dst), 1e-3); } + +TEST(OCL_TestTAPI, performance) +{ + cv::RNG rng; +#if 1 + cv::Mat src(1280,768,CV_8UC4), dst; + rng.fill(src, RNG::UNIFORM, 0, 255); +#else + cv::Mat src = cv::imread("/Users/vp/work/opencv/samples/c/lena.jpg", 1), dst; +#endif + + cv::UMat usrc, udst; + src.copyTo(usrc); + + cv::ocl::oclMat osrc(src); + cv::ocl::oclMat odst; + + int cvtcode = cv::COLOR_BGR2GRAY; + int i, niters = 10; + double t; + + cv::ocl::cvtColor(osrc, odst, cvtcode); + cv::ocl::finish(); + t = (double)cv::getTickCount(); + for(i = 0; i < niters; i++) + { + cv::ocl::cvtColor(osrc, odst, cvtcode); + } + cv::ocl::finish(); + t = (double)cv::getTickCount() - t; + printf("ocl exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency()); + + cv::cvtColor(usrc, udst, cvtcode); + cv::ocl::finish2(); + t = (double)cv::getTickCount(); + for(i = 0; i < niters; i++) + { + cv::cvtColor(usrc, udst, cvtcode); + } + cv::ocl::finish2(); + t = (double)cv::getTickCount() - t; + printf("t-api exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency()); + + cv::cvtColor(src, dst, cvtcode); + t = (double)cv::getTickCount(); + for(i = 0; i < niters; i++) + { + cv::cvtColor(src, dst, cvtcode); + } + t = (double)cv::getTickCount() - t; + printf("cpu exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency()); + /*cv::imshow("result0", dst); + cv::imshow("result1", udst); + cv::waitKey(); + cv::destroyWindow("result0"); + cv::destroyWindow("result1");*/ +} + diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index 20b4128a2..734f121a3 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -195,8 +195,14 @@ public: return u; } - UMatData* allocate(int dims0, const int* sizes, int type, size_t* step) const + UMatData* allocate(int dims0, const int* sizes, int type, void* data, size_t* step, int flags) const { + if( data != 0 ) + { + CV_Error(Error::StsAssert, "The data should normally be NULL!"); + // probably this is safe to do in such extreme case + return stdAllocator->allocate(dims0, sizes, type, data, step, flags); + } PyEnsureGIL gil; int depth = CV_MAT_DEPTH(type); @@ -229,43 +235,11 @@ public: { PyEnsureGIL gil; PyObject* o = (PyObject*)u->userdata; - Py_DECREF(o); + Py_XDECREF(o); delete u; } } - void map(UMatData*, int) const - { - } - - void unmap(UMatData* u) const - { - if(u->urefcount == 0) - deallocate(u); - } - - void download(UMatData* u, void* dstptr, - int dims, const size_t sz[], - const size_t srcofs[], const size_t srcstep[], - const size_t dststep[]) const - { - stdAllocator->download(u, dstptr, dims, sz, srcofs, srcstep, dststep); - } - - void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[], - const size_t dstofs[], const size_t dststep[], - const size_t srcstep[]) const - { - stdAllocator->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep); - } - - void copy(UMatData* usrc, UMatData* udst, int dims, const size_t sz[], - const size_t srcofs[], const size_t srcstep[], - const size_t dstofs[], const size_t dststep[], bool sync) const - { - stdAllocator->copy(usrc, udst, dims, sz, srcofs, srcstep, dstofs, dststep, sync); - } - const MatAllocator* stdAllocator; };