extended ocl::convertTo

2013-09-11 13:35:39 +04:00
parent 6a21eca7e7
commit f20cc2bce8
4 changed files with 203 additions and 538 deletions
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@@ -382,40 +382,50 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 ///////////////////////////////////////////////////////////////////////////
 static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
 {
-    string kernelName = "convert_to_S";
-    stringstream idxStr;
-    idxStr << src.depth();
-    kernelName += idxStr.str();
+    string kernelName = "convert_to";
    float alpha_f = alpha, beta_f = beta;
+    int sdepth = src.depth(), ddepth = dst.depth();
+    int sstep1 = (int)src.step1(), dstep1 = (int)dst.step1();
+    int cols1 = src.cols * src.oclchannels();
+
+    char buildOptions[150], convertString[50];
+    const char * typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    sprintf(convertString, "convert_%s_sat_rte", typeMap[ddepth]);
+    sprintf(buildOptions, "-D srcT=%s -D dstT=%s -D convertToDstType=%s", typeMap[sdepth],
+            typeMap[ddepth], CV_32F == ddepth || ddepth == CV_64F ? "" : convertString);
+
    CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
    vector<pair<size_t , const void *> > args;
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3];
-    globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = 1;
-    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
-    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
-    if(dst.type() == CV_8UC1)
-    {
-        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
-    }
+
+    size_t localThreads[3] = { 16, 16, 1 };
+    size_t globalThreads[3] = { divUp(cols1, localThreads[0]) * localThreads[0],
+                                divUp(dst.rows, localThreads[1]) * localThreads[1], 1 };
+
+    int doffset1 = dst.offset / dst.elemSize1();
+    int soffset1 = src.offset / src.elemSize1();
+
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&cols1 ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
-    args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&sstep1 ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&soffset1 ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&dstep1 ));
+    args.push_back( make_pair( sizeof(cl_int) , (void *)&doffset1 ));
    args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f ));
    args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f ));
+
    openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
-                        localThreads, args, dst.oclchannels(), dst.depth());
+                        localThreads, args, -1, -1, buildOptions);
 }
 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
 {
-    //cout << "cv::ocl::oclMat::convertTo()" << endl;
+    if (!clCxt->supportsFeature(Context::CL_DOUBLE) &&
+            (depth() == CV_64F || dst.depth() == CV_64F))
+    {
+        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        return;
+    }

    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
                   && fabs(beta) < std::numeric_limits<double>::epsilon();
@@ -425,7 +435,6 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
    else
        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());

-    //int scn = channels();
    int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
    if( sdepth == ddepth && noScale )
    {
@@ -447,7 +456,6 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be
 ///////////////////////////////////////////////////////////////////////////
 oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
 {
-    //cout << "cv::ocl::oclMat::=" << endl;
    setTo(s);
    return *this;
 }
--- a/modules/ocl/src/opencl/operator_convertTo.cl
+++ b/modules/ocl/src/opencl/operator_convertTo.cl
@@ -33,352 +33,28 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //
-#define F float
-#define F2 float2
-#define F4 float4
-__kernel void convert_to_S4_C1_D0(
-        __global const int* restrict srcMat,
-        __global uchar* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0)<<2;
-        int y=get_global_id(1);
-        //int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
-        //int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
-        int off_src = (dstoffset_in_pixel & 3);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
-        int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
-        int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
-        if(x+3<cols && y<rows && off_src==0)
-        {
-            float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
-            *(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-        }
-        else
-        {
-            if(x+3<cols && y<rows)
-            {
-                float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-                dstMat[dstidx+3] = temp_dst.w;
-            }
-            else if(x+2<cols && y<rows)
-            {
-                float4 temp_src = convert_float4(vload4(0,srcMat+srcidx));
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-            }
-            else if(x+1<cols && y<rows)
-            {
-                float2 temp_src = convert_float2(vload2(0,srcMat+srcidx));
-                uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-            }
-            else if(x<cols && y<rows)
-            {
-                dstMat[dstidx] = convert_uchar_sat(convert_float(srcMat[srcidx])*alpha+beta);;
-            }
-        }
-}

-__kernel void convert_to_S4_C4_D0(
-        __global const int4* restrict srcMat,
-        __global uchar4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
-        }
-}
+#ifdef DOUBLE_SUPPORT
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif

-__kernel void convert_to_S5_C1_D0(
-        __global const float* restrict srcMat,
-        __global uchar* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
+__kernel void convert_to(
+        __global const srcT* restrict srcMat,
+        __global dstT* dstMat,
+        int cols1, int rows,
+        int sstep1, int soffset1,
+        int dstep1, int doffset1,
+        float alpha, float beta)
 {
-        int x=get_global_id(0)<<2;
-        int y=get_global_id(1);
-        //int src_addr_start = mad24(y,srcStep_in_pixel,srcoffset_in_pixel);
-        //int src_addr_end = mad24(y,srcStep_in_pixel,cols+srcoffset_in_pixel);
-        int off_src = (dstoffset_in_pixel & 3);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel - off_src);
-        int dst_addr_start = mad24(y,dstStep_in_pixel,dstoffset_in_pixel);
-        int dst_addr_end = mad24(y,dstStep_in_pixel,cols+dstoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel & (int)0xfffffffc);
-        if(x+3<cols && y<rows && off_src==0)
-        {
-            float4 temp_src = vload4(0,srcMat+srcidx);
-            *(__global uchar4*)(dstMat+dstidx) = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-        }
-        else
-        {
-            if(x+3<cols && y<rows)
-            {
-                float4 temp_src = vload4(0,srcMat+srcidx);
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-                dstMat[dstidx+3] = temp_dst.w;
-            }
-            else if(x+2<cols && y<rows)
-            {
-                float4 temp_src = vload4(0,srcMat+srcidx);
-                uchar4 temp_dst = convert_uchar4_sat(temp_src*(F4)alpha+(F4)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-                dstMat[dstidx+2] = temp_dst.z;
-            }
-            else if(x+1<cols && y<rows)
-            {
-                float2 temp_src = vload2(0,srcMat+srcidx);
-                uchar2 temp_dst = convert_uchar2_sat(temp_src*(F2)alpha+(F2)beta);
-                dstMat[dstidx] = temp_dst.x;
-                dstMat[dstidx+1] = temp_dst.y;
-            }
-            else if(x<cols && y<rows)
-            {
-                dstMat[dstidx] = convert_uchar_sat(srcMat[srcidx]*alpha+beta);;
-            }
-        }
-}
-__kernel void convert_to_S5_C4_D0(
-        __global const float4* restrict srcMat,
-        __global uchar4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = srcMat[srcidx];
-            dstMat[dstidx] = convert_uchar4_sat(temp_src*alpha+beta);
-        }
-}
+        int x = get_global_id(0);
+        int y = get_global_id(1);

-__kernel void convert_to_S0_C1_D4(
-        __global const uchar* restrict srcMat,
-        __global int* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
+        int srcidx = mad24(y, sstep1, x + soffset1);
+        int dstidx = mad24(y, dstep1, x + doffset1);
+
+        if ( (x < cols1) && (y < rows) )
        {
            float temp_src = convert_float(srcMat[srcidx]);
-            dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S5_C1_D4(
-        __global const float* restrict srcMat,
-        __global int* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float temp_src = srcMat[srcidx];
-            dstMat[dstidx] = convert_int_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S0_C4_D4(
-        __global const uchar4* restrict srcMat,
-        __global int4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S5_C4_D4(
-        __global const float4* restrict srcMat,
-        __global int4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = srcMat[srcidx];
-            dstMat[dstidx] = convert_int4_sat(temp_src*alpha+beta);
-        }
-}
-
-__kernel void convert_to_S0_C1_D5(
-        __global const uchar* restrict srcMat,
-        __global float* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float temp_src = convert_float(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
-        }
-}
-
-__kernel void convert_to_S4_C1_D5(
-        __global const int* restrict srcMat,
-        __global float* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float temp_src = convert_float(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
-        }
-}
-
-__kernel void convert_to_S0_C4_D5(
-        __global const uchar4* restrict srcMat,
-        __global float4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
-        }
-}
-
-__kernel void convert_to_S4_C4_D5(
-        __global const int4* restrict srcMat,
-        __global float4* dstMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        F alpha,
-        F beta)
-{
-        int x=get_global_id(0);
-        int y=get_global_id(1);
-        int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-        int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-        if ( (x < cols) & (y < rows) )
-        {
-            float4 temp_src = convert_float4(srcMat[srcidx]);
-            dstMat[dstidx] = temp_src*alpha+beta;
+            dstMat[dstidx] = convertToDstType(temp_src*alpha+beta);
        }
 }