Added GPU 1D window sum and convertTo, based on NPP.

Added RGB <-> XYZ color conversion. Minor fix to GPU morphology.
2010-09-29 09:07:53 +00:00
parent 12b7f3a0d0
commit fb7aa43feb
9 changed files with 567 additions and 500 deletions
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@@ -187,8 +187,7 @@ namespace cv { namespace gpu { namespace improc

 namespace imgproc
 {
-    template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {};
-    
+    template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {};    
    template <int DSTCN> struct RGB5x52RGBConverter<5, DSTCN>
    {
        typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
@@ -239,7 +238,6 @@ namespace imgproc
    }

    template <int SRCCN, int GREEN_BITS> struct RGB2RGB5x5Converter {};
-
    template<int SRCCN> struct RGB2RGB5x5Converter<SRCCN, 6> 
    {
        static __device__ unsigned short cvt(const uchar* src_ptr, int bidx)
@@ -258,7 +256,7 @@ namespace imgproc
    {
        static __device__ unsigned short cvt(const uchar* src_ptr, int bidx)
        {
-            return (unsigned short)((src_ptr[bidx] >> 3) | ((src_ptr[1] & ~7) << 2) | ((src_ptr[bidx^2] & ~7) << 7)|(src_ptr[3] ? 0x8000 : 0));
+            return (unsigned short)((src_ptr[bidx] >> 3) | ((src_ptr[1] & ~7) << 2) | ((src_ptr[bidx^2] & ~7) << 7) | (src_ptr[3] ? 0x8000 : 0));
        }
    };    

@@ -343,7 +341,7 @@ namespace cv { namespace gpu { namespace improc
 namespace imgproc
 {
    template <int DSTCN, typename T>
-    __global__ void Gray2RGB(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols)
+    __global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
    {
        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;

@@ -352,18 +350,17 @@ namespace imgproc

        if (y < rows && x < cols)
        {
-            T src = src_[y * src_step + x];
+            T src = *(const T*)(src_ + y * src_step + x * sizeof(T));
            dst_t dst;
            dst.x = src;
            dst.y = src;
            dst.z = src;
            setAlpha(dst, ColorChannel<T>::max());
-            *(dst_t*)(dst_ + y * dst_step + x * DSTCN) = dst;
+            *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
        }
    }

    template <int GREEN_BITS> struct Gray2RGB5x5Converter {};
-
    template<> struct Gray2RGB5x5Converter<6> 
    {
        static __device__ unsigned short cvt(unsigned int t)
@@ -378,7 +375,7 @@ namespace imgproc
            t >>= 3;
            return (unsigned short)(t | (t << 5) | (t << 10));
        }
-    };   
+    };

    template<int GREEN_BITS>
    __global__ void Gray2RGB5x5(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
@@ -398,7 +395,7 @@ namespace imgproc
 namespace cv { namespace gpu { namespace improc
 {
    template <typename T, int DSTCN>
-    void Gray2RGB_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
+    void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);
@@ -406,14 +403,14 @@ namespace cv { namespace gpu { namespace improc
        grid.x = divUp(src.cols, threads.x);
        grid.y = divUp(src.rows, threads.y);

-        imgproc::Gray2RGB<DSTCN><<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), 
-            dst.ptr, dst.step / sizeof(T), src.rows, src.cols);
+        imgproc::Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, 
+            dst.ptr, dst.step, src.rows, src.cols);

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
    }

-    void Gray2RGB_gpu(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
+    void Gray2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
    {
        typedef void (*Gray2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<uchar, 3>, Gray2RGB_caller<uchar, 4>};
@@ -421,17 +418,17 @@ namespace cv { namespace gpu { namespace improc
        Gray2RGB_callers[dstcn - 3](src, dst, stream);
    }

-    void Gray2RGB_gpu(const DevMem2D_<unsigned short>& src, const DevMem2D_<unsigned short>& dst, int dstcn, cudaStream_t stream)
+    void Gray2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
    {
-        typedef void (*Gray2RGB_caller_t)(const DevMem2D_<unsigned short>& src, const DevMem2D_<unsigned short>& dst, cudaStream_t stream);
+        typedef void (*Gray2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<unsigned short, 3>, Gray2RGB_caller<unsigned short, 4>};

        Gray2RGB_callers[dstcn - 3](src, dst, stream);
    }

-    void Gray2RGB_gpu(const DevMem2Df& src, const DevMem2Df& dst, int dstcn, cudaStream_t stream)
+    void Gray2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
    {
-        typedef void (*Gray2RGB_caller_t)(const DevMem2Df& src, const DevMem2Df& dst, cudaStream_t stream);
+        typedef void (*Gray2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<float, 3>, Gray2RGB_caller<float, 4>};

        Gray2RGB_callers[dstcn - 3](src, dst, stream);
@@ -484,7 +481,6 @@ namespace imgproc
    };

    template <int GREEN_BITS> struct RGB5x52GrayConverter {};
-
    template<> struct RGB5x52GrayConverter<6> 
    {
        static __device__ unsigned char cvt(unsigned int t)
@@ -514,200 +510,46 @@ namespace imgproc
        }
    }

-    __global__ void RGB2Gray_3(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
+    template <typename T> struct RGB2GrayConvertor 
    {
-		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 2;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        static __device__ T cvt(const T* src, int bidx)
+        {
+            return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);
+        }
+    };
+    template <> struct RGB2GrayConvertor<float> 
+    {
+        static __device__ float cvt(const float* src, int bidx)
+        {
+            const float cr = 0.299f;
+            const float cg = 0.587f;
+            const float cb = 0.114f;
+
+            return src[bidx] * cb + src[1] * cg + src[bidx^2] * cr;
+        }
+    };
+
+    template <int SRCCN, typename T>
+    __global__ void RGB2Gray(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
+    {
+        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
+
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
-            const uchar* src = src_ + y * src_step + x * 3;
+            src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));

-            uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-
-            uchar4 dst;
-            dst.x = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src += 3;
-            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-            dst.y = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src += 3;
-            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-            dst.z = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src += 3;
-            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-            dst.w = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            *(uchar4*)(dst_ + y * dst_step + x) = dst;
+            *(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt((const T*)(&src), bidx);
        }
-    }
-
-    __global__ void RGB2Gray_3(const unsigned short* src_, size_t src_step, unsigned short* dst_, size_t dst_step, int rows, int cols, int bidx)
-    {
-		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 1;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows && x < cols)
-        {
-            const unsigned short* src = src_ + y * src_step + x * 3;
-
-            unsigned short t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-
-            ushort2 dst;
-            dst.x = (unsigned short)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src += 3;
-            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-            dst.y = (unsigned short)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            *(ushort2*)(dst_ + y * dst_step + x) = dst;
-        }
-    }
-
-    __global__ void RGB2Gray_3(const float* src_, size_t src_step, float* dst_, size_t dst_step, int rows, int cols, int bidx)
-    {
-        const float cr = 0.299f;
-        const float cg = 0.587f;
-        const float cb = 0.114f;
-
-		const int x = blockDim.x * blockIdx.x + threadIdx.x;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows && x < cols)
-        {
-            const float* src = src_ + y * src_step + x * 3;
-
-            float t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-            *(dst_ + y * dst_step + x) = t0 * cb + t1 * cg + t2 * cr;
-        }
-    }
-
-    __global__ void RGB2Gray_4(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
-    {
-		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 2;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows && x < cols)
-        {
-            uchar4 src = *(uchar4*)(src_ + y * src_step + (x << 2));
-
-            uchar t0 = ((uchar*)(&src))[bidx], t1 = src.y, t2 = ((uchar*)(&src))[bidx ^ 2];
-
-            uchar4 dst;
-            dst.x = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src = *(uchar4*)(src_ + y * src_step + (x << 2) + 4);
-            t0 = ((uchar*)(&src))[bidx], t1 = src.y, t2 = ((uchar*)(&src))[bidx ^ 2];
-            dst.y = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src = *(uchar4*)(src_ + y * src_step + (x << 2) + 8);
-            t0 = ((uchar*)(&src))[bidx], t1 = src.y, t2 = ((uchar*)(&src))[bidx ^ 2];
-            dst.z = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src = *(uchar4*)(src_ + y * src_step + (x << 2) + 12);
-            t0 = ((uchar*)(&src))[bidx], t1 = src.y, t2 = ((uchar*)(&src))[bidx ^ 2];
-            dst.w = (uchar)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            *(uchar4*)(dst_ + y * dst_step + x) = dst;
-        }
-    }
-
-    __global__ void RGB2Gray_4(const unsigned short* src_, size_t src_step, unsigned short* dst_, size_t dst_step, int rows, int cols, int bidx)
-    {
-		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 1;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows && x < cols)
-        {
-            ushort4 src = *(ushort4*)(src_ + y * src_step + (x << 2));
-
-            unsigned short t0 = ((unsigned short*)(&src))[bidx], t1 = src.y, t2 = ((unsigned short*)(&src))[bidx ^ 2];
-
-            ushort2 dst;
-            dst.x = (unsigned short)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            src = *(ushort4*)(src_ + y * src_step + (x << 2) + 4);
-            t0 = ((unsigned short*)(&src))[bidx], t1 = src.y, t2 = ((unsigned short*)(&src))[bidx ^ 2];
-            dst.y = (unsigned short)CV_DESCALE((unsigned)(t0 * B2Y + t1 * G2Y + t2 * R2Y), yuv_shift);
-
-            *(ushort2*)(dst_ + y * dst_step + x) = dst;
-        }
-    }
-
-    __global__ void RGB2Gray_4(const float* src_, size_t src_step, float* dst_, size_t dst_step, int rows, int cols, int bidx)
-    {
-        const float cr = 0.299f;
-        const float cg = 0.587f;
-        const float cb = 0.114f;
-
-		const int x = blockDim.x * blockIdx.x + threadIdx.x;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows && x < cols)
-        {
-            float4 src = *(float4*)(src_ + y * src_step + (x << 2));
-
-            float t0 = ((float*)(&src))[bidx], t1 = src.y, t2 = ((float*)(&src))[bidx ^ 2];
-            *(dst_ + y * dst_step + x) = t0 * cb + t1 * cg + t2 * cr;
-        }
-    }
+    }   
 }

 namespace cv { namespace gpu { namespace improc
 {
-    void RGB2Gray_gpu(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        dim3 threads(32, 8, 1);
-        dim3 grid(1, 1, 1);
-
-        grid.x = divUp(src.cols, threads.x << 2);
-        grid.y = divUp(src.rows, threads.y);
-
-        switch (srccn)
-        {
-        case 3:
-            imgproc::RGB2Gray_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(uchar), dst.ptr, dst.step / sizeof(uchar), src.rows, src.cols, bidx);
-            break;
-        case 4:
-            imgproc::RGB2Gray_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(uchar), dst.ptr, dst.step / sizeof(uchar), src.rows, src.cols, bidx);
-            break;
-        default:
-            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
-            break;
-        }
-
-        if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
-    }
-
-    void RGB2Gray_gpu(const DevMem2D_<unsigned short>& src, int srccn, const DevMem2D_<unsigned short>& dst, int bidx, cudaStream_t stream)
-    {
-        dim3 threads(32, 8, 1);
-        dim3 grid(1, 1, 1);
-
-        grid.x = divUp(src.cols, threads.x << 1);
-        grid.y = divUp(src.rows, threads.y);
-
-        switch (srccn)
-        {
-        case 3:
-            imgproc::RGB2Gray_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(unsigned short), dst.ptr, dst.step / sizeof(unsigned short), src.rows, src.cols, bidx);
-            break;
-        case 4:
-            imgproc::RGB2Gray_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(unsigned short), dst.ptr, dst.step / sizeof(unsigned short), src.rows, src.cols, bidx);
-            break;
-        default:
-            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
-            break;
-        }
-
-        if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
-    }
-
-    void RGB2Gray_gpu(const DevMem2Df& src, int srccn, const DevMem2Df& dst, int bidx, cudaStream_t stream)
+    template <typename T, int SRCCN>
+    void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);
@@ -715,23 +557,37 @@ namespace cv { namespace gpu { namespace improc
        grid.x = divUp(src.cols, threads.x);
        grid.y = divUp(src.rows, threads.y);

-        switch (srccn)
-        {
-        case 3:
-            imgproc::RGB2Gray_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(float), dst.ptr, dst.step / sizeof(float), src.rows, src.cols, bidx);
-            break;
-        case 4:
-            imgproc::RGB2Gray_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(float), dst.ptr, dst.step / sizeof(float), src.rows, src.cols, bidx);
-            break;
-        default:
-            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
-            break;
-        }
+        imgproc::RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, 
+            dst.ptr, dst.step, src.rows, src.cols, bidx);

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
    }

+    void RGB2Gray_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
+    {
+        typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
+        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<unsigned char, 3>, RGB2Gray_caller<unsigned char, 4>};
+
+        RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);
+    }
+
+    void RGB2Gray_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
+    {
+        typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
+        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<unsigned short, 3>, RGB2Gray_caller<unsigned short, 4>};
+
+        RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);
+    }
+
+    void RGB2Gray_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
+    {
+        typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
+        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<float, 3>, RGB2Gray_caller<float, 4>};
+
+        RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);
+    }    
+
    template <int GREEN_BITS>
    void RGB5x52Gray_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
    {
@@ -784,7 +640,6 @@ namespace imgproc
            dst.z = saturate_cast<T>(Cb);
        }
    };
-
    template<> struct RGB2YCrCbConverter<float>
    {
        typedef typename TypeVec<float, 3>::vec_t dst_t;
@@ -832,7 +687,6 @@ namespace imgproc
            dst[bidx^2] = saturate_cast<T>(r);
        }
    };
-
    template <> struct YCrCb2RGBConvertor<float>
    {
        typedef typename TypeVec<float, 3>::vec_t src_t;
@@ -982,185 +836,194 @@ namespace cv { namespace gpu { namespace improc

 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////

-//namespace imgproc
-//{
-//    static const float sRGB2XYZ_D65[] =
-//    {
-//        0.412453f, 0.357580f, 0.180423f,
-//        0.212671f, 0.715160f, 0.072169f,
-//        0.019334f, 0.119193f, 0.950227f
-//    };
-//
-//    static const float XYZ2sRGB_D65[] =
-//    {
-//        3.240479f, -1.53715f, -0.498535f,
-//        -0.969256f, 1.875991f, 0.041556f,
-//        0.055648f, -0.204043f, 1.057311f
-//    };
-//
-//    template<typename _Tp> struct RGB2XYZ_f
-//    {
-//        typedef _Tp channel_type;
-//
-//        RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
-//        {
-//            memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
-//            if(blueIdx == 0)
-//            {
-//                std::swap(coeffs[0], coeffs[2]);
-//                std::swap(coeffs[3], coeffs[5]);
-//                std::swap(coeffs[6], coeffs[8]);
-//            }
-//        }
-//        void operator()(const _Tp* src, _Tp* dst, int n) const
-//        {
-//            int scn = srccn;
-//            float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
-//                  C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
-//                  C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-//
-//            n *= 3;
-//            for(int i = 0; i < n; i += 3, src += scn)
-//            {
-//			    _Tp X = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
-//			    _Tp Y = saturate_cast<_Tp>(src[0]*C3 + src[1]*C4 + src[2]*C5);
-//			    _Tp Z = saturate_cast<_Tp>(src[0]*C6 + src[1]*C7 + src[2]*C8);
-//                dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
-//            }
-//        }
-//        int srccn;
-//        float coeffs[9];
-//    };
-//
-//    template<typename _Tp> struct RGB2XYZ_i
-//    {
-//        typedef _Tp channel_type;
-//
-//        RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
-//        {
-//            static const int coeffs0[] =
-//            {
-//                1689,    1465,    739,
-//                871,     2929,    296,
-//                79,      488,     3892
-//            };
-//            for( int i = 0; i < 9; i++ )
-//                coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
-//            if(blueIdx == 0)
-//            {
-//                std::swap(coeffs[0], coeffs[2]);
-//                std::swap(coeffs[3], coeffs[5]);
-//                std::swap(coeffs[6], coeffs[8]);
-//            }
-//        }
-//        void operator()(const _Tp* src, _Tp* dst, int n) const
-//        {
-//            int scn = srccn;
-//            int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
-//                C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
-//                C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-//            n *= 3;
-//            for(int i = 0; i < n; i += 3, src += scn)
-//            {
-//                int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
-//                int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
-//                int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
-//                dst[i] = saturate_cast<_Tp>(X); dst[i+1] = saturate_cast<_Tp>(Y);
-//                dst[i+2] = saturate_cast<_Tp>(Z);
-//            }
-//        }
-//        int srccn;
-//        int coeffs[9];
-//    };
-//
-//    template<typename _Tp> struct XYZ2RGB_f
-//    {
-//        typedef _Tp channel_type;
-//
-//        XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
-//        : dstcn(_dstcn), blueIdx(_blueIdx)
-//        {
-//            memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
-//            if(blueIdx == 0)
-//            {
-//                std::swap(coeffs[0], coeffs[6]);
-//                std::swap(coeffs[1], coeffs[7]);
-//                std::swap(coeffs[2], coeffs[8]);
-//            }
-//        }
-//
-//        void operator()(const _Tp* src, _Tp* dst, int n) const
-//        {
-//            int dcn = dstcn;
-//		    _Tp alpha = ColorChannel<_Tp>::max();
-//            float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
-//                  C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
-//                  C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-//            n *= 3;
-//            for(int i = 0; i < n; i += 3, dst += dcn)
-//            {
-//			    _Tp B = saturate_cast<_Tp>(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2);
-//			    _Tp G = saturate_cast<_Tp>(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5);
-//			    _Tp R = saturate_cast<_Tp>(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8);
-//                dst[0] = B; dst[1] = G; dst[2] = R;
-//			    if( dcn == 4 )
-//				    dst[3] = alpha;
-//            }
-//        }
-//        int dstcn, blueIdx;
-//        float coeffs[9];
-//    };
-//
-//    template<typename _Tp> struct XYZ2RGB_i
-//    {
-//        typedef _Tp channel_type;
-//
-//        XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
-//        : dstcn(_dstcn), blueIdx(_blueIdx)
-//        {
-//            static const int coeffs0[] =
-//            {
-//                13273,  -6296,  -2042,
-//                -3970,   7684,    170,
-//                  228,   -836,   4331
-//            };
-//            for(int i = 0; i < 9; i++)
-//                coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
-//
-//            if(blueIdx == 0)
-//            {
-//                std::swap(coeffs[0], coeffs[6]);
-//                std::swap(coeffs[1], coeffs[7]);
-//                std::swap(coeffs[2], coeffs[8]);
-//            }
-//        }
-//        void operator()(const _Tp* src, _Tp* dst, int n) const
-//        {
-//            int dcn = dstcn;
-//            _Tp alpha = ColorChannel<_Tp>::max();
-//            int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
-//                C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
-//                C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-//            n *= 3;
-//            for(int i = 0; i < n; i += 3, dst += dcn)
-//            {
-//                int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
-//                int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
-//                int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
-//                dst[0] = saturate_cast<_Tp>(B); dst[1] = saturate_cast<_Tp>(G);
-//                dst[2] = saturate_cast<_Tp>(R);
-//                if( dcn == 4 )
-//				    dst[3] = alpha;
-//            }
-//        }
-//        int dstcn, blueIdx;
-//        int coeffs[9];
-//    };
-//}
-//
-//namespace cv { namespace gpu { namespace impl
-//{
-//}}}
+namespace imgproc
+{
+    __constant__ float cXYZ_D65f[9];
+    __constant__ int cXYZ_D65i[9];
+
+    template <typename T> struct RGB2XYZConvertor
+    {
+        typedef typename TypeVec<T, 3>::vec_t dst_t;
+        static __device__ dst_t cvt(const T* src)
+        {
+            dst_t dst;
+
+	        dst.x = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift));
+	        dst.y = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift));
+	        dst.z = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift));
+
+            return dst;
+        }
+    };
+    template <> struct RGB2XYZConvertor<float>
+    {
+        typedef typename TypeVec<float, 3>::vec_t dst_t;
+        static __device__ dst_t cvt(const float* src)
+        {
+            dst_t dst;
+
+	        dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2];
+	        dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5];
+	        dst.z = src[0] * cXYZ_D65f[6] + src[1] * cXYZ_D65f[7] + src[2] * cXYZ_D65f[8];
+
+            return dst;
+        }
+    };
+
+    template <int SRCCN, typename T>
+    __global__ void RGB2XYZ(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
+    {
+        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
+        typedef typename TypeVec<T, 3>::vec_t dst_t;
+
+		const int x = blockDim.x * blockIdx.x + threadIdx.x;
+		const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (y < rows && x < cols)
+        {
+            src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
+            
+            *(dst_t*)(dst_ + y * dst_step + x * 3 * sizeof(T)) = RGB2XYZConvertor<T>::cvt((const T*)(&src));
+        }
+    }
+
+    template <typename T> struct XYZ2RGBConvertor
+    {
+        typedef typename TypeVec<T, 3>::vec_t src_t;
+        static __device__ void cvt(const src_t& src, T* dst)
+        {
+            dst[0] = saturate_cast<T>(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift));
+		    dst[1] = saturate_cast<T>(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift));
+		    dst[2] = saturate_cast<T>(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift));
+        }
+    };
+    template <> struct XYZ2RGBConvertor<float>
+    {
+        typedef typename TypeVec<float, 3>::vec_t src_t;
+        static __device__ void cvt(const src_t& src, float* dst)
+        {
+            dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2];
+		    dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5];
+		    dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8];
+        }
+    };
+
+    template <int DSTCN, typename T>
+    __global__ void XYZ2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
+    {
+        typedef typename TypeVec<T, 3>::vec_t src_t;
+        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
+
+		const int x = blockDim.x * blockIdx.x + threadIdx.x;
+		const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (y < rows && x < cols)
+        {
+            src_t src = *(const src_t*)(src_ + y * src_step + x * 3 * sizeof(T));
+
+            dst_t dst;
+            XYZ2RGBConvertor<T>::cvt(src, (T*)(&dst));
+            setAlpha(dst, ColorChannel<T>::max());
+            
+            *(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
+        }
+    }
+}
+
+namespace cv { namespace gpu { namespace improc
+{
+    template <typename T, int SRCCN>
+    void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
+    {
+        dim3 threads(32, 8, 1);
+        dim3 grid(1, 1, 1);
+
+        grid.x = divUp(src.cols, threads.x);
+        grid.y = divUp(src.rows, threads.y);
+
+        imgproc::RGB2XYZ<SRCCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, 
+            dst.ptr, dst.step, src.rows, src.cols);
+
+        if (stream == 0)
+            cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, const int* coeffs, cudaStream_t stream)
+    {
+        typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        static const RGB2XYZ_caller_t RGB2XYZ_callers[] = {RGB2XYZ_caller<uchar, 3>, RGB2XYZ_caller<uchar, 4>};
+
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+
+        RGB2XYZ_callers[srccn-3](src, dst, stream);
+    }
+
+    void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, const int* coeffs, cudaStream_t stream)
+    {
+        typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        static const RGB2XYZ_caller_t RGB2XYZ_callers[] = {RGB2XYZ_caller<unsigned short, 3>, RGB2XYZ_caller<unsigned short, 4>};
+        
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+
+        RGB2XYZ_callers[srccn-3](src, dst, stream);
+    }
+
+    void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, const float* coeffs, cudaStream_t stream)
+    {
+        typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        static const RGB2XYZ_caller_t RGB2XYZ_callers[] = {RGB2XYZ_caller<float, 3>, RGB2XYZ_caller<float, 4>};
+        
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) );
+
+        RGB2XYZ_callers[srccn-3](src, dst, stream);
+    }
+    
+    template <typename T, int DSTCN>
+    void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
+    {
+        dim3 threads(32, 8, 1);
+        dim3 grid(1, 1, 1);
+
+        grid.x = divUp(src.cols, threads.x);
+        grid.y = divUp(src.rows, threads.y);
+
+        imgproc::XYZ2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step, 
+            dst.ptr, dst.step, src.rows, src.cols);
+
+        if (stream == 0)
+            cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    void XYZ2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream)
+    {
+        typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        static const XYZ2RGB_caller_t XYZ2RGB_callers[] = {XYZ2RGB_caller<uchar, 3>, XYZ2RGB_caller<uchar, 4>};
+
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+
+        XYZ2RGB_callers[dstcn-3](src, dst, stream);
+    }
+
+    void XYZ2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, const int* coeffs, cudaStream_t stream)
+    {
+        typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        static const XYZ2RGB_caller_t XYZ2RGB_callers[] = {XYZ2RGB_caller<unsigned short, 3>, XYZ2RGB_caller<unsigned short, 4>};
+        
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+
+        XYZ2RGB_callers[dstcn-3](src, dst, stream);
+    }
+
+    void XYZ2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, const float* coeffs, cudaStream_t stream)
+    {
+        typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        static const XYZ2RGB_caller_t XYZ2RGB_callers[] = {XYZ2RGB_caller<float, 3>, XYZ2RGB_caller<float, 4>};
+        
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) );
+
+        XYZ2RGB_callers[dstcn-3](src, dst, stream);
+    }
+}}}

 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////