diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index e81ee56a5..2a663b990 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -64,6 +64,7 @@ namespace cv
     {
         //////////////////////////////// OpenCL kernel strings /////////////////////
 
+        extern const char *arithm_absdiff_nonsaturate;
         extern const char *arithm_nonzero;
         extern const char *arithm_sum;
         extern const char *arithm_minMax;
@@ -435,14 +436,12 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
     static sumFunc functab[3] =
     {
         arithmetic_sum<int>,
-        arithmetic_sum<float>,
+        arithmetic_sum<double>,
         arithmetic_sum<double>
     };
 
     bool hasDouble = src.clCxt->supportsFeature(Context::CL_DOUBLE);
-    int ddepth = std::max(src.depth(), CV_32S);
-    if (!hasDouble && ddepth == CV_64F)
-        ddepth = CV_32F;
+    int ddepth = src.depth() <= CV_32S ? CV_32S : (hasDouble ? CV_64F : CV_32F);
 
     sumFunc func = functab[ddepth - CV_32S];
     return func(src, SQR_SUM, ddepth);
@@ -595,57 +594,102 @@ void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, cons
 
 double cv::ocl::norm(const oclMat &src1, int normType)
 {
-    return norm(src1, oclMat(src1.size(), src1.type(), Scalar::all(0)), normType);
+    CV_Assert((normType & NORM_RELATIVE) == 0);
+    return norm(src1, oclMat(), normType);
+}
+
+static void arithm_absdiff_nonsaturate_run(const oclMat & src1, const oclMat & src2, oclMat & diff)
+{
+    CV_Assert(src1.step % src1.elemSize() == 0 && (src2.empty() || src2.step % src2.elemSize() == 0));
+    Context *clCxt = src1.clCxt;
+
+    int ddepth = CV_64F;
+    diff.create(src1.size(), CV_MAKE_TYPE(ddepth, src1.channels()));
+
+    int oclChannels = src1.oclchannels(), sdepth = src1.depth();
+    int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
+    int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
+    int diffstep1 = diff.step / diff.elemSize(), diffoffset1 = diff.offset / diff.elemSize();
+
+    string kernelName = "arithm_absdiff_nonsaturate";
+    size_t localThreads[3]  = { 16, 16, 1 };
+    size_t globalThreads[3] = { diff.cols, diff.rows, 1 };
+
+    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
+    const char * const channelMap[] = { "", "", "2", "4", "4" };
+
+    std::string buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s",
+                                      typeMap[sdepth], channelMap[oclChannels],
+                                      typeMap[ddepth], channelMap[oclChannels],
+                                      typeMap[ddepth], channelMap[oclChannels]);
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1step1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1offset1 ));
+
+    if (!src2.empty())
+    {
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&src2step1 ));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&src2offset1 ));
+
+        kernelName += "_binary";
+    }
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&diff.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&diffstep1 ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&diffoffset1 ));
+
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
+
+    openCLExecuteKernel(clCxt, &arithm_absdiff_nonsaturate,
+                        kernelName, globalThreads, localThreads,
+                        args, -1, -1, buildOptions.c_str());
 }
 
 double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
 {
+    CV_Assert(!src1.empty());
+    CV_Assert(src2.empty() || (src1.type() == src2.type() && src1.size() == src2.size()));
+
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.depth() == CV_64F)
+    {
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
+    }
+
     bool isRelative = (normType & NORM_RELATIVE) != 0;
-    normType &= 7;
-    CV_Assert(src1.depth() <= CV_32S && src1.type() == src2.type() && ( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2));
-    int channels = src1.oclchannels(), i = 0, *p;
+    normType &= NORM_TYPE_MASK;
+    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
+
+    Scalar s;
+    int cn = src1.channels();
     double r = 0;
-    oclMat gm1(src1.size(), src1.type());
-    int min_int = (normType == NORM_INF ? CL_INT_MIN : 0);
-    Mat m(1, 1, CV_MAKETYPE(CV_32S, channels), cv::Scalar::all(min_int));
-    oclMat gm2(m), emptyMat;
-    switch(normType)
+    oclMat diff;
+    arithm_absdiff_nonsaturate_run(src1, src2, diff);
+
+    switch (normType)
     {
     case NORM_INF:
-        //  arithmetic_run(src1, src2, gm1, "arithm_op_absdiff");
-        //arithmetic_minMax_run(gm1,emptyMat, gm2,"arithm_op_max");
-        m = (gm2);
-        p = (int *)m.data;
-        r = -std::numeric_limits<double>::max();
-        for (i = 0; i < channels; i++)
-        {
-            r = std::max(r, (double)p[i]);
-        }
+        diff = diff.reshape(1);
+        minMax(diff, NULL, &r);
         break;
     case NORM_L1:
-        //arithmetic_run(src1, src2, gm1, "arithm_op_absdiff");
-        //arithmetic_sum_run(gm1, gm2,"arithm_op_sum");
-        m = (gm2);
-        p = (int *)m.data;
-        for (i = 0; i < channels; i++)
-        {
-            r = r + (double)p[i];
-        }
+        s = sum(diff);
+        for (int i = 0; i < cn; ++i)
+            r += s[i];
         break;
     case NORM_L2:
-        //arithmetic_run(src1, src2, gm1, "arithm_op_absdiff");
-        //arithmetic_sum_run(gm1, gm2,"arithm_op_squares_sum");
-        m = (gm2);
-        p = (int *)m.data;
-        for (i = 0; i < channels; i++)
-        {
-            r = r + (double)p[i];
-        }
+        s = sqrSum(diff);
+        for (int i = 0; i < cn; ++i)
+            r += s[i];
         r = std::sqrt(r);
         break;
     }
     if (isRelative)
         r = r / norm(src2, normType);
+
     return r;
 }
 
diff --git a/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl b/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl
new file mode 100644
index 000000000..e5d827139
--- /dev/null
+++ b/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl
@@ -0,0 +1,93 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+__kernel void arithm_absdiff_nonsaturate_binary(__global srcT *src1, int src1_step, int src1_offset,
+                         __global srcT *src2, int src2_step, int src2_offset,
+                         __global dstT *dst, int dst_step, int dst_offset,
+                         int cols, int rows)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int src1_index = mad24(y, src1_step, x + src1_offset);
+        int src2_index = mad24(y, src2_step, x + src2_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);
+
+        dstT t0 = convertToDstT(src1[src1_index]);
+        dstT t1 = convertToDstT(src2[src2_index]);
+        dstT t2 = t0 - t1;
+
+        dst[dst_index] = t2 >= 0 ? t2 : -t2;
+    }
+}
+
+__kernel void arithm_absdiff_nonsaturate(__global srcT *src1, int src1_step, int src1_offset,
+                         __global dstT *dst, int dst_step, int dst_offset,
+                         int cols, int rows)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int src1_index = mad24(y, src1_step, x + src1_offset);
+        int dst_index  = mad24(y, dst_step, x + dst_offset);
+
+        dstT t0 = convertToDstT(src1[src1_index]);
+
+        dst[dst_index] = t0 >= 0 ? t0 : -t0;
+    }
+}
diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp
index ac4842e23..db01d9503 100644
--- a/modules/ocl/test/test_arithm.cpp
+++ b/modules/ocl/test/test_arithm.cpp
@@ -220,8 +220,8 @@ PARAM_TEST_CASE(ArithmTestBase, int, int, bool)
 
         cv::RNG &rng = TS::ptr()->get_rng();
 
-        src1 = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
-        src2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, -15440, 14450, false);
+        src1 = randomMat(rng, randomSize(MIN_VALUE, MAX_VALUE), type, 2, 11, false);
+        src2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, -1540, 1740, false);
         dst1 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
         dst2 = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), type, 5, 16, false);
         mask = randomMat(rng, !use_roi ? src1.size() : randomSize(MIN_VALUE, MAX_VALUE), CV_8UC1, 0, 2, false);
@@ -1440,7 +1440,7 @@ TEST_P(SetIdentity, Mat)
     }
 }
 
-//////////////////////////////// setIdentity /////////////////////////////////////////////////
+//////////////////////////////// meanStdDev /////////////////////////////////////////////////
 
 typedef ArithmTestBase MeanStdDev;
 
@@ -1458,12 +1458,70 @@ TEST_P(MeanStdDev, Mat)
 
         for (int i = 0; i < 4; ++i)
         {
-            EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 1e-5);
+            EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 0.1);
             EXPECT_NEAR(cpu_stddev[i], gpu_stddev[i], 0.1);
         }
     }
 }
 
+//////////////////////////////// Norm /////////////////////////////////////////////////
+
+typedef ArithmTestBase Norm;
+
+TEST_P(Norm, NORM_INF)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < LOOP_TIMES; j++)
+        {
+            random_roi();
+
+            int type = NORM_INF;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
+            const double gpuRes = cv::ocl::norm(gsrc1, gsrc2, type);
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
+TEST_P(Norm, NORM_L1)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < LOOP_TIMES; j++)
+        {
+            random_roi();
+
+            int type = NORM_L1;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
+            const double gpuRes = cv::ocl::norm(gsrc1, gsrc2, type);
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
+TEST_P(Norm, NORM_L2)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < LOOP_TIMES; j++)
+        {
+            random_roi();
+
+            int type = NORM_L2;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
+            const double gpuRes = cv::ocl::norm(gsrc1, gsrc2, type);
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
 //////////////////////////////////////// Instantiation /////////////////////////////////////////
 
 INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool(), Bool()));
@@ -1495,5 +1553,6 @@ INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32F, CV_64F), testing::Ra
 INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
 INSTANTIATE_TEST_CASE_P(Arithm, SetIdentity, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
 INSTANTIATE_TEST_CASE_P(Arithm, MeanStdDev, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
+INSTANTIATE_TEST_CASE_P(Arithm, Norm, Combine(testing::Range(CV_8U, CV_USRTYPE1), testing::Range(1, 5), Bool()));
 
 #endif // HAVE_OPENCL