diff --git a/modules/ocl/doc/ml_machine_learning.rst b/modules/ocl/doc/ml_machine_learning.rst
index 321cec9db..eb72cbeef 100644
--- a/modules/ocl/doc/ml_machine_learning.rst
+++ b/modules/ocl/doc/ml_machine_learning.rst
@@ -85,4 +85,28 @@ Finds centers of clusters and groups input samples around the clusters.
 
             * **KMEANS_USE_INITIAL_LABELS** During the first (and possibly the only) attempt, use the user-supplied labels instead of computing them from the initial centers. For the second and further attempts, use the random or semi-random centers. Use one of  ``KMEANS_*_CENTERS``  flag to specify the exact method.
 
-    :param centers: Output matrix of the cluster centers, one row per each cluster center.
\ No newline at end of file
+    :param centers: Output matrix of the cluster centers, one row per each cluster center.
+
+ocl::distanceToCenters
+----------------------
+For each samples in ``source``, find its closest neighour in ``centers``.
+
+.. ocv:function:: void ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType = NORM_L2SQR, const oclMat &indices = oclMat())
+
+    :param dists: The output distances calculated from each sample to the best matched center.
+
+    :param labels: The output index of best matched center for each row of sample.
+
+    :param src: Floating-point matrix of input samples. One row per sample.
+
+    :param centers: Floating-point matrix of center candidates. One row per center.
+
+    :param distType: Distance metric to calculate distances. Supports ``NORM_L1`` and ``NORM_L2SQR``.
+
+    :param indices: Optional source indices. If not empty:
+
+            * only the indexed source samples will be processed
+            * outputs, i.e., ``dists`` and ``labels``, have the same size of indices
+            * outputs are in the same order of indices instead of the order of src
+
+The method is a utility function which maybe used for multiple clustering algorithms such as K-means.
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp
index 8c770ee38..4f4289967 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -877,7 +877,10 @@ namespace cv
 
         //! Compute closest centers for each lines in source and lable it after center's index
         // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
-        CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers);
+        // supports NORM_L1 and NORM_L2 distType
+        // if indices is provided, only the indexed rows will be calculated and their results are in the same
+        // order of indices
+        CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType = NORM_L2SQR, const oclMat &indices = oclMat());
 
         //!Does k-means procedure on GPU
         // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp
index f2314c6a7..c326b0b42 100644
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
@@ -860,3 +860,64 @@ PERF_TEST_P(columnSumFixture, columnSum, OCL_TYPICAL_MAT_SIZES)
     else
         OCL_PERF_ELSE
 }
+
+//////////////////////////////distanceToCenters////////////////////////////////////////////////
+
+CV_ENUM(DistType, NORM_L1, NORM_L2SQR);
+typedef tuple<Size, DistType> distanceToCentersParameters;
+typedef TestBaseWithParam<distanceToCentersParameters> distanceToCentersFixture;
+
+static void distanceToCentersPerfTest(Mat& src, Mat& centers, Mat& dists, Mat& labels, int distType)
+{
+    Mat batch_dists;
+    cv::batchDistance(src,centers,batch_dists, CV_32FC1, noArray(), distType);
+    std::vector<float> dists_v;
+    std::vector<int> labels_v;
+    for(int i = 0; i<batch_dists.rows; i++)
+    {
+        Mat r = batch_dists.row(i);
+        double mVal;
+        Point mLoc;
+        minMaxLoc(r, &mVal, NULL, &mLoc, NULL);
+        dists_v.push_back((float)mVal);
+        labels_v.push_back(mLoc.x);
+    }
+    Mat temp_dists(dists_v);
+    Mat temp_labels(labels_v);
+    temp_dists.reshape(1,1).copyTo(dists);
+    temp_labels.reshape(1,1).copyTo(labels);
+}
+
+PERF_TEST_P(distanceToCentersFixture, distanceToCenters, ::testing::Combine(::testing::Values(cv::Size(256,256), cv::Size(512,512)), DistType::all()) )
+{
+    Size size = get<0>(GetParam());
+    int distType = get<1>(GetParam());
+    Mat src(size, CV_32FC1);
+    Mat centers(size, CV_32FC1);
+    Mat dists(cv::Size(src.rows,1), CV_32FC1);
+    Mat labels(cv::Size(src.rows,1), CV_32SC1);
+    declare.in(src, centers, WARMUP_RNG).out(dists, labels);
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat ocl_src(src);
+        ocl::oclMat ocl_centers(centers);
+        ocl::oclMat ocl_dists(dists);
+        ocl::oclMat ocl_labels(labels);
+
+        OCL_TEST_CYCLE() ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src, ocl_centers, distType);
+
+        ocl_dists.download(dists);
+        ocl_labels.download(labels);
+
+        SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE);
+        SANITY_CHECK(labels);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() distanceToCentersPerfTest(src,centers,dists,labels,distType);
+        SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE);
+        SANITY_CHECK(labels);
+    }
+    else
+        OCL_PERF_ELSE
+}
diff --git a/modules/ocl/src/kmeans.cpp b/modules/ocl/src/kmeans.cpp
index a5d5afd1f..58a68a750 100644
--- a/modules/ocl/src/kmeans.cpp
+++ b/modules/ocl/src/kmeans.cpp
@@ -160,32 +160,61 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
     }
 }
 
-void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers)
+void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType, const oclMat &indices)
 {
-    //if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
-    //{
-    //    CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
-    //    return;
-    //}
+    CV_Assert(src.cols*src.oclchannels() == centers.cols*centers.oclchannels());
+    CV_Assert(src.depth() == CV_32F && centers.depth() == CV_32F);
+    bool is_label_row_major = false;
+    ensureSizeIsEnough(1, src.rows, CV_32FC1, dists);
+    if(labels.empty() || (!labels.empty() && labels.rows == src.rows && labels.cols == 1))
+    {
+        ensureSizeIsEnough(src.rows, 1, CV_32SC1, labels);
+        is_label_row_major = true;
+    }
+    CV_Assert(distType == NORM_L1 || distType == NORM_L2SQR);
 
-    Context  *clCxt = src.clCxt;
-    int labels_step = (int)(labels.step/labels.elemSize());
-    string kernelname = "distanceToCenters";
-    int threadNum = src.rows > 256 ? 256 : src.rows;
-    size_t localThreads[3]  = {1, threadNum, 1};
-    size_t globalThreads[3] = {1, src.rows, 1};
+    std::stringstream build_opt_ss;
+    build_opt_ss
+        << (distType == NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST")
+        << (indices.empty() ? "" : " -D USE_INDEX");
+
+    String build_opt = build_opt_ss.str();
+
+    const int src_step = (int)(src.oclchannels() * src.step / src.elemSize());
+    const int centers_step = (int)(centers.oclchannels() * centers.step / centers.elemSize());
+
+    const int colsNumb = centers.cols*centers.oclchannels();
+
+    const int label_step   = is_label_row_major ? (int)(labels.step / labels.elemSize()) : 1;
+    String kernelname = "distanceToCenters";
+
+    const int number_of_input = indices.empty() ? src.rows : indices.size().area();
+
+    const int src_offset = (int)src.offset/src.elemSize();
+    const int centers_offset = (int)centers.offset/centers.elemSize();
+
+    size_t globalThreads[3] = {number_of_input, 1, 1};
 
     vector<pair<size_t, const void *> > args;
-    args.push_back(make_pair(sizeof(cl_int), (void *)&labels_step));
-    args.push_back(make_pair(sizeof(cl_int), (void *)&centers.rows));
     args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void *)&labels.data));
-    args.push_back(make_pair(sizeof(cl_int), (void *)&centers.cols));
-    args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
     args.push_back(make_pair(sizeof(cl_mem), (void *)&centers.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&dists.data));
+    if(!indices.empty())
+    {
+        args.push_back(make_pair(sizeof(cl_mem), (void *)&indices.data));
+    }
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&labels.data));
+    args.push_back(make_pair(sizeof(cl_mem), (void *)&dists.data));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&colsNumb));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src_step));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&centers_step));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&label_step));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&number_of_input));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&centers.rows));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset));
+    args.push_back(make_pair(sizeof(cl_int), (void *)&centers_offset));
 
-    openCLExecuteKernel(clCxt, &kmeans_kernel, kernelname, globalThreads, localThreads, args, -1, -1, NULL);
+    openCLExecuteKernel(Context::getContext(), &kmeans_kernel,
+        kernelname, globalThreads, NULL, args, -1, -1, build_opt.c_str());
 }
 ///////////////////////////////////k - means /////////////////////////////////////////////////////////
 double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels,
@@ -404,17 +433,17 @@ double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels,
 
             _bestLabels.upload(_labels);
             _centers.upload(centers);
+
             distanceToCenters(_dists, _bestLabels, _src, _centers);
 
             Mat dists;
             _dists.download(dists);
             _bestLabels.download(_labels);
-
-            double* dist = dists.ptr<double>(0);
+            float* dist = dists.ptr<float>(0);
             compactness = 0;
             for( i = 0; i < N; i++ )
             {
-                compactness += dist[i];
+                    compactness += (double)dist[i];
             }
         }
 
diff --git a/modules/ocl/src/opencl/kmeans_kernel.cl b/modules/ocl/src/opencl/kmeans_kernel.cl
index 9846d522f..f62a08f63 100644
--- a/modules/ocl/src/opencl/kmeans_kernel.cl
+++ b/modules/ocl/src/opencl/kmeans_kernel.cl
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Xiaopeng Fu, fuxiaopeng2222@163.com
+//    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,42 +44,81 @@
 //
 //M*/
 
-__kernel void distanceToCenters(
-    int label_step, int K,
-    __global float *src,
-    __global int *labels, int dims, int rows,
-    __global float *centers,
-    __global float *dists)
+#ifdef L1_DIST
+#  define DISTANCE(A, B) fabs((A) - (B))
+#elif defined L2SQR_DIST
+#  define DISTANCE(A, B) ((A) - (B)) * ((A) - (B))
+#else
+#  define DISTANCE(A, B) ((A) - (B)) * ((A) - (B))
+#endif
+
+inline float dist(__global const float * center, __global const float * src, int feature_cols)
 {
-    int gid = get_global_id(1);
-
-    float dist, euDist, min;
-    int minCentroid;
-
-    if(gid >= rows)
-        return;
-
-    for(int i = 0 ; i < K; i++)
+    float res = 0;
+    float4 tmp4;
+    int i;
+    for(i = 0; i < feature_cols / 4; i += 4, center += 4, src += 4)
     {
-        euDist = 0;
-        for(int j = 0; j < dims; j++)
-        {
-            dist = (src[j + gid * dims]
-                    - centers[j + i * dims]);
-            euDist += dist * dist;
-        }
+        tmp4 = vload4(0, center) - vload4(0, src);
+#ifdef L1_DIST
+        tmp4 = fabs(tmp4);
+#else
+        tmp4 *= tmp4;
+#endif
+        res += tmp4.x + tmp4.y + tmp4.z + tmp4.w;
+    }
 
-        if(i == 0)
+    for(; i < feature_cols; ++i, ++center, ++src)
+    {
+        res += DISTANCE(*src, *center);
+    }
+    return res;
+}
+
+// to be distinguished with distanceToCenters in kmeans_kernel.cl
+__kernel void distanceToCenters(
+    __global const float *src,
+    __global const float *centers,
+#ifdef USE_INDEX
+    __global const int   *indices,
+#endif
+    __global int   *labels,
+    __global float *dists,
+    int feature_cols,
+    int src_step,
+    int centers_step,
+    int label_step,
+    int input_size,
+    int K,
+    int offset_src,
+    int offset_centers
+)
+{
+    int gid = get_global_id(0);
+    float euDist, minval;
+    int minCentroid;
+    if(gid >= input_size)
+    {
+        return;
+    }
+    src += offset_src;
+    centers += offset_centers;
+#ifdef USE_INDEX
+    src += indices[gid] * src_step;
+#else
+    src += gid * src_step;
+#endif
+    minval = dist(centers, src, feature_cols);
+    minCentroid = 0;
+    for(int i = 1 ; i < K; i++)
+    {
+        euDist = dist(centers + i * centers_step, src, feature_cols);
+        if(euDist < minval)
         {
-            min = euDist;
-            minCentroid = 0;
-        }
-        else if(euDist < min)
-        {
-            min = euDist;
+            minval = euDist;
             minCentroid = i;
         }
     }
-    dists[gid] = min;
-    labels[label_step * gid] = minCentroid;
+    labels[gid * label_step] = minCentroid;
+    dists[gid] = minval;
 }
diff --git a/modules/ocl/test/test_kmeans.cpp b/modules/ocl/test/test_kmeans.cpp
index 16d84dd92..6539c51c4 100644
--- a/modules/ocl/test/test_kmeans.cpp
+++ b/modules/ocl/test/test_kmeans.cpp
@@ -99,7 +99,6 @@ PARAM_TEST_CASE(Kmeans, int, int, int)
     }
 };
 OCL_TEST_P(Kmeans, Mat){
-
     if(flags & KMEANS_USE_INITIAL_LABELS)
     {
         // inital a given labels
@@ -116,11 +115,9 @@ OCL_TEST_P(Kmeans, Mat){
         kmeans(src, K, labels,
             TermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 100, 0),
             1, flags, centers);
-
         ocl::kmeans(d_src, K, d_labels,
             TermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 100, 0),
             1, flags, d_centers);
-
         Mat dd_labels(d_labels);
         Mat dd_centers(d_centers);
         if(flags & KMEANS_USE_INITIAL_LABELS)
@@ -153,9 +150,97 @@ OCL_TEST_P(Kmeans, Mat){
         }
     }
 }
+
 INSTANTIATE_TEST_CASE_P(OCL_ML, Kmeans, Combine(
     Values(3, 5, 8),
     Values(CV_32FC1, CV_32FC2, CV_32FC4),
     Values(OCL_KMEANS_USE_INITIAL_LABELS/*, OCL_KMEANS_PP_CENTERS*/)));
 
+
+/////////////////////////////// DistanceToCenters //////////////////////////////////////////
+
+CV_ENUM(DistType, NORM_L1, NORM_L2SQR);
+
+PARAM_TEST_CASE(distanceToCenters, DistType, bool)
+{
+    cv::Size size;
+    int distType;
+    bool useRoi;
+    cv::Mat src, centers, src_roi, centers_roi;
+    cv::ocl::oclMat ocl_src, ocl_centers, ocl_src_roi, ocl_centers_roi;
+
+    virtual void SetUp()
+    {
+        distType = GET_PARAM(0);
+        useRoi = GET_PARAM(1);
+    }
+
+    void random_roi()
+    {
+        Size roiSize_src = randomSize(10,1000);
+        Size roiSize_centers = randomSize(10, 1000);
+        roiSize_src.width = roiSize_centers.width;
+
+        Border srcBorder = randomBorder(0, useRoi ? 500 : 0);
+        randomSubMat(src, src_roi, roiSize_src, srcBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX);
+
+        Border centersBorder = randomBorder(0, useRoi ? 500 : 0);
+        randomSubMat(centers, centers_roi, roiSize_centers, centersBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX);
+
+        for(int i = 0; i<centers.rows; i++)
+            centers.at<float>(i, randomInt(0,centers.cols-1)) = (float)randomDouble(SHRT_MAX, INT_MAX);
+
+        generateOclMat(ocl_src, ocl_src_roi, src, roiSize_src, srcBorder);
+        generateOclMat(ocl_centers, ocl_centers_roi, centers, roiSize_centers, centersBorder);
+
+    }
+
+};
+
+OCL_TEST_P(distanceToCenters, Accuracy)
+{
+    for(int j = 0; j< LOOP_TIMES; j++)
+    {
+        random_roi();
+
+        cv::ocl::oclMat ocl_dists;
+        cv::ocl::oclMat ocl_labels;
+
+        cv::ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src_roi, ocl_centers_roi, distType);
+
+        Mat labels, dists;
+        ocl_labels.download(labels);
+        ocl_dists.download(dists);
+
+        ASSERT_EQ(ocl_dists.cols, ocl_labels.rows);
+
+        Mat batch_dists;
+
+        cv::batchDistance(src_roi, centers_roi, batch_dists, CV_32FC1, noArray(), distType);
+
+        std::vector<double> gold_dists_v;
+
+        for(int i = 0; i<batch_dists.rows; i++)
+        {
+            Mat r = batch_dists.row(i);
+            double mVal;
+            Point mLoc;
+            minMaxLoc(r, &mVal, NULL, &mLoc, NULL);
+
+            int ocl_label = *(int*)labels.row(i).col(0).data;
+            ASSERT_EQ(mLoc.x, ocl_label);
+
+            gold_dists_v.push_back(mVal);
+        }
+        Mat gold_dists(gold_dists_v);
+        dists.convertTo(dists, CV_64FC1);
+        double relative_error = cv::norm(gold_dists.t(), dists, NORM_INF|NORM_RELATIVE);
+        ASSERT_LE(relative_error, 1e-5);
+    }
+}
+
+
+INSTANTIATE_TEST_CASE_P (OCL_ML, distanceToCenters, Combine(DistType::all(), Bool()) );
+
+
 #endif