diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index 035cea781..c1147cb41 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -64,7 +64,6 @@ namespace cv
     {
         //////////////////////////////// OpenCL kernel strings /////////////////////
 
-        extern const char *transpose_kernel;
         extern const char *arithm_nonzero;
         extern const char *arithm_sum;
         extern const char *arithm_sum_3;
@@ -1265,9 +1264,8 @@ int cv::ocl::countNonZero(const oclMat &src)
         CV_Error(CV_GpuNotSupported, "select device don't support double");
     }
     CV_Assert(groupnum != 0);
-    groupnum = groupnum * 2;
+//    groupnum = groupnum * 2;
     int vlen = 8 , dbsize = groupnum * vlen;
-    //cl_ulong start, end;
     Context *clCxt = src.clCxt;
     string kernelName = "arithm_op_nonzero";
     int *p = new int[dbsize], nonzero = 0;
@@ -1529,7 +1527,7 @@ oclMatExpr::operator oclMat() const
 #define TILE_DIM      (32)
 #define BLOCK_ROWS    (256/TILE_DIM)
 
-static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
+static void transpose_run(const oclMat &src, oclMat &dst, string kernelName, bool inplace = false)
 {
     Context  *clCxt = src.clCxt;
     if (!clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -1544,7 +1542,7 @@ static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
                                       channelsString[src.channels()]);
 
     size_t localThreads[3]  = { TILE_DIM, BLOCK_ROWS, 1 };
-    size_t globalThreads[3] = { src.cols, src.rows, 1 };
+    size_t globalThreads[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 };
 
     int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize();
     int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
@@ -1568,8 +1566,8 @@ void cv::ocl::transpose(const oclMat &src, oclMat &dst)
     CV_Assert(src.depth() <= CV_64F && src.channels() <= 4);
 
     if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset
-            && dst.rows == dst.cols && src.cols == dst.cols)
-        transpose_run( src, dst, "transpose_inplace");
+         && dst.size() == src.size())
+        transpose_run( src, dst, "transpose_inplace", true);
     else
     {
         dst.create(src.cols, src.rows, src.type());
diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl
index 1dcb138eb..23b293306 100644
--- a/modules/ocl/src/opencl/arithm_minMax.cl
+++ b/modules/ocl/src/opencl/arithm_minMax.cl
@@ -44,9 +44,14 @@
 //M*/
 
 /**************************************PUBLICFUNC*************************************/
+
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
+#endif
 
 #if defined (DEPTH_0)
 #define VEC_TYPE uchar8
diff --git a/modules/ocl/src/opencl/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl
index 57f7f1b9d..5328d1f1b 100644
--- a/modules/ocl/src/opencl/arithm_transpose.cl
+++ b/modules/ocl/src/opencl/arithm_transpose.cl
@@ -44,23 +44,78 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 #endif
+#endif
+
+#define TILE_DIM      32
+#define BLOCK_ROWS    8
+#define LDS_STEP      TILE_DIM
 
 __kernel void transpose(__global const T* src, __global T* dst,
     int src_cols, int src_rows,
     int src_step, int dst_step,
     int src_offset, int dst_offset)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
+    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
+    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
+
+    int groupId_x, groupId_y;
+
+    if(src_rows == src_cols)
+    {
+        groupId_y = gp_x;
+        groupId_x = (gp_x + gp_y) % gs_x;
+    }
+    else
+    {
+        int bid = gp_x + gs_x * gp_y;
+        groupId_y =  bid % gs_y;
+        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
+    }
+
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+
+    int x = groupId_x * TILE_DIM + lx;
+    int y = groupId_y * TILE_DIM + ly;
+
+    int x_index = groupId_y * TILE_DIM + lx;
+    int y_index = groupId_x * TILE_DIM + ly;
+
+    __local T title[TILE_DIM * LDS_STEP];
 
     if (x < src_cols && y < src_rows)
     {
-        int srcIdx = mad24(y, src_step, src_offset + x);
-        int dstIdx = mad24(x, dst_step, dst_offset + y);
+        int index_src = mad24(y, src_step, x);
 
-        dst[dstIdx] = src[srcIdx];
+        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
+        {
+            if (y + i < src_rows)
+            {
+                title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src];
+                index_src = mad24(BLOCK_ROWS, src_step, index_src);
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (x_index < src_rows && y_index < src_cols)
+    {
+        int index_dst = mad24(y_index, dst_step, x_index);
+
+        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
+        {
+            if ((y_index + i) < src_cols)
+            {
+                dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i];
+                index_dst +=  dst_step * BLOCK_ROWS;
+            }
+        }
     }
 }
 
@@ -72,7 +127,7 @@ __kernel void transpose_inplace(__global T* src, __global T* dst,
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < src_cols && y < src_rows && x < y)
+    if (y < src_rows && x < y)
     {
         int srcIdx = mad24(y, src_step, src_offset + x);
         int dstIdx = mad24(x, dst_step, dst_offset + y);
diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp
index 48c8bbcd9..7c491916f 100644
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
@@ -48,7 +48,7 @@
 #define MHEIGHT 256
 
 #define MIN_VALUE 171
-#define MAX_VALUE 351
+#define MAX_VALUE 357
 
 //#define RANDOMROI
 int randomInt(int minVal, int maxVal);