optimized index access

2014-05-27 17:27:24 +04:00 · 2014-05-27 17:27:24 +04:00 · 437927b7bb
commit 437927b7bb
parent c7dc884855
1 changed files with 54 additions and 26 deletions
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -54,18 +54,28 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int
                               int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
-    int y = get_global_id(1)*PIX_PER_WI_Y;
+    int y0 = get_global_id(1)*PIX_PER_WI_Y;

    if (x < cols)
    {
-        #pragma unroll
-        for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
-        {
-            T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
-            T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));
+        int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+        int src_index1 = mad24(rows - y0 - 1, src_step, mad24(x, TSIZE, src_offset));
+        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+        int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(x, TSIZE, dst_offset));

-            storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
-            storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
+        #pragma unroll
+        for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
+        {
+            T src0 = loadpix(srcptr + src_index0);
+            T src1 = loadpix(srcptr + src_index1);
+
+            storepix(src1, dstptr + dst_index0);
+            storepix(src0, dstptr + dst_index1);
+
+            src_index0 += src_step;
+            src_index1 -= src_step;
+            dst_index0 += dst_step;
+            dst_index1 -= dst_step;
        }
    }
 }
@@ -75,19 +85,28 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step,
                                    int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
-    int y = get_global_id(1)*PIX_PER_WI_Y;
+    int y0 = get_global_id(1)*PIX_PER_WI_Y;

    if (x < cols)
    {
-        int x1 = cols - x - 1;
-        #pragma unroll
-        for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
-        {
-            T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
-            T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));
+        int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+        int src_index1 = mad24(rows - y0 - 1, src_step, mad24(cols - x - 1, TSIZE, src_offset));
+        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+        int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));

-            storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
-            storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+        #pragma unroll
+        for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
+        {
+            T src0 = loadpix(srcptr + src_index0);
+            T src1 = loadpix(srcptr + src_index1);
+
+            storepix(src1, dstptr + dst_index0);
+            storepix(src0, dstptr + dst_index1);
+
+            src_index0 += src_step;
+            src_index1 -= src_step;
+            dst_index0 += dst_step;
+            dst_index1 -= dst_step;
        }
    }
 }
@@ -97,19 +116,28 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int
                               int rows, int cols, int thread_rows, int thread_cols)
 {
    int x = get_global_id(0);
-    int y = get_global_id(1)*PIX_PER_WI_Y;
+    int y0 = get_global_id(1)*PIX_PER_WI_Y;

    if (x < thread_cols)
    {
-        int x1 = cols - x - 1;
-        #pragma unroll
-        for (int cy = 0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y)
-        {
-            T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
-            T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));
+        int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+        int src_index1 = mad24(y0, src_step, mad24(cols - x - 1, TSIZE, src_offset));
+        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+        int dst_index1 = mad24(y0, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));

-            storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
-            storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+        #pragma unroll
+        for (int y = y0, y1 = min(rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
+        {
+            T src0 = loadpix(srcptr + src_index0);
+            T src1 = loadpix(srcptr + src_index1);
+
+            storepix(src1, dstptr + dst_index0);
+            storepix(src0, dstptr + dst_index1);
+
+            src_index0 += src_step;
+            src_index1 += src_step;
+            dst_index0 += dst_step;
+            dst_index1 += dst_step;
        }
    }
 }