other kernels now use row scheme
This commit is contained in:
@@ -52,37 +52,47 @@
|
||||
__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
|
||||
#ifdef HAVE_SCALAR
|
||||
__global const T * src2, __global const T * src3
|
||||
__global const T * src2, __global const T * src3,
|
||||
#else
|
||||
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
||||
__global const uchar * src3ptr, int src3_step, int src3_offset
|
||||
__global const uchar * src3ptr, int src3_step, int src3_offset,
|
||||
#endif
|
||||
)
|
||||
int rowsPerWI)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y0 = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
|
||||
int dst_index = mad24(y, dst_step, x + dst_offset);
|
||||
__global const T * src1 = (__global const T *)(src1ptr + src1_index);
|
||||
__global uchar * dst = dstptr + dst_index;
|
||||
|
||||
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(T) * cn, src1_offset));
|
||||
int dst_index = mad24(y0, dst_step, x + dst_offset);
|
||||
#ifndef HAVE_SCALAR
|
||||
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
|
||||
int src3_index = mad24(y, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
|
||||
__global const T * src2 = (__global const T *)(src2ptr + src2_index);
|
||||
__global const T * src3 = (__global const T *)(src3ptr + src3_index);
|
||||
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(T) * cn, src2_offset));
|
||||
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(T) * cn, src3_offset));
|
||||
#endif
|
||||
|
||||
dst[0] = 255;
|
||||
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)
|
||||
{
|
||||
__global const T * src1 = (__global const T *)(src1ptr + src1_index);
|
||||
__global uchar * dst = dstptr + dst_index;
|
||||
#ifndef HAVE_SCALAR
|
||||
__global const T * src2 = (__global const T *)(src2ptr + src2_index);
|
||||
__global const T * src3 = (__global const T *)(src3ptr + src3_index);
|
||||
#endif
|
||||
|
||||
for (int c = 0; c < cn; ++c)
|
||||
if (src2[c] > src1[c] || src3[c] < src1[c])
|
||||
{
|
||||
dst[0] = 0;
|
||||
break;
|
||||
}
|
||||
dst[0] = 255;
|
||||
|
||||
for (int c = 0; c < cn; ++c)
|
||||
if (src2[c] > src1[c] || src3[c] < src1[c])
|
||||
{
|
||||
dst[0] = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
#ifndef HAVE_SCALAR
|
||||
src2_index += src2_step;
|
||||
src3_index += src3_step;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,20 +45,28 @@
|
||||
__global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,
|
||||
#define DECLARE_OUTPUT_MAT(i) \
|
||||
__global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
|
||||
#define DECLARE_INDEX(i) \
|
||||
int src##i##_index = mad24(src##i##_step, y0, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \
|
||||
int dst##i##_index = mad24(dst##i##_step, y0, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset));
|
||||
#define PROCESS_ELEM(i) \
|
||||
int src##i##_index = mad24(src##i##_step, y, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \
|
||||
__global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \
|
||||
int dst##i##_index = mad24(dst##i##_step, y, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset)); \
|
||||
__global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \
|
||||
dst##i[0] = src##i[0];
|
||||
dst##i[0] = src##i[0]; \
|
||||
src##i##_index += src##i##_step; \
|
||||
dst##i##_index += dst##i##_step;
|
||||
|
||||
__kernel void mixChannels(DECLARE_INPUT_MATS DECLARE_OUTPUT_MATS int rows, int cols)
|
||||
__kernel void mixChannels(DECLARE_INPUT_MAT_N DECLARE_OUTPUT_MAT_N int rows, int cols, int rowsPerWI)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y0 = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < cols && y < rows)
|
||||
if (x < cols)
|
||||
{
|
||||
PROCESS_ELEMS
|
||||
DECLARE_INDEX_N
|
||||
|
||||
for (int y = y0, y1 = min(y0 + rowsPerWI, rows); y < y1; ++y)
|
||||
{
|
||||
PROCESS_ELEM_N
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,26 +56,30 @@ inline float2 conjf(float2 a)
|
||||
__kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset,
|
||||
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
||||
__global uchar * dstptr, int dst_step, int dst_offset,
|
||||
int dst_rows, int dst_cols)
|
||||
int dst_rows, int dst_cols, int rowsPerWI)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y0 = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < dst_cols && y < dst_rows)
|
||||
if (x < dst_cols)
|
||||
{
|
||||
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(float2), src1_offset));
|
||||
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(float2), src2_offset));
|
||||
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(float2), dst_offset));
|
||||
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(float2), src1_offset));
|
||||
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(float2), src2_offset));
|
||||
int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(float2), dst_offset));
|
||||
|
||||
float2 src0 = *(__global const float2 *)(src1ptr + src1_index);
|
||||
float2 src1 = *(__global const float2 *)(src2ptr + src2_index);
|
||||
__global float2 * dst = (__global float2 *)(dstptr + dst_index);
|
||||
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y,
|
||||
src1_index += src1_step, src2_index += src2_step, dst_index += dst_step)
|
||||
{
|
||||
float2 src0 = *(__global const float2 *)(src1ptr + src1_index);
|
||||
float2 src1 = *(__global const float2 *)(src2ptr + src2_index);
|
||||
__global float2 * dst = (__global float2 *)(dstptr + dst_index);
|
||||
|
||||
#ifdef CONJ
|
||||
float2 v = cmulf(src0, conjf(src1));
|
||||
float2 v = cmulf(src0, conjf(src1));
|
||||
#else
|
||||
float2 v = cmulf(src0, src1);
|
||||
float2 v = cmulf(src0, src1);
|
||||
#endif
|
||||
dst[0] = v;
|
||||
dst[0] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,15 +56,16 @@
|
||||
#endif
|
||||
|
||||
__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||
ST scalar_)
|
||||
ST scalar_, int rowsPerWI)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y0 = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < cols && y < rows)
|
||||
if (x < cols)
|
||||
{
|
||||
int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));
|
||||
int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
|
||||
|
||||
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
|
||||
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
|
||||
storepix(x == y ? scalar : (T)(0), srcptr + src_index);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,42 +44,58 @@
|
||||
#ifdef OP_MERGE
|
||||
|
||||
#define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,
|
||||
#define DECLARE_DATA(index) __global const T * src##index = \
|
||||
(__global T *)(src##index##ptr + mad24(src##index##_step, y, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset)));
|
||||
#define PROCESS_ELEM(index) dst[index] = src##index[0];
|
||||
#define DECLARE_INDEX(index) int src##index##_index = mad24(src##index##_step, y0, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset));
|
||||
#define PROCESS_ELEM(index) \
|
||||
__global const T * src##index = (__global const T *)(src##index##ptr + src##index##_index); \
|
||||
dst[index] = src##index[0]; \
|
||||
src##index##_index += src##index##_step;
|
||||
|
||||
__kernel void merge(DECLARE_SRC_PARAMS_N
|
||||
__global uchar * dstptr, int dst_step, int dst_offset,
|
||||
int rows, int cols)
|
||||
int rows, int cols, int rowsPerWI)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y0 = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < cols && y < rows)
|
||||
if (x < cols)
|
||||
{
|
||||
DECLARE_DATA_N
|
||||
__global T * dst = (__global T *)(dstptr + mad24(dst_step, y, mad24(x, (int)sizeof(T) * cn, dst_offset)));
|
||||
PROCESS_ELEMS_N
|
||||
DECLARE_INDEX_N
|
||||
int dst_index = mad24(dst_step, y0, mad24(x, (int)sizeof(T) * cn, dst_offset));
|
||||
|
||||
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dst_step)
|
||||
{
|
||||
__global T * dst = (__global T *)(dstptr + dst_index);
|
||||
|
||||
PROCESS_ELEMS_N
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#elif defined OP_SPLIT
|
||||
|
||||
#define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset
|
||||
#define DECLARE_DATA(index) __global T * dst##index = \
|
||||
(__global T *)(dst##index##ptr + mad24(y, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset)));
|
||||
#define PROCESS_ELEM(index) dst##index[0] = src[index];
|
||||
#define DECLARE_INDEX(index) int dst##index##_index = mad24(y0, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset));
|
||||
#define PROCESS_ELEM(index) \
|
||||
__global T * dst##index = (__global T *)(dst##index##ptr + dst##index##_index); \
|
||||
dst##index[0] = src[index]; \
|
||||
dst##index##_index += dst##index##_step;
|
||||
|
||||
__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS)
|
||||
__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS, int rowsPerWI)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
int y0 = get_global_id(1) * rowsPerWI;
|
||||
|
||||
if (x < cols && y < rows)
|
||||
if (x < cols)
|
||||
{
|
||||
DECLARE_DATA_N
|
||||
__global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, cn * (int)sizeof(T), src_offset)));
|
||||
PROCESS_ELEMS_N
|
||||
DECLARE_INDEX_N
|
||||
int src_index = mad24(y0, src_step, mad24(x, cn * (int)sizeof(T), src_offset));
|
||||
|
||||
for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)
|
||||
{
|
||||
__global const T * src = (__global const T *)(srcptr + src_index);
|
||||
|
||||
PROCESS_ELEMS_N
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user