Merge pull request #1254 from pengx17:2.4_filter2d_
This commit is contained in:
commit
6ae9870de4
@ -691,7 +691,7 @@ namespace cv
|
||||
//! returns 2D filter with the specified kernel
|
||||
// supports CV_8UC1 and CV_8UC4 types
|
||||
CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
|
||||
Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
|
||||
//! returns the non-separable linear filter engine
|
||||
CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
|
||||
|
@ -572,7 +572,7 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point, const int);
|
||||
typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const oclMat & , const Size &, const Point&, const int);
|
||||
|
||||
class LinearFilter_GPU : public BaseFilter_GPU
|
||||
{
|
||||
@ -591,21 +591,22 @@ public:
|
||||
};
|
||||
}
|
||||
|
||||
static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
|
||||
Size &ksize, const Point anchor, const int borderType)
|
||||
static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel,
|
||||
const Size &ksize, const Point& anchor, const int borderType)
|
||||
{
|
||||
CV_Assert(src.clCxt == dst.clCxt);
|
||||
CV_Assert((src.cols == dst.cols) &&
|
||||
(src.rows == dst.rows));
|
||||
CV_Assert((src.oclchannels() == dst.oclchannels()));
|
||||
CV_Assert((borderType != 0));
|
||||
CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
|
||||
CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
|
||||
CV_Assert(ksize.width == ksize.height);
|
||||
Context *clCxt = src.clCxt;
|
||||
int cn = src.oclchannels();
|
||||
int depth = src.depth();
|
||||
|
||||
string kernelName = "filter2D";
|
||||
int filterWidth = ksize.width;
|
||||
bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
|
||||
|
||||
string kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D";
|
||||
|
||||
size_t src_offset_x = (src.offset % src.step) / src.elemSize();
|
||||
size_t src_offset_y = src.offset / src.step;
|
||||
@ -613,55 +614,81 @@ static void GPUFilter2D(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
|
||||
size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize();
|
||||
size_t dst_offset_y = dst.offset / dst.step;
|
||||
|
||||
int paddingPixels = filterWidth & (-2);
|
||||
|
||||
size_t localThreads[3] = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 1 : 16, 1};
|
||||
size_t globalThreads[3] = {src.wholecols, src.wholerows, 1};
|
||||
|
||||
int cn = src.oclchannels();
|
||||
int src_step = (int)(src.step/src.elemSize());
|
||||
int dst_step = (int)(dst.step/src.elemSize());
|
||||
|
||||
int localWidth = localThreads[0] + paddingPixels;
|
||||
int localHeight = localThreads[1] + paddingPixels;
|
||||
|
||||
// 260 = divup((localThreads[0] + filterWidth * 2), 4) * 4
|
||||
// 6 = (ROWS_PER_GROUP_WHICH_IS_4 + filterWidth * 2)
|
||||
size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();
|
||||
|
||||
int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
|
||||
{4, 4, 1, 1, 1, 1, 1},
|
||||
{1, 1, 1, 1, 1, 1, 1},
|
||||
{4, 4, 4, 4, 1, 1, 4}
|
||||
{4, 4, 1, 1, 1, 1, 1},
|
||||
{1, 1, 1, 1, 1, 1, 1},
|
||||
{4, 4, 4, 4, 1, 1, 4}
|
||||
};
|
||||
|
||||
int vector_length = vector_lengths[cn - 1][depth];
|
||||
int offset_cols = (dst_offset_x) & (vector_length - 1);
|
||||
int cols = dst.cols + offset_cols;
|
||||
int rows = divUp(dst.rows, vector_length);
|
||||
|
||||
size_t localThreads[3] = {256, 1, 1};
|
||||
size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
|
||||
divUp(rows, localThreads[1]) *localThreads[1], 1
|
||||
};
|
||||
int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1));
|
||||
|
||||
vector< pair<size_t, const void *> > args;
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.step));
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src_step));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_step));
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
|
||||
args.push_back(make_pair(localMemSize, (void *)NULL));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_x));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset_y));
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&dst.step));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_x));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_y));
|
||||
args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
|
||||
const int buffer_size = 100;
|
||||
char opt_buffer [buffer_size] = "";
|
||||
sprintf(opt_buffer, "-DANCHOR=%d -DANX=%d -DANY=%d", ksize.width, anchor.x, anchor.y);
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, cn, depth, opt_buffer);
|
||||
char btype[30];
|
||||
switch (borderType)
|
||||
{
|
||||
case 0:
|
||||
sprintf(btype, "BORDER_CONSTANT");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(btype, "BORDER_REPLICATE");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(btype, "BORDER_REFLECT");
|
||||
break;
|
||||
case 3:
|
||||
CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case 4:
|
||||
sprintf(btype, "BORDER_REFLECT_101");
|
||||
break;
|
||||
}
|
||||
int type = src.depth();
|
||||
char build_options[150];
|
||||
sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width);
|
||||
openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
|
||||
}
|
||||
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
|
||||
Point anchor, int borderType)
|
||||
const Point &anchor, int borderType)
|
||||
{
|
||||
static const GPUFilter2D_t GPUFilter2D_callers[] = {0, GPUFilter2D, 0, GPUFilter2D, GPUFilter2D};
|
||||
|
||||
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType);
|
||||
|
||||
oclMat gpu_krnl;
|
||||
int nDivisor;
|
||||
normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, false);
|
||||
normalizeAnchor(anchor, ksize);
|
||||
Point norm_archor = anchor;
|
||||
normalizeKernel(kernel, gpu_krnl, CV_32FC1);
|
||||
normalizeAnchor(norm_archor, ksize);
|
||||
|
||||
return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)],
|
||||
borderType));
|
||||
|
@ -15,7 +15,9 @@
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Pang Erping, erping@multicorewareinc.com
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -42,49 +44,195 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#define BORDER_REFLECT_101
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////Macro for border type////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef BORDER_REPLICATE
|
||||
|
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT
|
||||
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT_101
|
||||
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_WRAP
|
||||
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
|
||||
#ifdef IMG_C_1_0
|
||||
#define T_IMG uchar
|
||||
#define T_IMGx4 uchar4
|
||||
#define T_IMG_C1 uchar
|
||||
#define CONVERT_TYPE convert_uchar_sat
|
||||
#define CONVERT_TYPEx4 convert_uchar4_sat
|
||||
#endif
|
||||
#ifdef IMG_C_4_0
|
||||
#define T_IMG uchar4
|
||||
#define T_IMGx4 uchar16
|
||||
#define T_IMG_C1 uchar
|
||||
#define CONVERT_TYPE convert_uchar4_sat
|
||||
#define CONVERT_TYPEx4 convert_uchar16_sat
|
||||
#endif
|
||||
#ifdef IMG_C_1_5
|
||||
#define T_IMG float
|
||||
#define T_IMGx4 float4
|
||||
#define T_IMG_C1 float
|
||||
#define CONVERT_TYPE convert_float
|
||||
#define CONVERT_TYPEx4 convert_float4
|
||||
#endif
|
||||
#ifdef IMG_C_4_5
|
||||
#define T_IMG float4
|
||||
#define T_IMGx4 float16
|
||||
#define T_IMG_C1 float
|
||||
#define CONVERT_TYPE convert_float4
|
||||
#define CONVERT_TYPEx4 convert_float16
|
||||
#endif
|
||||
|
||||
#ifndef CN
|
||||
#define CN 1
|
||||
#endif
|
||||
|
||||
#if CN == 1
|
||||
#define T_SUM float
|
||||
#define T_SUMx4 float4
|
||||
#define CONVERT_TYPE_SUM convert_float
|
||||
#define CONVERT_TYPE_SUMx4 convert_float4
|
||||
#define SUM_ZERO (0.0f)
|
||||
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
|
||||
#define VLOAD4 vload4
|
||||
#define SX x
|
||||
#define SY y
|
||||
#define SZ z
|
||||
#define SW w
|
||||
#elif CN == 4
|
||||
#define T_SUM float4
|
||||
#define T_SUMx4 float16
|
||||
#define CONVERT_TYPE_SUM convert_float4
|
||||
#define CONVERT_TYPE_SUMx4 convert_float16
|
||||
#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
|
||||
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
|
||||
#define VLOAD4 vload16
|
||||
#define SX s0123
|
||||
#define SY s4567
|
||||
#define SZ s89ab
|
||||
#define SW scdef
|
||||
#endif
|
||||
|
||||
#ifndef FILTER_SIZE
|
||||
#define FILTER_SIZE 3
|
||||
#endif
|
||||
|
||||
#define LOCAL_GROUP_SIZE 16
|
||||
|
||||
#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
|
||||
#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
|
||||
|
||||
#define FILTER_RADIUS (FILTER_SIZE >> 1)
|
||||
|
||||
__kernel void filter2D(
|
||||
__global T_IMG *src,
|
||||
__global T_IMG *dst,
|
||||
int src_step,
|
||||
int dst_step,
|
||||
__constant float *mat_kernel,
|
||||
__local T_IMG *local_data,
|
||||
int wholerows,
|
||||
int wholecols,
|
||||
int src_offset_x,
|
||||
int src_offset_y,
|
||||
int dst_offset_x,
|
||||
int dst_offset_y,
|
||||
int cols,
|
||||
int rows,
|
||||
int operate_cols
|
||||
)
|
||||
{
|
||||
int groupStartCol = get_group_id(0) * get_local_size(0);
|
||||
int groupStartRow = get_group_id(1) * get_local_size(1);
|
||||
|
||||
int localCol = get_local_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int globalCol = groupStartCol + localCol;
|
||||
int globalRow = groupStartRow + localRow;
|
||||
const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
|
||||
const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);
|
||||
#ifdef BORDER_CONSTANT
|
||||
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
|
||||
{
|
||||
int curRow = groupStartRow + i;
|
||||
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
|
||||
{
|
||||
int curCol = groupStartCol + j;
|
||||
if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y||
|
||||
curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x)
|
||||
{
|
||||
local_data[(i) * LOCAL_WIDTH + j] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
|
||||
{
|
||||
int curRow = groupStartRow + i;
|
||||
|
||||
curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);
|
||||
|
||||
curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS);
|
||||
|
||||
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
|
||||
{
|
||||
int curCol = groupStartCol + j;
|
||||
curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
|
||||
curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS);
|
||||
if(curRow < wholerows && curCol < wholecols)
|
||||
{
|
||||
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(globalRow < rows && globalCol < cols)
|
||||
{
|
||||
T_SUM sum = (T_SUM)SUM_ZERO;
|
||||
int filterIdx = 0;
|
||||
for(int i = 0; i < FILTER_SIZE; i++)
|
||||
{
|
||||
int offset = (i + localRow) * LOCAL_WIDTH;
|
||||
|
||||
for(int j = 0; j < FILTER_SIZE; j++)
|
||||
{
|
||||
sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++];
|
||||
}
|
||||
}
|
||||
dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum);
|
||||
}
|
||||
}
|
||||
|
||||
/// following is specific for 3x3 kernels
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////Macro for define elements number per thread/////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//#define ANCHOR 3
|
||||
//#define ANX 1
|
||||
//#define ANY 1
|
||||
#define ANX 1
|
||||
#define ANY 1
|
||||
|
||||
#define ROWS_PER_GROUP 4
|
||||
#define ROWS_PER_GROUP_BITS 2
|
||||
@ -101,10 +249,23 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
|
||||
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
|
||||
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
|
||||
int cols,int rows, int operate_cols, int wholecols, int wholerows)
|
||||
__kernel void filter2D_3x3(
|
||||
__global T_IMG *src,
|
||||
__global T_IMG *dst,
|
||||
int src_step,
|
||||
int dst_step,
|
||||
__constant float *mat_kernel,
|
||||
__local T_IMG *local_data,
|
||||
int wholerows,
|
||||
int wholecols,
|
||||
int src_offset_x,
|
||||
int src_offset_y,
|
||||
int dst_offset_x,
|
||||
int dst_offset_y,
|
||||
int cols,
|
||||
int rows,
|
||||
int operate_cols
|
||||
)
|
||||
{
|
||||
int gX = get_global_id(0);
|
||||
int gY = get_global_id(1);
|
||||
@ -118,7 +279,6 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
|
||||
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
|
||||
|
||||
__local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
|
||||
if((gY << 2) < rows)
|
||||
{
|
||||
for(int i = 0; i < ROWS_FETCH; ++i)
|
||||
@ -129,19 +289,19 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
uchar data = *(src + selected_row * src_step + selected_cols);
|
||||
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
|
||||
int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX ] =data;
|
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
|
||||
data = *(src + selected_row * src_step + selected_cols);
|
||||
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
data = src[mad24(selected_row, src_step, selected_cols)];
|
||||
con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
|
||||
}
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
@ -150,17 +310,17 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
|
||||
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
|
||||
|
||||
uchar data = *(src + selected_row * src_step + selected_cols);
|
||||
T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
|
||||
|
||||
local_data[i * LOCAL_MEM_STEP + lX ] =data;
|
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
|
||||
|
||||
data = *(src + selected_row * src_step + selected_cols);
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
data = src[mad24(selected_row, src_step, selected_cols)];
|
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -177,355 +337,40 @@ __kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x
|
||||
|
||||
int dst_rows_end = dst_offset_y + rows;
|
||||
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
|
||||
dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index));
|
||||
T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;
|
||||
|
||||
int4 sum = (int4)(0);
|
||||
uchar4 data;
|
||||
T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
|
||||
T_IMGx4 data;
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
for(int i = 0; i < FILTER_SIZE; i++)
|
||||
{
|
||||
#pragma unroll
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
for(int j = 0; j < FILTER_SIZE; j++)
|
||||
{
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
|
||||
data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
|
||||
sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
|
||||
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
|
||||
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
|
||||
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
|
||||
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
|
||||
T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
|
||||
tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
|
||||
tmp_dst.SX : dst_data.SX;
|
||||
tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
|
||||
tmp_dst.SY : dst_data.SY;
|
||||
tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
|
||||
tmp_dst.SZ : dst_data.SZ;
|
||||
tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
|
||||
tmp_dst.SW : dst_data.SW;
|
||||
*(__global T_IMGx4 *)dst = tmp_dst;
|
||||
}
|
||||
}
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
|
||||
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
|
||||
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
|
||||
int cols,int rows, int operate_cols, int wholecols, int wholerows)
|
||||
{
|
||||
int gX = get_global_id(0);
|
||||
int gY = get_global_id(1);
|
||||
|
||||
int lX = get_local_id(0);
|
||||
|
||||
int groupX_size = get_local_size(0);
|
||||
int groupX_id = get_group_id(0);
|
||||
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
|
||||
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
|
||||
|
||||
__local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
|
||||
if(((gY << 2) < rows))
|
||||
{
|
||||
for(int i = 0; i < ROWS_FETCH; ++i)
|
||||
{
|
||||
if((rows_start_index - src_offset_y) + i < rows + ANY)
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
|
||||
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX ] =data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
|
||||
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
|
||||
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
|
||||
|
||||
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
|
||||
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
|
||||
|
||||
float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
|
||||
local_data[i * LOCAL_MEM_STEP + lX] =data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
|
||||
|
||||
data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
|
||||
if(((gY << 2) < rows) && (process_col < operate_cols))
|
||||
{
|
||||
int dst_cols_start = dst_offset_x;
|
||||
int dst_cols_end = dst_offset_x + cols;
|
||||
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
|
||||
|
||||
int dst_rows_end = dst_offset_y + rows;
|
||||
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
|
||||
|
||||
float4 dst_data = *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
|
||||
|
||||
float4 sum = (float4)(0);
|
||||
float4 data;
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
{
|
||||
#pragma unroll
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
|
||||
data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
|
||||
sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
|
||||
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
|
||||
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
|
||||
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
|
||||
|
||||
*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
|
||||
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
|
||||
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
|
||||
int cols,int rows, int operate_cols, int wholecols, int wholerows)
|
||||
{
|
||||
int gX = get_global_id(0);
|
||||
int gY = get_global_id(1);
|
||||
|
||||
int lX = get_local_id(0);
|
||||
|
||||
int groupX_size = get_local_size(0);
|
||||
int groupX_id = get_group_id(0);
|
||||
|
||||
#define dst_align (dst_offset_x & 3)
|
||||
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
|
||||
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
|
||||
|
||||
__local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];
|
||||
|
||||
if(((gY << 2) < rows))
|
||||
{
|
||||
for(int i = 0; i < ROWS_FETCH; ++i)
|
||||
{
|
||||
if((rows_start_index - src_offset_y) + i < rows + ANY)
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
|
||||
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX ] =data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
|
||||
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
|
||||
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
|
||||
|
||||
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
|
||||
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
|
||||
|
||||
uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
|
||||
|
||||
local_data[i * LOCAL_MEM_STEP + lX] =data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
|
||||
|
||||
data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
|
||||
if(((gY << 2) < rows) && (process_col < operate_cols))
|
||||
{
|
||||
int dst_cols_start = dst_offset_x;
|
||||
int dst_cols_end = dst_offset_x + cols;
|
||||
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
|
||||
|
||||
int dst_rows_end = dst_offset_y + rows;
|
||||
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
|
||||
|
||||
uchar16 dst_data;
|
||||
dst_data = *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
|
||||
|
||||
int16 sum = (int16)(0);
|
||||
uchar16 data;
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
{
|
||||
#pragma unroll
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
|
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
|
||||
|
||||
data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
|
||||
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(dst_rows_index < dst_rows_end)
|
||||
{
|
||||
uchar16 sum1 = convert_uchar16_sat(sum);
|
||||
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
|
||||
sum1.s0123 : dst_data.s0123;
|
||||
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
|
||||
sum1.s4567 : dst_data.s4567;
|
||||
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
|
||||
sum1.s89ab : dst_data.s89ab;
|
||||
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
|
||||
sum1.scdef : dst_data.scdef;
|
||||
|
||||
*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
|
||||
}
|
||||
}
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////32FC4////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#define ROWS_FETCH_C4 (1 + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
|
||||
#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4)
|
||||
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
|
||||
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
|
||||
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
|
||||
int cols,int rows, int operate_cols, int wholecols, int wholerows)
|
||||
{
|
||||
int gX = get_global_id(0);
|
||||
int gY = get_global_id(1);
|
||||
|
||||
int lX = get_local_id(0);
|
||||
|
||||
int groupX_size = get_local_size(0);
|
||||
int groupX_id = get_group_id(0);
|
||||
|
||||
int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
|
||||
int rows_start_index = src_offset_y + gY - ANY;
|
||||
|
||||
__local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
|
||||
if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
|
||||
{
|
||||
for(int i = 0; i < ROWS_FETCH_C4; ++i)
|
||||
{
|
||||
if((rows_start_index - src_offset_y) + i < rows + ANY)
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
int selected_row = rows_start_index + i;
|
||||
int selected_cols = cols_start_index_group + lX;
|
||||
|
||||
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
|
||||
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX ] =data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
|
||||
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
|
||||
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
|
||||
data = con ? data : 0;
|
||||
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
|
||||
}
|
||||
#else
|
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
|
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
|
||||
|
||||
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
|
||||
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
|
||||
|
||||
float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
|
||||
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;
|
||||
|
||||
if(lX < (ANX << 1))
|
||||
{
|
||||
selected_cols = cols_start_index_group + lX + groupX_size;
|
||||
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
|
||||
|
||||
data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
|
||||
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if((gY < rows) && (gX < operate_cols))
|
||||
{
|
||||
int dst_cols_index = dst_offset_x + gX;
|
||||
int dst_rows_index = dst_offset_y + gY;
|
||||
|
||||
float4 sum = (float4)(0);
|
||||
|
||||
for(int i = 0; i < ANCHOR; i++)
|
||||
{
|
||||
for(int j = 0; j < ANCHOR; j++)
|
||||
{
|
||||
int local_cols = lX + j;
|
||||
sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
|
||||
}
|
||||
}
|
||||
|
||||
*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
|
||||
}
|
||||
}
|
||||
|
@ -324,6 +324,35 @@ TEST_P(GaussianBlur, Mat)
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Filter2D
|
||||
struct Filter2D : FilterTestBase
|
||||
{
|
||||
int type;
|
||||
cv::Size ksize;
|
||||
int bordertype;
|
||||
Point anchor;
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
bordertype = GET_PARAM(3);
|
||||
Init(type);
|
||||
anchor = Point(-1,-1);
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(Filter2D, Mat)
|
||||
{
|
||||
cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
|
||||
for(int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
cv::filter2D(mat1_roi, dst_roi, -1, kernel, anchor, 0.0, bordertype);
|
||||
cv::ocl::filter2D(gmat1, gdst, -1, kernel, anchor, bordertype);
|
||||
Near(1);
|
||||
}
|
||||
}
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)),
|
||||
@ -331,7 +360,7 @@ INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Laplacian, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(Size(3, 3)),
|
||||
Values(Size(0, 0)), //not use
|
||||
@ -365,4 +394,10 @@ INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
|
||||
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Filter2D, testing::Combine(
|
||||
Values(CV_8UC1, CV_32FC1, CV_32FC4),
|
||||
Values(Size(3, 3), Size(15, 15), Size(25, 25)),
|
||||
Values(Size(0, 0)), //not use
|
||||
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REFLECT101, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT)));
|
||||
|
||||
#endif // HAVE_OPENCL
|
||||
|
Loading…
x
Reference in New Issue
Block a user