Optimization of cv::pyrDown for 8UC1.

This commit is contained in:
Alexander Karsakov 2014-06-20 15:38:15 +04:00
parent 1db9cc3f0d
commit eeddda4701
2 changed files with 150 additions and 56 deletions

View File

@ -79,12 +79,16 @@
#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
#if kercn == 4
#define SRC4(_x,_y) convert_float4(*(__global const uchar4*)(srcData + mad24(_y, src_step, PIXSIZE * _x)))
#endif
#define noconvert
__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
const int x = get_global_id(0);
const int x = get_global_id(0)*kercn;
const int y = get_group_id(1);
__local FT smem[LOCAL_SIZE + 4];
@ -97,98 +101,185 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
FT co3 = 0.0625f;
const int src_y = 2*y;
int col;
if (src_y >= 2 && src_y < src_rows - 2 && x >= 2 && x < src_cols - 2)
if (src_y >= 2 && src_y < src_rows - 2)
{
sum = co3 * SRC(x, src_y - 2);
sum = sum + co2 * SRC(x, src_y - 1);
sum = sum + co1 * SRC(x, src_y );
sum = sum + co2 * SRC(x, src_y + 1);
sum = sum + co3 * SRC(x, src_y + 2);
#if kercn == 1
col = EXTRAPOLATE(x, src_cols);
sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum);
smem[2 + get_local_id(0)] = sum;
if (get_local_id(0) < 2)
#else
if (x < src_cols-4)
{
const int left_x = x - 2;
float4 sum4;
sum4 = co3* SRC4(x, src_y - 2);
sum4 = fma(co2, SRC4(x, src_y - 1), sum4);
sum4 = fma(co1, SRC4(x, src_y ), sum4);
sum4 = fma(co2, SRC4(x, src_y + 1), sum4);
sum4 = fma(co3, SRC4(x, src_y + 2), sum4);
sum = co3 * SRC(left_x, src_y - 2);
sum = sum + co2 * SRC(left_x, src_y - 1);
sum = sum + co1 * SRC(left_x, src_y );
sum = sum + co2 * SRC(left_x, src_y + 1);
sum = sum + co3 * SRC(left_x, src_y + 2);
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > LOCAL_SIZE - 3)
{
const int right_x = x + 2;
sum = co3 * SRC(right_x, src_y - 2);
sum = sum + co2 * SRC(right_x, src_y - 1);
sum = sum + co1 * SRC(right_x, src_y );
sum = sum + co2 * SRC(right_x, src_y + 1);
sum = sum + co3 * SRC(right_x, src_y + 2);
smem[4 + get_local_id(0)] = sum;
}
vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
}
else
{
int col = EXTRAPOLATE(x, src_cols);
sum = co3 * SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y - 1, src_rows));
sum = sum + co1 * SRC(col, EXTRAPOLATE(src_y , src_rows));
sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y + 1, src_rows));
sum = sum + co3 * SRC(col, EXTRAPOLATE(src_y + 2, src_rows));
smem[2 + get_local_id(0)] = sum;
for (int i=0; i<4; i++)
{
col = EXTRAPOLATE(x+i, src_cols);
sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum);
smem[2 + 4*get_local_id(0)+i] = sum;
}
}
#endif
if (get_local_id(0) < 2)
{
col = EXTRAPOLATE(x - 2, src_cols);
col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3 * SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y - 1, src_rows));
sum = sum + co1 * SRC(col, EXTRAPOLATE(src_y , src_rows));
sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y + 1, src_rows));
sum = sum + co3 * SRC(col, EXTRAPOLATE(src_y + 2, src_rows));
sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum);
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > LOCAL_SIZE - 3)
if (get_local_id(0) > 1 && get_local_id(0) < 4)
{
col = EXTRAPOLATE(x + 2, src_cols);
col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3* SRC(col, src_y - 2);
sum = fma(co2, SRC(col, src_y - 1), sum);
sum = fma(co1, SRC(col, src_y ), sum);
sum = fma(co2, SRC(col, src_y + 1), sum);
sum = fma(co3, SRC(col, src_y + 2), sum);
smem[LOCAL_SIZE + get_local_id(0)] = sum;
}
}
else // need extrapolate y
{
#if kercn == 1
col = EXTRAPOLATE(x, src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y - 1, src_rows));
sum = sum + co1 * SRC(col, EXTRAPOLATE(src_y , src_rows));
sum = sum + co2 * SRC(col, EXTRAPOLATE(src_y + 1, src_rows));
sum = sum + co3 * SRC(col, EXTRAPOLATE(src_y + 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[4 + get_local_id(0)] = sum;
smem[2 + get_local_id(0)] = sum;
#else
if (x < src_cols-4)
{
float4 sum4;
sum4 = co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows));
sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4);
sum4 = fma(co1, SRC4(x, EXTRAPOLATE(src_y , src_rows)), sum4);
sum4 = fma(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4);
sum4 = fma(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4);
vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
}
else
{
for (int i=0; i<4; i++)
{
col = EXTRAPOLATE(x+i, src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[2 + 4*get_local_id(0)+i] = sum;
}
}
#endif
if (get_local_id(0) < 2)
{
col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[get_local_id(0)] = sum;
}
if (get_local_id(0) > 1 && get_local_id(0) < 4)
{
col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = fma(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = fma(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = fma(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);
smem[LOCAL_SIZE + get_local_id(0)] = sum;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
#if kercn == 1
if (get_local_id(0) < LOCAL_SIZE / 2)
{
const int tid2 = get_local_id(0) * 2;
sum = co3 * smem[2 + tid2 - 2];
sum = sum + co2 * smem[2 + tid2 - 1];
sum = sum + co1 * smem[2 + tid2 ];
sum = sum + co2 * smem[2 + tid2 + 1];
sum = sum + co3 * smem[2 + tid2 + 2];
sum = co3* smem[2 + tid2 + 2];
#if cn == 1
sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2));
#else
sum = fma(co3, smem[2 + tid2 - 2], sum);
sum = fma(co2, smem[2 + tid2 - 1], sum);
sum = fma(co1, smem[2 + tid2 ], sum);
sum = fma(co2, smem[2 + tid2 + 1], sum);
#endif
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
if (dst_x < dst_cols)
storepix(convertToT(sum), dstData + y * dst_step + dst_x * PIXSIZE);
}
#else
int tid4 = get_local_id(0) * 4;
sum = co3* smem[2 + tid4 + 2];
sum = fma(co3, smem[2 + tid4 - 2], sum);
sum = fma(co2, smem[2 + tid4 - 1], sum);
sum = fma(co1, smem[2 + tid4 ], sum);
sum = fma(co2, smem[2 + tid4 + 1], sum);
int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
if (dst_x < dst_cols)
storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));
tid4 += 2;
dst_x += 1;
sum = co3* smem[2 + tid4 + 2];
sum = fma(co3, smem[2 + tid4 - 2], sum);
sum = fma(co2, smem[2 + tid4 - 1], sum);
sum = fma(co1, smem[2 + tid4 ], sum);
sum = fma(co2, smem[2 + tid4 + 1], sum);
if (dst_x < dst_cols)
storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));
#endif
}

View File

@ -405,10 +405,10 @@ typedef void (*PyrFunc)(const Mat&, Mat&, int);
static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (channels > 4 || (depth == CV_64F && !doubleSupport))
if (cn > 4 || (depth == CV_64F && !doubleSupport))
return false;
Size ssize = _src.size();
@ -423,17 +423,20 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
int float_depth = depth == CV_64F ? CV_64F : CV_32F;
const int local_size = 256;
int kercn = 1;
if (depth == CV_8U && cn == 1 && float_depth == CV_32F)
kercn = 4;
const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
"BORDER_REFLECT_101" };
char cvt[2][50];
String buildOptions = format(
"-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
"-D T1=%s -D cn=%d -D %s -D LOCAL_SIZE=%d",
ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
"-D T1=%s -D cn=%d -D kercn=%d -D %s -D LOCAL_SIZE=%d",
ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
ocl::convertTypeStr(float_depth, depth, cn, cvt[0]),
ocl::convertTypeStr(depth, float_depth, cn, cvt[1]),
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
ocl::typeToStr(depth), channels, borderMap[borderType], local_size
ocl::typeToStr(depth), cn, kercn, borderMap[borderType], local_size
);
ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions);
if (k.empty())
@ -441,8 +444,8 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
size_t localThreads[2] = { local_size, 1 };
size_t globalThreads[2] = { src.cols, dst.rows };
size_t localThreads[2] = { local_size/kercn, 1 };
size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, dst.rows };
return k.run(2, globalThreads, localThreads, false);
}