Enabled integer arithmetic for the row_filter_C1_D0 OpenCL kernel (the fast8uc1 row-filter path is no longer disabled when int_arithm is set)
This commit is contained in:
parent
a3825acee4
commit
fc10ffefb8
@ -3475,14 +3475,13 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
|
|||||||
|
|
||||||
int bdepth = CV_32F;
|
int bdepth = CV_32F;
|
||||||
bool int_arithm = false;
|
bool int_arithm = false;
|
||||||
if( sdepth == CV_8U &&
|
if( sdepth == CV_8U && ddepth == CV_8U &&
|
||||||
((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
|
rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
|
||||||
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
|
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
|
||||||
ddepth == CV_8U)))
|
|
||||||
{
|
{
|
||||||
bdepth = CV_32S;
|
bdepth = CV_32S;
|
||||||
_kernelX.getMat().reshape(1,1).convertTo( kernelX, CV_32S, 1 << shift_bits );
|
kernelX.convertTo( kernelX, CV_32S, 1 << shift_bits );
|
||||||
_kernelY.getMat().reshape(1,1).convertTo( kernelY, CV_32S, 1 << shift_bits );
|
kernelY.convertTo( kernelY, CV_32S, 1 << shift_bits );
|
||||||
int_arithm = true;
|
int_arithm = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3500,7 +3499,7 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
|
|||||||
src.locateROI(srcWholeSize, srcOffset);
|
src.locateROI(srcWholeSize, srcOffset);
|
||||||
|
|
||||||
bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
|
bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
|
||||||
src.cols % 4 == 0 && src.step % 4 == 0 && !int_arithm;
|
src.cols % 4 == 0 && src.step % 4 == 0;
|
||||||
|
|
||||||
Size srcSize = src.size();
|
Size srcSize = src.size();
|
||||||
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
|
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
|
||||||
|
@ -141,6 +141,14 @@
|
|||||||
#define DIG(a) a,
|
#define DIG(a) a,
|
||||||
__constant dstT1 mat_kernel[] = { COEFF };
|
__constant dstT1 mat_kernel[] = { COEFF };
|
||||||
|
|
||||||
|
#ifndef INTEGER_ARITHMETIC
|
||||||
|
#define dstT4 float4
|
||||||
|
#define convertDstVec convert_float4
|
||||||
|
#else
|
||||||
|
#define dstT4 int4
|
||||||
|
#define convertDstVec convert_int4
|
||||||
|
#endif
|
||||||
|
|
||||||
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
|
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
|
||||||
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
|
int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,
|
||||||
__global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,
|
__global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,
|
||||||
@ -156,7 +164,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
|
|||||||
int start_y = y + src_offset_y - radiusy;
|
int start_y = y + src_offset_y - radiusy;
|
||||||
int start_addr = mad24(start_y, src_step_in_pixel, start_x);
|
int start_addr = mad24(start_y, src_step_in_pixel, start_x);
|
||||||
|
|
||||||
float4 sum;
|
dstT4 sum;
|
||||||
uchar4 temp[READ_TIMES_ROW];
|
uchar4 temp[READ_TIMES_ROW];
|
||||||
|
|
||||||
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
|
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];
|
||||||
@ -250,19 +258,23 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
// read pixels from lds and calculate the result
|
// read pixels from lds and calculate the result
|
||||||
sum = convert_float4(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];
|
sum = convertDstVec(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];
|
||||||
for (int i = 1; i <= RADIUSX; ++i)
|
for (int i = 1; i <= RADIUSX; ++i)
|
||||||
{
|
{
|
||||||
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
|
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
|
||||||
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
|
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
|
||||||
sum += mad(convert_float4(temp[0]), mat_kernel[RADIUSX-i], convert_float4(temp[1]) * mat_kernel[RADIUSX + i]);
|
#ifndef INTEGER_ARITHMETIC
|
||||||
|
sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||||
|
#else
|
||||||
|
sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
start_addr = mad24(y, dst_step_in_pixel, x);
|
start_addr = mad24(y, dst_step_in_pixel, x);
|
||||||
|
|
||||||
// write the result to dst
|
// write the result to dst
|
||||||
if ((x+3<dst_cols) & (y<dst_rows))
|
if ((x+3<dst_cols) & (y<dst_rows))
|
||||||
*(__global float4*)&dst[start_addr] = sum;
|
*(__global dstT4*)&dst[start_addr] = sum;
|
||||||
else if ((x+2<dst_cols) && (y<dst_rows))
|
else if ((x+2<dst_cols) && (y<dst_rows))
|
||||||
{
|
{
|
||||||
dst[start_addr] = sum.x;
|
dst[start_addr] = sum.x;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user