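Changed integer operations to float for Intel devices in the OpenCL separable filter (ocl_sepFilter2D and the col_filter / row_filter / sep_filter kernels): on Intel the fixed-point kernel coefficients are kept as rounded float values and the final shift is replaced by a division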
This commit is contained in:
parent
da58425fbc
commit
01123aaa36
@ -3491,9 +3491,19 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
|
|||||||
rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
|
rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
|
||||||
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
|
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
|
||||||
{
|
{
|
||||||
bdepth = CV_32S;
|
if (ocl::Device::getDefault().isIntel())
|
||||||
kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
|
{
|
||||||
kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
|
for (int i=0; i<kernelX.cols; i++)
|
||||||
|
kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
|
||||||
|
if (kernelX.data != kernelY.data)
|
||||||
|
for (int i=0; i<kernelX.cols; i++)
|
||||||
|
kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
|
||||||
|
} else
|
||||||
|
{
|
||||||
|
bdepth = CV_32S;
|
||||||
|
kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
|
||||||
|
kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
|
||||||
|
}
|
||||||
int_arithm = true;
|
int_arithm = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,15 +97,19 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse
|
|||||||
{
|
{
|
||||||
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
|
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
|
||||||
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
|
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
|
||||||
#ifndef INTEGER_ARITHMETIC
|
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||||
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
|
|
||||||
#else
|
|
||||||
sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
|
sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
|
||||||
|
#else
|
||||||
|
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef INTEGER_ARITHMETIC
|
#ifdef INTEGER_ARITHMETIC
|
||||||
|
#ifdef INTEL_DEVICE
|
||||||
|
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
|
||||||
|
#else
|
||||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// write the result to dst
|
// write the result to dst
|
||||||
|
@ -141,12 +141,12 @@
|
|||||||
#define DIG(a) a,
|
#define DIG(a) a,
|
||||||
__constant dstT1 mat_kernel[] = { COEFF };
|
__constant dstT1 mat_kernel[] = { COEFF };
|
||||||
|
|
||||||
#ifndef INTEGER_ARITHMETIC
|
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||||
#define dstT4 float4
|
|
||||||
#define convertDstVec convert_float4
|
|
||||||
#else
|
|
||||||
#define dstT4 int4
|
#define dstT4 int4
|
||||||
#define convertDstVec convert_int4
|
#define convertDstVec convert_int4
|
||||||
|
#else
|
||||||
|
#define dstT4 float4
|
||||||
|
#define convertDstVec convert_float4
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
|
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
|
||||||
@ -263,10 +263,10 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
|
|||||||
{
|
{
|
||||||
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
|
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
|
||||||
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
|
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
|
||||||
#ifndef INTEGER_ARITHMETIC
|
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||||
sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
|
||||||
#else
|
|
||||||
sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||||
|
#else
|
||||||
|
sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -368,10 +368,10 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse
|
|||||||
{
|
{
|
||||||
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
|
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
|
||||||
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
|
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
|
||||||
#ifndef INTEGER_ARITHMETIC
|
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||||
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
|
|
||||||
#else
|
|
||||||
sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
|
sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||||
|
#else
|
||||||
|
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -162,10 +162,10 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
|||||||
{
|
{
|
||||||
sum = (WT) 0;
|
sum = (WT) 0;
|
||||||
for (i=0; i<=2*RADIUSY; i++)
|
for (i=0; i<=2*RADIUSY; i++)
|
||||||
#ifndef INTEGER_ARITHMETIC
|
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||||
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
|
||||||
#else
|
|
||||||
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||||
|
#else
|
||||||
|
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
|
||||||
#endif
|
#endif
|
||||||
lsmemDy[liy][clocX] = sum;
|
lsmemDy[liy][clocX] = sum;
|
||||||
clocX += BLK_X;
|
clocX += BLK_X;
|
||||||
@ -182,12 +182,18 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
|
|||||||
// and calculate final result
|
// and calculate final result
|
||||||
sum = 0.0f;
|
sum = 0.0f;
|
||||||
for (i=0; i<=2*RADIUSX; i++)
|
for (i=0; i<=2*RADIUSX; i++)
|
||||||
#ifndef INTEGER_ARITHMETIC
|
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
|
||||||
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
|
||||||
#else
|
|
||||||
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||||
|
#else
|
||||||
|
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef INTEGER_ARITHMETIC
|
||||||
|
#ifdef INTEL_DEVICE
|
||||||
|
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
|
||||||
|
#else
|
||||||
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// store result into destination image
|
// store result into destination image
|
||||||
|
Loading…
x
Reference in New Issue
Block a user