diff --git a/modules/ocl/src/opencl/pyr_down.cl b/modules/ocl/src/opencl/pyr_down.cl index 9fe8e8a97..e40ad3492 100644 --- a/modules/ocl/src/opencl/pyr_down.cl +++ b/modules/ocl/src/opencl/pyr_down.cl @@ -43,37 +43,6 @@ // //M*/ -//#pragma OPENCL EXTENSION cl_amd_printf : enable - - -uchar round_uchar_int(int v) -{ - return (uchar)((uint)v <= 255 ? v : v > 0 ? 255 : 0); -} - -uchar round_uchar_float(float v) -{ - return round_uchar_int(convert_int_sat_rte(v)); -} - -uchar4 round_uchar4_int4(int4 v) -{ - uchar4 result; - result.x = (uchar)(v.x <= 255 ? v.x : v.x > 0 ? 255 : 0); - result.y = (uchar)(v.y <= 255 ? v.y : v.y > 0 ? 255 : 0); - result.z = (uchar)(v.z <= 255 ? v.z : v.z > 0 ? 255 : 0); - result.w = (uchar)(v.w <= 255 ? v.w : v.w > 0 ? 255 : 0); - return result; -} - -uchar4 round_uchar4_float4(float4 v) -{ - return round_uchar4_int4(convert_int4_sat_rte(v)); -} - - - - int idx_row_low(int y, int last_row) { return abs(y) % (last_row + 1); @@ -104,6 +73,10 @@ int idx_col(int x, int last_col) return idx_col_low(idx_col_high(x, last_col), last_col); } +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_8UC1 /////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstCols) { const int x = get_global_id(0); @@ -211,10 +184,14 @@ __kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows, const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; if (dst_x < dstCols) - dst[y * dstStep + dst_x] = round_uchar_float(sum); + dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum); } } +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_8UC4 /////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstCols) { const int x = get_global_id(0); @@ -228,16 +205,16 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, const int last_row = srcRows - 1; const int last_col = srcCols - 1; - float4 co1 = 0.375f;//(float4)(0.375f, 0.375f, 0.375f, 0.375f); - float4 co2 = 0.25f;//(float4)(0.25f, 0.25f, 0.25f, 0.25f); - float4 co3 = 0.0625f;//(float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) { sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x])); - sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x])); - sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x])); - sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x])); + sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x])); + sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x])); + sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x])); sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x])); smem[2 + get_local_id(0)] = sum; @@ -247,9 +224,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, const int left_x = x - 2; sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x])); - sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x])); + sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x])); sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x])); smem[get_local_id(0)] = sum; @@ -260,9 +237,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, const int right_x = x + 2; sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x])); - sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x])); + sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x])); + sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x])); sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x])); smem[4 + get_local_id(0)] = sum; @@ -273,9 +250,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int col = idx_col(x, last_col); sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); - sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); + sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); smem[2 + get_local_id(0)] = sum; @@ -287,9 +264,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, col = idx_col(left_x, last_col); sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); - sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); + sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); smem[get_local_id(0)] = sum; @@ -302,9 +279,9 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, col = idx_col(right_x, last_col); sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); - sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); + sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); + sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); smem[4 + get_local_id(0)] = sum; @@ -318,18 +295,490 @@ __kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, const int tid2 = get_local_id(0) * 2; sum = co3 * smem[2 + tid2 - 2]; - sum = sum + co2 * smem[2 + tid2 - 1]; - sum = sum + co1 * smem[2 + tid2 ]; - sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; sum = sum + co3 * smem[2 + tid2 + 2]; const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; if (dst_x < dstCols) - dst[y * dstStep / 4 + dst_x] = round_uchar4_float4(sum); + dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum); } } +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16UC1 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float smem[256 + 4]; + + float sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16UC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]); + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16SC1 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float smem[256 + 4]; + + float sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; + sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; + sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; + sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = 0.0625f * smem[2 + tid2 - 2]; + sum = sum + 0.25f * smem[2 + tid2 - 1]; + sum = sum + 0.375f * smem[2 + tid2 ]; + sum = sum + 0.25f * smem[2 + tid2 + 1]; + sum = sum + 0.0625f * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16SC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + __local float4 smem[256 + 4]; + + float4 sum; + + const int src_y = 2*y; + const int last_row = srcRows - 1; + const int last_col = srcCols - 1; + + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; + + if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) + { + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]); + + smem[4 + get_local_id(0)] = sum; + } + } + else + { + int col = idx_col(x, last_col); + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[2 + get_local_id(0)] = sum; + + if (get_local_id(0) < 2) + { + const int left_x = x - 2; + + col = idx_col(left_x, last_col); + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[get_local_id(0)] = sum; + } + + if (get_local_id(0) > 253) + { + const int right_x = x + 2; + + col = idx_col(right_x, last_col); + + sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); + sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); + sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); + sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); + + smem[4 + get_local_id(0)] = sum; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (get_local_id(0) < 128) + { + const int tid2 = get_local_id(0) * 2; + + sum = co3 * smem[2 + tid2 - 2]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co3 * smem[2 + tid2 + 2]; + + const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; + + if (dst_x < dstCols) + dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum); + } +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_32FC1 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + __kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols) { const int x = get_global_id(0); @@ -441,6 +890,10 @@ __kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, } } +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_32FC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols) { const int x = get_global_id(0); @@ -454,16 +907,16 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, const int last_row = srcRows - 1; const int last_col = srcCols - 1; - float4 co1 = 0.375f;//(float4)(0.375f, 0.375f, 0.375f, 0.375f); - float4 co2 = 0.25f;//(float4)(0.25f, 0.25f, 0.25f, 0.25f); - float4 co3 = 0.0625f;//(float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + float4 co1 = 0.375f; + float4 co2 = 0.25f; + float4 co3 = 0.0625f; if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) { sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]; sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]; smem[2 + get_local_id(0)] = sum; @@ -473,9 +926,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, const int left_x = x - 2; sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]; sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]; smem[get_local_id(0)] = sum; @@ -486,9 +939,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, const int right_x = x + 2; sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]; sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]; smem[4 + get_local_id(0)] = sum; @@ -499,9 +952,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int col = idx_col(x, last_col); sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; smem[2 + get_local_id(0)] = sum; @@ -513,9 +966,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, col = idx_col(left_x, last_col); sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; smem[get_local_id(0)] = sum; @@ -528,9 +981,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, col = idx_col(right_x, last_col); sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; + sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; + sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; smem[4 + get_local_id(0)] = sum; @@ -544,9 +997,9 @@ __kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, const int tid2 = get_local_id(0) * 2; sum = co3 * smem[2 + tid2 - 2]; - sum = sum + co2 * smem[2 + tid2 - 1]; - sum = sum + co1 * smem[2 + tid2 ]; - sum = sum + co2 * smem[2 + tid2 + 1]; + sum = sum + co2 * smem[2 + tid2 - 1]; + sum = sum + co1 * smem[2 + tid2 ]; + sum = sum + co2 * smem[2 + tid2 + 1]; sum = sum + co3 * smem[2 + tid2 + 2]; const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl index 4afa7b710..f58205c02 100644 --- a/modules/ocl/src/opencl/pyr_up.cl +++ b/modules/ocl/src/opencl/pyr_up.cl @@ -46,18 +46,18 @@ // //M*/ -//#pragma OPENCL EXTENSION cl_amd_printf : enable - uchar get_valid_uchar(float data) { return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0); } + /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_8UC1 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// -__kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + +__kernel void pyrUp_C1_D0(__global uchar* src, __global uchar* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -144,15 +144,15 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst, if ((x < dstCols) && (y < dstRows)) dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum); - } /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_16UC1 ///////////////////////////////// /////////////////////////////////////////////////////////////////////// -__kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + +__kernel void pyrUp_C1_D2(__global ushort* src, __global ushort* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -245,16 +245,116 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst, sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; if ((x < dstCols) && (y < dstRows)) - dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum); + dst[x + y * dstStep] = convert_ushort_sat_rte(4.0f * sum); +} +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16SC1 ///////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrUp_C1_D3(__global short* src, __global short* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float s_srcPatch[10][10]; + __local float s_dstPatch[20][16]; + + srcStep = srcStep >> 1; + dstStep = dstStep >> 1; + srcOffset = srcOffset >> 1; + dstOffset = dstOffset >> 1; + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = (float)(src[srcx + srcy * srcStep]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + float sum = 0; + + const int evenFlag = (int)((get_local_id(0) & 1) == 0); + const int oddFlag = (int)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][get_local_id(0)]; + sum = sum + 0.375f * s_dstPatch[2 + tidy ][get_local_id(0)]; + sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][get_local_id(0)]; + sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + + if ((x < dstCols) && (y < dstRows)) + dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum); } /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_32FC1 ///////////////////////////////// /////////////////////////////////////////////////////////////////////// -__kernel void pyrUp_C1_D5(__global float* src,__global float* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + +__kernel void pyrUp_C1_D5(__global float* src, __global float* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -346,15 +446,15 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst, if ((x < dstCols) && (y < dstRows)) dst[x + y * dstStep] = (float)(4.0f * sum); - } /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_8UC4 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// -__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + +__kernel void pyrUp_C4_D0(__global uchar4* src, __global uchar4* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -451,17 +551,16 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst, sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) - { dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum); - } } /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_16UC4 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// -__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + +__kernel void pyrUp_C4_D2(__global ushort4* src, __global ushort4* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -560,17 +659,123 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst, sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)]; if ((x < dstCols) && (y < dstRows)) - { dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum); +} + +/////////////////////////////////////////////////////////////////////// +////////////////////////// CV_16SC4 ////////////////////////////////// +/////////////////////////////////////////////////////////////////////// + +__kernel void pyrUp_C4_D3(__global short4* src, __global short4* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + __local float4 s_srcPatch[10][10]; + __local float4 s_dstPatch[20][16]; + + srcOffset >>= 3; + dstOffset >>= 3; + srcStep >>= 3; + dstStep >>= 3; + + if( get_local_id(0) < 10 && get_local_id(1) < 10 ) + { + int srcx = (int)(get_group_id(0) * get_local_size(0) / 2 + get_local_id(0)) - 1; + int srcy = (int)(get_group_id(1) * get_local_size(1) / 2 + get_local_id(1)) - 1; + + srcx = abs(srcx); + srcx = min(srcCols - 1,srcx); + + srcy = abs(srcy); + srcy = min(srcRows -1 ,srcy); + + s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]); } + + barrier(CLK_LOCAL_MEM_FENCE); + + float4 sum = (float4)(0,0,0,0); + + const float4 evenFlag = (float4)((get_local_id(0) & 1) == 0); + const float4 oddFlag = (float4)((get_local_id(0) & 1) != 0); + const bool eveny = ((get_local_id(1) & 1) == 0); + const int tidx = get_local_id(0); + + float4 co1 = (float4)(0.375f, 0.375f, 0.375f, 0.375f); + float4 co2 = (float4)(0.25f, 0.25f, 0.25f, 0.25f); + float4 co3 = (float4)(0.0625f, 0.0625f, 0.0625f, 0.0625f); + + + if(eveny) + { + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx - 1) >> 1)]; + sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 1) >> 1)]; + sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (get_local_id(1) >> 1)][1 + ((tidx + 2) >> 1)]; + + } + + s_dstPatch[2 + get_local_id(1)][get_local_id(0)] = sum; + + if (get_local_id(1) < 2) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)]; + sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)]; + sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)]; + } + + s_dstPatch[get_local_id(1)][get_local_id(0)] = sum; + } + + if (get_local_id(1) > 13) + { + sum = 0; + + if (eveny) + { + sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx - 2) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx - 1) >> 1)]; + sum = sum + (evenFlag * co1) * s_srcPatch[9][1 + ((tidx ) >> 1)]; + sum = sum + ( oddFlag * co2) * s_srcPatch[9][1 + ((tidx + 1) >> 1)]; + sum = sum + (evenFlag * co3) * s_srcPatch[9][1 + ((tidx + 2) >> 1)]; + + } + s_dstPatch[4 + get_local_id(1)][get_local_id(0)] = sum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0; + + const int tidy = get_local_id(1); + + sum = sum + co3 * s_dstPatch[2 + tidy - 2][get_local_id(0)]; + sum = sum + co2 * s_dstPatch[2 + tidy - 1][get_local_id(0)]; + sum = sum + co1 * s_dstPatch[2 + tidy ][get_local_id(0)]; + sum = sum + co2 * s_dstPatch[2 + tidy + 1][get_local_id(0)]; + sum = sum + co3 * s_dstPatch[2 + tidy + 2][get_local_id(0)]; + + if ((x < dstCols) && (y < dstRows)) + dst[x + y * dstStep] = convert_short4_sat_rte(4.0f * sum); } /////////////////////////////////////////////////////////////////////// ////////////////////////// CV_32FC4 ////////////////////////////////// /////////////////////////////////////////////////////////////////////// -__kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, - int srcRows,int dstRows,int srcCols,int dstCols, - int srcOffset,int dstOffset,int srcStep,int dstStep) + +__kernel void pyrUp_C4_D5(__global float4* src, __global float4* dst, + int srcRows, int dstRows, int srcCols, int dstCols, + int srcOffset, int dstOffset, int srcStep, int dstStep) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -667,7 +872,5 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst, sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx]; if ((x < dstCols) && (y < dstRows)) - { dst[x + y * dstStep] = 4.0f * sum; - } } diff --git a/modules/ocl/src/pyrdown.cpp b/modules/ocl/src/pyrdown.cpp index 1e3e31de9..5043da05d 100644 --- a/modules/ocl/src/pyrdown.cpp +++ b/modules/ocl/src/pyrdown.cpp @@ -73,24 +73,11 @@ static void pyrdown_run(const oclMat &src, const oclMat &dst) CV_Assert(src.depth() != CV_8S); Context *clCxt = src.clCxt; - //int channels = dst.channels(); - //int depth = dst.depth(); - string kernelName = "pyrDown"; - //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1} - //}; - - //size_t vector_length = vector_lengths[channels-1][depth]; - //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - size_t localThreads[3] = { 256, 1, 1 }; size_t globalThreads[3] = { src.cols, dst.rows, 1}; - //int dst_step1 = dst.cols * dst.elemSize(); vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); @@ -107,7 +94,9 @@ static void pyrdown_run(const oclMat &src, const oclMat &dst) void cv::ocl::pyrDown(const oclMat &src, oclMat &dst) { - CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); + int depth = src.depth(), channels = src.channels(); + CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F); + CV_Assert(channels == 1 || channels == 3 || channels == 4); dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp index 14f20aa88..95a2915f4 100644 --- a/modules/ocl/src/pyrup.cpp +++ b/modules/ocl/src/pyrup.cpp @@ -61,6 +61,11 @@ namespace cv extern const char *pyr_up; void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst) { + int depth = src.depth(), channels = src.channels(); + + CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F); + CV_Assert(channels == 1 || channels == 3 || channels == 4); + dst.create(src.rows * 2, src.cols * 2, src.type()); Context *clCxt = src.clCxt; diff --git a/modules/ocl/test/test_pyramids.cpp b/modules/ocl/test/test_pyramids.cpp index 58179ac18..9070ee5aa 100644 --- a/modules/ocl/test/test_pyramids.cpp +++ b/modules/ocl/test/test_pyramids.cpp @@ -57,60 +57,63 @@ using namespace std; PARAM_TEST_CASE(PyrBase, MatType, int) { - int type; + int depth; int channels; + Mat dst_cpu; oclMat gdst; + virtual void SetUp() { - type = GET_PARAM(0); + depth = GET_PARAM(0); channels = GET_PARAM(1); } - }; /////////////////////// PyrDown ////////////////////////// -struct PyrDown : PyrBase {}; + +typedef PyrBase PyrDown; TEST_P(PyrDown, Mat) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { Size size(MWIDTH, MHEIGHT); - Mat src = randomMat(size, CV_MAKETYPE(type, channels)); + Mat src = randomMat(size, CV_MAKETYPE(depth, channels)); oclMat gsrc(src); pyrDown(src, dst_cpu); pyrDown(gsrc, gdst); - EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), type == CV_32F ? 1e-4f : 1.0f); + EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), depth == CV_32F ? 1e-4f : 1.0f); } } INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine( - Values(CV_8U, CV_32F), Values(1, 3, 4))); + Values(CV_8U, CV_16U, CV_16S, CV_32F), + Values(1, 3, 4))); /////////////////////// PyrUp ////////////////////////// -struct PyrUp : PyrBase {}; +typedef PyrBase PyrUp; TEST_P(PyrUp, Accuracy) { - for(int j = 0; j < LOOP_TIMES; j++) + for (int j = 0; j < LOOP_TIMES; j++) { Size size(MWIDTH, MHEIGHT); - Mat src = randomMat(size, CV_MAKETYPE(type, channels)); + Mat src = randomMat(size, CV_MAKETYPE(depth, channels)); oclMat gsrc(src); pyrUp(src, dst_cpu); pyrUp(gsrc, gdst); - EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (type == CV_32F ? 1e-4f : 1.0)); + EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (depth == CV_32F ? 1e-4f : 1.0)); } - } -INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, testing::Combine( - Values(CV_8U, CV_32F), Values(1, 3, 4))); +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, Combine( + Values(CV_8U, CV_16U, CV_16S, CV_32F), + Values(1, 3, 4))); #endif // HAVE_OPENCL