diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl index 44c29d9c6..9cc5a873c 100644 --- a/modules/imgproc/src/opencl/moments.cl +++ b/modules/imgproc/src/opencl/moments.cl @@ -1,5 +1,9 @@ /* See LICENSE file in the root OpenCV directory */ +#if TILE_SIZE > 16 +#error "TILE SIZE should be <= 16" +#endif + __kernel void moments(__global const uchar* src, int src_step, int src_offset, int src_rows, int src_cols, __global int* mom0, int xtiles) { @@ -15,30 +19,50 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, int m00=0, m10=0, m01=0, m20=0, m11=0, m02=0, m30=0, m21=0, m12=0, m03=0; __global const uchar* ptr = src + src_offset + y_min*src_step + x_min; __global int* mom = mom0 + (xtiles*y + x)*10; + x = x_max & -4; for( y = 0; y < y_max; y++, ptr += src_step ) { - int4 S = (int4)(0,0,0,0); + int4 S = (int4)(0,0,0,0), p; - for( x = 0; x <= x_max - 4; x += 4 ) + #define SUM_ELEM(elem, ofs) \ + (int4)(1, (ofs), ((ofs)*(ofs)), ((ofs)*(ofs)*(ofs)))*elem + if( x_max >= 4 ) { - int4 p = convert_int4(vload4(0, ptr + x)); - #define SUM_ELEM(elem, ofs) \ - (int4)(elem, (x+ofs)*elem, (x+ofs)*(x+ofs)*elem, (x+ofs)*(x+ofs)*(x+ofs)*elem) + p = convert_int4(vload4(0, ptr)); S += SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3); + + if( x_max >= 8 ) + { + p = convert_int4(vload4(0, ptr+4)); + S += SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7); + + if( x_max >= 12 ) + { + p = convert_int4(vload4(0, ptr+8)); + S += SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11); + + if( x_max >= 16 ) + { + p = convert_int4(vload4(0, ptr+12)); + S += SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15); + } + } + } } + if( x < x_max ) { int ps = ptr[x]; - S += SUM_ELEM(ps, 0); + S += SUM_ELEM(ps, x); if( x+1 < x_max ) { ps = ptr[x+1]; - S += SUM_ELEM(ps, 1); + S += SUM_ELEM(ps, x+1); if( x+2 < x_max ) { ps = ptr[x+2]; - S += SUM_ELEM(ps, 2); + S += SUM_ELEM(ps, x+2); } } } diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp index 52bccd6e9..45987dc08 100644 --- a/modules/imgproc/test/test_moments.cpp +++ b/modules/imgproc/test/test_moments.cpp @@ -43,6 +43,13 @@ using namespace cv; using namespace std; +#define OCL_TUNING_MODE 0 +#if OCL_TUNING_MODE +#define OCL_TUNING_MODE_ONLY(code) code +#else +#define OCL_TUNING_MODE_ONLY(code) +#endif + // image moments class CV_MomentsTest : public cvtest::ArrayTest { @@ -71,6 +78,7 @@ CV_MomentsTest::CV_MomentsTest() test_array[REF_OUTPUT].push_back(NULL); coi = -1; is_binary = false; + OCL_TUNING_MODE_ONLY(test_case_count = 10); //element_wise_relative_error = false; } @@ -97,7 +105,6 @@ void CV_MomentsTest::get_minmax_bounds( int i, int j, int type, Scalar& low, Sca } } - void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, vector >& sizes, vector >& types ) { @@ -115,6 +122,14 @@ void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx, if( cn == 2 || try_umat ) cn = 1; + + OCL_TUNING_MODE_ONLY( + cn = 1; + depth = CV_8U; + try_umat = true; + is_binary = false; + sizes[INPUT][0] = Size(1024,768) + ); types[INPUT][0] = CV_MAKETYPE(depth, cn); types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_64FC1; @@ -160,7 +175,16 @@ void CV_MomentsTest::run_func() { UMat u; test_mat[INPUT][0].clone().copyTo(u); + OCL_TUNING_MODE_ONLY( + static double ttime = 0; + static int ncalls = 0; + moments(u, is_binary != 0); + double t = (double)getTickCount()); Moments new_m = moments(u, is_binary != 0); + OCL_TUNING_MODE_ONLY( + ttime += (double)getTickCount() - t; + ncalls++; + printf("%g\n", ttime/ncalls/u.total())); *m = new_m; } else