replaced factors computation by precomputed values, added kernel for

binary mode
This commit is contained in:
mlyashko 2014-06-20 11:55:17 +04:00
parent acebfcd24f
commit 44ffa42064
2 changed files with 111 additions and 65 deletions

View File

@ -369,11 +369,16 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1
#ifdef HAVE_OPENCL
static bool ocl_moments( InputArray _src, Moments& m)
static bool ocl_moments( InputArray _src, Moments& m, bool binary)
{
const int TILE_SIZE = 32;
const int K = 10;
ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d", TILE_SIZE));
ocl::Kernel k = ocl::Kernel("moments", ocl::imgproc::moments_oclsrc,
format("-D TILE_SIZE=%d%s",
TILE_SIZE,
binary ? " -D OP_MOMENTS_BINARY" : ""));
if( k.empty() )
return false;
@ -451,8 +456,8 @@ cv::Moments cv::moments( InputArray _src, bool binary )
return m;
#ifdef HAVE_OPENCL
if( !(ocl::useOpenCL() && type == CV_8UC1 && !binary &&
_src.isUMat() && ocl_moments(_src, m)) )
if( !(ocl::useOpenCL() && type == CV_8UC1 &&
_src.isUMat() && ocl_moments(_src, m, binary)) )
#endif
{
Mat mat = _src.getMat();

View File

@ -4,6 +4,7 @@
#error "TILE SIZE should be 32"
#endif
__kernel void moments(__global const uchar* src, int src_step, int src_offset,
int src_rows, int src_cols, __global int* mom0, int xtiles)
{
@ -29,22 +30,42 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset,
if (x_max >= 4)
{
p = convert_int4(vload4(0, ptr));
S += SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, 0, 0, 0) + (int4)(p.s1, p.s1, p.s1, p.s1) +
(int4)(p.s2, p.s2 * 2, p.s2 * 4, p.s2 * 8) + (int4)(p.s3, p.s3 * 3, p.s3 * 9, p.s3 * 27);
//SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3);
if (x_max >= 8)
{
p = convert_int4(vload4(0, ptr + 4));
S += SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, p.s0 * 4, p.s0 * 16, p.s0 * 64) + (int4)(p.s1, p.s1 * 5, p.s1 * 25, p.s1 * 125) +
(int4)(p.s2, p.s2 * 6, p.s2 * 36, p.s2 * 216) + (int4)(p.s3, p.s3 * 7, p.s3 * 49, p.s3 * 343);
//SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7);
if (x_max >= 12)
{
p = convert_int4(vload4(0, ptr + 8));
S += SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, p.s0 * 8, p.s0 * 64, p.s0 * 512) + (int4)(p.s1, p.s1 * 9, p.s1 * 81, p.s1 * 729) +
(int4)(p.s2, p.s2 * 10, p.s2 * 100, p.s2 * 1000) + (int4)(p.s3, p.s3 * 11, p.s3 * 121, p.s3 * 1331);
//SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11);
if (x_max >= 16)
{
p = convert_int4(vload4(0, ptr + 12));
S += SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, p.s0 * 12, p.s0 * 144, p.s0 * 1728) + (int4)(p.s1, p.s1 * 13, p.s1 * 169, p.s1 * 2197) +
(int4)(p.s2, p.s2 * 14, p.s2 * 196, p.s2 * 2744) + (int4)(p.s3, p.s3 * 15, p.s3 * 225, p.s3 * 3375);
//SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15);
}
}
}
@ -53,22 +74,42 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset,
if (x_max >= 20)
{
p = convert_int4(vload4(0, ptr + 16));
S += SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, p.s0 * 16, p.s0 * 256, p.s0 * 4096) + (int4)(p.s1, p.s1 * 17, p.s1 * 289, p.s1 * 4913) +
(int4)(p.s2, p.s2 * 18, p.s2 * 324, p.s2 * 5832) + (int4)(p.s3, p.s3 * 19, p.s3 * 361, p.s3 * 6859);
//SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19);
if (x_max >= 24)
{
p = convert_int4(vload4(0, ptr + 20));
S += SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, p.s0 * 20, p.s0 * 400, p.s0 * 8000) + (int4)(p.s1, p.s1 * 21, p.s1 * 441, p.s1 * 9261) +
(int4)(p.s2, p.s2 * 22, p.s2 * 484, p.s2 * 10648) + (int4)(p.s3, p.s3 * 23, p.s3 * 529, p.s3 * 12167);
//SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23);
if (x_max >= 28)
{
p = convert_int4(vload4(0, ptr + 24));
S += SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, p.s0 * 24, p.s0 * 576, p.s0 * 13824) + (int4)(p.s1, p.s1 * 25, p.s1 * 625, p.s1 * 15625) +
(int4)(p.s2, p.s2 * 26, p.s2 * 676, p.s2 * 17576) + (int4)(p.s3, p.s3 * 27, p.s3 * 729, p.s3 * 19683);
//SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27);
if (x_max >= 32)
{
p = convert_int4(vload4(0, ptr + 28));
S += SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31);
#ifdef OP_MOMENTS_BINARY
p = min(p, 1);
#endif
S += (int4)(p.s0, p.s0 * 28, p.s0 * 784, p.s0 * 21952) + (int4)(p.s1, p.s1 * 29, p.s1 * 841, p.s1 * 24389) +
(int4)(p.s2, p.s2 * 30, p.s2 * 900, p.s2 * 27000) + (int4)(p.s3, p.s3 * 31, p.s3 * 961, p.s3 * 29791);
//SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31);
}
}
}
@ -77,14 +118,23 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset,
if (x < x_max)
{
int ps = ptr[x];
#ifdef OP_MOMENTS_BINARY
ps = min(ps, 1);
#endif
S += SUM_ELEM(ps, x);
if (x + 1 < x_max)
{
ps = ptr[x + 1];
#ifdef OP_MOMENTS_BINARY
ps = min(ps, 1);
#endif
S += SUM_ELEM(ps, x + 1);
if (x + 2 < x_max)
{
ps = ptr[x + 2];
#ifdef OP_MOMENTS_BINARY
ps = min(ps, 1);
#endif
S += SUM_ELEM(ps, x + 2);
}
}
@ -129,19 +179,10 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset,
REDUCE(4);
REDUCE(2);
if( y == 0 )
if (y < 10)
{
__global int* momout = mom0 + (y0*xtiles + x0) * 10;
momout[0] = mom[0][0] + mom[1][0];
momout[1] = mom[0][1] + mom[1][1];
momout[2] = mom[0][2] + mom[1][2];
momout[3] = mom[0][3] + mom[1][3];
momout[4] = mom[0][4] + mom[1][4];
momout[5] = mom[0][5] + mom[1][5];
momout[6] = mom[0][6] + mom[1][6];
momout[7] = mom[0][7] + mom[1][7];
momout[8] = mom[0][8] + mom[1][8];
momout[9] = mom[0][9] + mom[1][9];
momout[y] = mom[0][y] + mom[1][y];
}
}
}