Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Brian Park 2014-10-29 23:15:22 -07:00
commit 5de5f26223
18 changed files with 1238 additions and 241 deletions

View File

@ -44,217 +44,251 @@
////////////////////////////////////////// stereoBM ////////////////////////////////////////////// ////////////////////////////////////////// stereoBM //////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef csize
#define MAX_VAL 32767 #define MAX_VAL 32767
void calcDisp(__local short * cost, __global short * disp, int uniquenessRatio, int mindisp, int ndisp, int w, #ifndef WSZ
__local int * bestDisp, __local int * bestCost, int d, int x, int y, int cols, int rows, int wsz2) #define WSZ 2
#endif
#define WSZ2 (WSZ / 2)
#ifdef DEFINE_KERNEL_STEREOBM
#define DISPARITY_SHIFT 4
#define FILTERED ((MIN_DISP - 1) << DISPARITY_SHIFT)
void calcDisp(__local short * cost, __global short * disp, int uniquenessRatio,
__local int * bestDisp, __local int * bestCost, int d, int x, int y, int cols, int rows)
{ {
short FILTERED = (mindisp - 1)<<4; int best_disp = *bestDisp, best_cost = *bestCost;
int best_disp = *bestDisp, best_cost = *bestCost, best_disp_back = ndisp - best_disp - 1; barrier(CLK_LOCAL_MEM_FENCE);
short c = cost[0]; short c = cost[0];
int thresh = best_cost + (best_cost * uniquenessRatio / 100); int thresh = best_cost + (best_cost * uniquenessRatio / 100);
bool notUniq = ( (c <= thresh) && (d < (best_disp_back - 1) || d > (best_disp_back + 1) ) ); bool notUniq = ( (c <= thresh) && (d < (best_disp - 1) || d > (best_disp + 1) ) );
if (notUniq) if (notUniq)
*bestCost = FILTERED; *bestCost = FILTERED;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if( *bestCost != FILTERED && x < cols-wsz2-mindisp && y < rows-wsz2 && d == best_disp_back) if( *bestCost != FILTERED && x < cols - WSZ2 - MIN_DISP && y < rows - WSZ2 && d == best_disp)
{ {
int y3 = (best_disp_back > 0) ? cost[-w] : cost[w], int d_aprox = 0;
y2 = c, int yp =0, yn = 0;
y1 = (best_disp_back < ndisp-1) ? cost[w] : cost[-w]; if ((0 < best_disp) && (best_disp < NUM_DISP - 1))
int d_aprox = y3+y1-2*y2 + abs(y3-y1);
disp[0] = (short)(((best_disp_back + mindisp)*256 + (d_aprox != 0 ? (y3-y1)*256/d_aprox : 0) + 15) >> 4);
}
}
int calcLocalIdx(int x, int y, int d, int w)
{ {
return d*2*w + (w - 1 - y + x); yp = cost[-2 * BLOCK_SIZE_Y];
yn = cost[2 * BLOCK_SIZE_Y];
d_aprox = yp + yn - 2 * c + abs(yp - yn);
}
disp[0] = (short)(((best_disp + MIN_DISP)*256 + (d_aprox != 0 ? (yp - yn) * 256 / d_aprox : 0) + 15) >> 4);
} }
void calcNewCoordinates(int * x, int * y, int nthread)
{
int oldX = *x - (1-nthread), oldY = *y;
*x = (oldX == oldY) ? (0*nthread + (oldX + 2)*(1-nthread) ) : (oldX+1)*(1-nthread) + (oldX+1)*nthread;
*y = (oldX == oldY) ? (0*(1-nthread) + (oldY + 1)*nthread) : oldY + 1*(1-nthread);
} }
short calcCostBorder(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, int nthread, short calcCostBorder(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, int nthread,
int wsz2, short * costbuf, int * h, int cols, int d, short cost, int winsize) short * costbuf, int *h, int cols, int d, short cost)
{ {
int head = (*h)%wsz; int head = (*h) % WSZ;
__global const uchar * left, * right; __global const uchar * left, * right;
int idx = mad24(y+wsz2*(2*nthread-1), cols, x+wsz2*(1-2*nthread)); int idx = mad24(y + WSZ2 * (2 * nthread - 1), cols, x + WSZ2 * (1 - 2 * nthread));
left = leftptr + idx; left = leftptr + idx;
right = rightptr + (idx - d); right = rightptr + (idx - d);
int shift = 1*nthread + cols*(1-nthread);
short costdiff = 0; short costdiff = 0;
for(int i = 0; i < winsize; i++) if (0 == nthread)
{
#pragma unroll
for (int i = 0; i < WSZ; i++)
{ {
costdiff += abs( left[0] - right[0] ); costdiff += abs( left[0] - right[0] );
left += shift; left += cols;
right += shift; right += cols;
}
}
else // (1 == nthread)
{
#pragma unroll
for (int i = 0; i < WSZ; i++)
{
costdiff += abs(left[i] - right[i]);
}
} }
cost += costdiff - costbuf[head]; cost += costdiff - costbuf[head];
costbuf[head] = costdiff; costbuf[head] = costdiff;
(*h) = (*h)%wsz + 1; *h = head + 1;
return cost; return cost;
} }
short calcCostInside(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y, short calcCostInside(__global const uchar * leftptr, __global const uchar * rightptr, int x, int y,
int wsz2, int cols, int d, short cost_up_left, short cost_up, short cost_left, int cols, int d, short cost_up_left, short cost_up, short cost_left)
int winsize)
{ {
__global const uchar * left, * right; __global const uchar * left, * right;
int idx = mad24(y-wsz2-1, cols, x-wsz2-1); int idx = mad24(y - WSZ2 - 1, cols, x - WSZ2 - 1);
left = leftptr + idx; left = leftptr + idx;
right = rightptr + (idx - d); right = rightptr + (idx - d);
int idx2 = winsize*cols; int idx2 = WSZ*cols;
uchar corrner1 = abs(left[0] - right[0]), uchar corrner1 = abs(left[0] - right[0]),
corrner2 = abs(left[winsize] - right[winsize]), corrner2 = abs(left[WSZ] - right[WSZ]),
corrner3 = abs(left[idx2] - right[idx2]), corrner3 = abs(left[idx2] - right[idx2]),
corrner4 = abs(left[idx2 + winsize] - right[idx2 + winsize]); corrner4 = abs(left[idx2 + WSZ] - right[idx2 + WSZ]);
return cost_up + cost_left - cost_up_left + corrner1 - return cost_up + cost_left - cost_up_left + corrner1 -
corrner2 - corrner3 + corrner4; corrner2 - corrner3 + corrner4;
} }
__kernel void stereoBM(__global const uchar * leftptr, __global const uchar * rightptr, __global uchar * dispptr, __kernel void stereoBM(__global const uchar * leftptr,
int disp_step, int disp_offset, int rows, int cols, int mindisp, int ndisp, __global const uchar * rightptr,
int preFilterCap, int textureTreshold, int uniquenessRatio, int sizeX, int sizeY, int winsize) __global uchar * dispptr, int disp_step, int disp_offset,
int rows, int cols, // rows, cols of left and right images, not disp
int textureTreshold, int uniquenessRatio)
{ {
int gx = get_global_id(0)*sizeX; int lz = get_local_id(0);
int gy = get_global_id(1)*sizeY; int gx = get_global_id(1) * BLOCK_SIZE_X;
int lz = get_local_id(2); int gy = get_global_id(2) * BLOCK_SIZE_Y;
int nthread = lz/ndisp; int nthread = lz / NUM_DISP;
int d = lz%ndisp; int disp_idx = lz % NUM_DISP;
int wsz2 = wsz/2;
__global short * disp; __global short * disp;
__global const uchar * left, * right; __global const uchar * left, * right;
__local short costFunc[csize]; __local short costFunc[2 * BLOCK_SIZE_Y * NUM_DISP];
__local short * cost; __local short * cost;
__local int best_disp[2]; __local int best_disp[2];
__local int best_cost[2]; __local int best_cost[2];
best_cost[nthread] = MAX_VAL; best_cost[nthread] = MAX_VAL;
best_disp[nthread] = MAX_VAL; best_disp[nthread] = -1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
short costbuf[wsz]; short costbuf[WSZ];
int head = 0; int head = 0;
int shiftX = wsz2 + ndisp + mindisp - 1; int shiftX = WSZ2 + NUM_DISP + MIN_DISP - 1;
int shiftY = wsz2; int shiftY = WSZ2;
int x = gx + shiftX, y = gy + shiftY, lx = 0, ly = 0; int x = gx + shiftX, y = gy + shiftY, lx = 0, ly = 0;
int costIdx = calcLocalIdx(lx, ly, d, sizeY); int costIdx = disp_idx * 2 * BLOCK_SIZE_Y + (BLOCK_SIZE_Y - 1);
cost = costFunc + costIdx; cost = costFunc + costIdx;
int tempcost = 0; int tempcost = 0;
if(x < cols-wsz2-mindisp && y < rows-wsz2) if (x < cols - WSZ2 - MIN_DISP && y < rows - WSZ2)
{ {
int shift = 1*nthread + cols*(1-nthread); if (0 == nthread)
for(int i = 0; i < winsize; i++)
{ {
int idx = mad24(y-wsz2+i*nthread, cols, x-wsz2+i*(1-nthread)); #pragma unroll
for (int i = 0; i < WSZ; i++)
{
int idx = mad24(y - WSZ2, cols, x - WSZ2 + i);
left = leftptr + idx; left = leftptr + idx;
right = rightptr + (idx - d); right = rightptr + (idx - disp_idx);
short costdiff = 0; short costdiff = 0;
for(int j = 0; j < winsize; j++) for(int j = 0; j < WSZ; j++)
{ {
costdiff += abs( left[0] - right[0] ); costdiff += abs( left[0] - right[0] );
left += shift; left += cols;
right += shift; right += cols;
} }
if(nthread==1) costbuf[i] = costdiff;
}
}
else // (1 == nthread)
{ {
tempcost += costdiff; #pragma unroll
for (int i = 0; i < WSZ; i++)
{
int idx = mad24(y - WSZ2 + i, cols, x - WSZ2);
left = leftptr + idx;
right = rightptr + (idx - disp_idx);
short costdiff = 0;
for (int j = 0; j < WSZ; j++)
{
costdiff += abs( left[j] - right[j]);
}
tempcost += costdiff;
costbuf[i] = costdiff;
} }
costbuf[head] = costdiff;
head++;
} }
} }
if (nthread == 1) if (nthread == 1)
{ {
cost[0] = tempcost; cost[0] = tempcost;
atomic_min(best_cost+nthread, tempcost); atomic_min(best_cost + 1, tempcost);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (best_cost[1] == tempcost) if (best_cost[1] == tempcost)
atomic_min(best_disp + 1, ndisp - d - 1); atomic_max(best_disp + 1, disp_idx);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int dispIdx = mad24(gy, disp_step, disp_offset + gx*(int)sizeof(short)); int dispIdx = mad24(gy, disp_step, mad24((int)sizeof(short), gx, disp_offset));
disp = (__global short *)(dispptr + dispIdx); disp = (__global short *)(dispptr + dispIdx);
calcDisp(cost, disp, uniquenessRatio, mindisp, ndisp, 2*sizeY, calcDisp(cost, disp, uniquenessRatio, best_disp + 1, best_cost + 1, disp_idx, x, y, cols, rows);
best_disp + 1, best_cost+1, d, x, y, cols, rows, wsz2);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
lx = 1 - nthread; lx = 1 - nthread;
ly = nthread; ly = nthread;
for(int i = 0; i < sizeY*sizeX/2; i++) for (int i = 0; i < BLOCK_SIZE_Y * BLOCK_SIZE_X / 2; i++)
{ {
x = (lx < sizeX) ? gx + shiftX + lx : cols; x = (lx < BLOCK_SIZE_X) ? gx + shiftX + lx : cols;
y = (ly < sizeY) ? gy + shiftY + ly : rows; y = (ly < BLOCK_SIZE_Y) ? gy + shiftY + ly : rows;
best_cost[nthread] = MAX_VAL; best_cost[nthread] = MAX_VAL;
best_disp[nthread] = MAX_VAL; best_disp[nthread] = -1;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
costIdx = calcLocalIdx(lx, ly, d, sizeY); costIdx = mad24(2 * BLOCK_SIZE_Y, disp_idx, (BLOCK_SIZE_Y - 1 - ly + lx));
if (0 > costIdx)
costIdx = BLOCK_SIZE_Y - 1;
cost = costFunc + costIdx; cost = costFunc + costIdx;
if (x < cols - WSZ2 - MIN_DISP && y < rows - WSZ2)
if(x < cols-wsz2-mindisp && y < rows-wsz2 )
{ {
tempcost = (ly * (1 - nthread) + lx * nthread == 0) ? tempcost = (ly * (1 - nthread) + lx * nthread == 0) ?
calcCostBorder(leftptr, rightptr, x, y, nthread, wsz2, costbuf, &head, cols, d, calcCostBorder(leftptr, rightptr, x, y, nthread, costbuf, &head, cols, disp_idx, cost[2*nthread-1]) :
cost[2*nthread-1], winsize) : calcCostInside(leftptr, rightptr, x, y, cols, disp_idx, cost[0], cost[1], cost[-1]);
calcCostInside(leftptr, rightptr, x, y, wsz2, cols, d,
cost[0], cost[1], cost[-1], winsize);
} }
cost[0] = tempcost; cost[0] = tempcost;
atomic_min(best_cost + nthread, tempcost); atomic_min(best_cost + nthread, tempcost);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (best_cost[nthread] == tempcost) if (best_cost[nthread] == tempcost)
atomic_min(best_disp + nthread, ndisp - d - 1); atomic_max(best_disp + nthread, disp_idx);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int dispIdx = mad24(gy+ly, disp_step, disp_offset + (gx+lx)*(int)sizeof(short)); dispIdx = mad24(gy + ly, disp_step, mad24((int)sizeof(short), (gx + lx), disp_offset));
disp = (__global short *)(dispptr + dispIdx); disp = (__global short *)(dispptr + dispIdx);
calcDisp(cost, disp, uniquenessRatio, mindisp, ndisp, 2*sizeY, calcDisp(cost, disp, uniquenessRatio, best_disp + nthread, best_cost + nthread, disp_idx, x, y, cols, rows);
best_disp + nthread, best_cost + nthread, d, x, y, cols, rows, wsz2);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
calcNewCoordinates(&lx, &ly, nthread); if (lx + nthread - 1 == ly)
{
lx = (lx + nthread + 1) * (1 - nthread);
ly = (ly + 1) * nthread;
}
else
{
lx += nthread;
ly = ly - nthread + 1;
} }
} }
}
#endif #endif //DEFINE_KERNEL_STEREOBM
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Norm Prefiler //////////////////////////////////////////// /////////////////////////////////////// Norm Prefiler ////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void prefilter_norm(__global unsigned char *input, __global unsigned char *output, __kernel void prefilter_norm(__global unsigned char *input, __global unsigned char *output,
int rows, int cols, int prefilterCap, int winsize, int scale_g, int scale_s) int rows, int cols, int prefilterCap, int scale_g, int scale_s)
{ {
// prefilterCap in range 1..63, checked in StereoBMImpl::compute
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
int wsz2 = winsize/2;
if(x < cols && y < rows) if(x < cols && y < rows)
{ {
@ -262,13 +296,13 @@ __kernel void prefilter_norm(__global unsigned char *input, __global unsigned ch
input[y * cols + max(x-1,0)] * 1 + input[ y * cols + x] * 4 + input[y * cols + min(x+1, cols-1)] * 1 + input[y * cols + max(x-1,0)] * 1 + input[ y * cols + x] * 4 + input[y * cols + min(x+1, cols-1)] * 1 +
input[min(y+1, rows-1) * cols + x] * 1; input[min(y+1, rows-1) * cols + x] * 1;
int cov2 = 0; int cov2 = 0;
for(int i = -wsz2; i < wsz2+1; i++) for(int i = -WSZ2; i < WSZ2+1; i++)
for(int j = -wsz2; j < wsz2+1; j++) for(int j = -WSZ2; j < WSZ2+1; j++)
cov2 += input[clamp(y+i, 0, rows-1) * cols + clamp(x+j, 0, cols-1)]; cov2 += input[clamp(y+i, 0, rows-1) * cols + clamp(x+j, 0, cols-1)];
int res = (cov1*scale_g - cov2*scale_s)>>10; int res = (cov1*scale_g - cov2*scale_s)>>10;
res = min(clamp(res, -prefilterCap, prefilterCap) + prefilterCap, 255); res = clamp(res, -prefilterCap, prefilterCap) + prefilterCap;
output[y * cols + x] = res & 0xFF; output[y * cols + x] = res;
} }
} }
@ -280,20 +314,21 @@ __kernel void prefilter_norm(__global unsigned char *input, __global unsigned ch
__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output, __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
int rows, int cols, int prefilterCap) int rows, int cols, int prefilterCap)
{ {
// prefilterCap in range 1..63, checked in StereoBMImpl::compute
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if(x < cols && y < rows) if(x < cols && y < rows)
{ {
output[y * cols + x] = min(prefilterCap, 255) & 0xFF; if (0 < x && !((y == rows-1) & (rows%2==1) ) )
}
if(x < cols && y < rows && x > 0 && !((y == rows-1)&(rows%2==1) ) )
{ {
int cov = input[ ((y > 0) ? y-1 : y+1) * cols + (x-1)] * (-1) + input[ ((y > 0) ? y-1 : y+1) * cols + ((x<cols-1) ? x+1 : x-1)] * (1) + int cov = input[ ((y > 0) ? y-1 : y+1) * cols + (x-1)] * (-1) + input[ ((y > 0) ? y-1 : y+1) * cols + ((x<cols-1) ? x+1 : x-1)] * (1) +
input[ (y) * cols + (x-1)] * (-2) + input[ (y) * cols + ((x<cols-1) ? x+1 : x-1)] * (2) + input[ (y) * cols + (x-1)] * (-2) + input[ (y) * cols + ((x<cols-1) ? x+1 : x-1)] * (2) +
input[((y<rows-1)?(y+1):(y-1))* cols + (x-1)] * (-1) + input[((y<rows-1)?(y+1):(y-1))* cols + ((x<cols-1) ? x+1 : x-1)] * (1); input[((y<rows-1)?(y+1):(y-1))* cols + (x-1)] * (-1) + input[((y<rows-1)?(y+1):(y-1))* cols + ((x<cols-1) ? x+1 : x-1)] * (1);
cov = min(clamp(cov, -prefilterCap, prefilterCap) + prefilterCap, 255); cov = clamp(cov, -prefilterCap, prefilterCap) + prefilterCap;
output[y * cols + x] = cov & 0xFF; output[y * cols + x] = cov;
}
else
output[y * cols + x] = prefilterCap;
} }
} }

View File

@ -88,7 +88,7 @@ struct StereoBMParams
static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap) static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap)
{ {
ocl::Kernel k("prefilter_norm", ocl::calib3d::stereobm_oclsrc); ocl::Kernel k("prefilter_norm", ocl::calib3d::stereobm_oclsrc, cv::format("-D WSZ=%d", winsize));
if(k.empty()) if(k.empty())
return false; return false;
@ -102,7 +102,7 @@ static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsi
size_t globalThreads[3] = { input.cols, input.rows, 1 }; size_t globalThreads[3] = { input.cols, input.rows, 1 };
k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols,
prefilterCap, winsize, scale_g, scale_s); prefilterCap, scale_g, scale_s);
return k.run(2, globalThreads, NULL, false); return k.run(2, globalThreads, NULL, false);
} }
@ -743,9 +743,16 @@ static bool ocl_stereobm( InputArray _left, InputArray _right,
int wsz = state->SADWindowSize; int wsz = state->SADWindowSize;
int wsz2 = wsz/2; int wsz2 = wsz/2;
int sizeX = std::max(11, 27 - ocl::Device::getDefault().maxComputeUnits() ), sizeY = sizeX-1, N = ndisp*2; ocl::Device devDef = ocl::Device::getDefault();
int sizeX = devDef.isIntel() ? 32 : std::max(11, 27 - devDef.maxComputeUnits()),
sizeY = sizeX - 1,
N = ndisp * 2;
ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, cv::format("-D csize=%d -D wsz=%d", (2*sizeY)*ndisp, wsz) ); cv::String opt = cv::format("-D DEFINE_KERNEL_STEREOBM -D MIN_DISP=%d -D NUM_DISP=%d"
" -D BLOCK_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D WSZ=%d",
mindisp, ndisp,
sizeX, sizeY, wsz);
ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, opt);
if(k.empty()) if(k.empty())
return false; return false;
@ -757,11 +764,10 @@ static bool ocl_stereobm( InputArray _left, InputArray _right,
Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) ); Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) );
UMat disp = (_disp.getUMat())(roi); UMat disp = (_disp.getUMat())(roi);
int globalX = disp.cols/sizeX, globalY = disp.rows/sizeY; int globalX = (disp.cols + sizeX - 1) / sizeX,
globalX += (disp.cols%sizeX) > 0 ? 1 : 0; globalY = (disp.rows + sizeY - 1) / sizeY;
globalY += (disp.rows%sizeY) > 0 ? 1 : 0; size_t globalThreads[3] = {N, globalX, globalY};
size_t globalThreads[3] = { globalX, globalY, N}; size_t localThreads[3] = {N, 1, 1};
size_t localThreads[3] = {1, 1, N};
int idx = 0; int idx = 0;
idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left)); idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left));
@ -769,15 +775,8 @@ static bool ocl_stereobm( InputArray _left, InputArray _right,
idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp)); idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp));
idx = k.set(idx, rows); idx = k.set(idx, rows);
idx = k.set(idx, cols); idx = k.set(idx, cols);
idx = k.set(idx, mindisp);
idx = k.set(idx, ndisp);
idx = k.set(idx, state->preFilterCap);
idx = k.set(idx, state->textureThreshold); idx = k.set(idx, state->textureThreshold);
idx = k.set(idx, state->uniquenessRatio); idx = k.set(idx, state->uniquenessRatio);
idx = k.set(idx, sizeX);
idx = k.set(idx, sizeY);
idx = k.set(idx, wsz);
return k.run(3, globalThreads, localThreads, false); return k.run(3, globalThreads, localThreads, false);
} }

View File

@ -618,7 +618,7 @@ CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noAr
InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(), InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),
OclVectorStrategy strat = OCL_VECTOR_DEFAULT); OclVectorStrategy strat = OCL_VECTOR_DEFAULT);
CV_EXPORTS int checkOptimalVectorWidth(int *vectorWidths, CV_EXPORTS int checkOptimalVectorWidth(const int *vectorWidths,
InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(), InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(), InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(), InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),

View File

@ -3275,13 +3275,26 @@ static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta ) static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
{ {
const ocl::Device & d = ocl::Device::getDefault(); const ocl::Device & d = ocl::Device::getDefault();
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
kercn = ocl::predictOptimalVectorWidth(_src, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = d.doubleFPConfig() > 0;
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool doubleSupport = d.doubleFPConfig() > 0;
if (!doubleSupport && depth == CV_64F) if (!doubleSupport && depth == CV_64F)
return false; return false;
_dst.create(_src.size(), CV_8UC(cn));
int kercn = 1;
if (d.isIntel())
{
static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1};
kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst,
noArray(), noArray(), noArray(),
noArray(), noArray(), noArray(),
noArray(), ocl::OCL_VECTOR_MAX);
}
else
kercn = ocl::predictOptimalVectorWidthMax(_src, _dst);
int rowsPerWI = d.isIntel() ? 4 : 1;
char cvt[2][50]; char cvt[2][50];
int wdepth = std::max(depth, CV_32F); int wdepth = std::max(depth, CV_32F);
String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s" String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
@ -3299,7 +3312,6 @@ static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha
return false; return false;
UMat src = _src.getUMat(); UMat src = _src.getUMat();
_dst.create(src.size(), CV_8UC(cn));
UMat dst = _dst.getUMat(); UMat dst = _dst.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),

View File

@ -693,7 +693,7 @@ static void GEMMStore_64fc( const Complexd* c_data, size_t c_step,
#ifdef HAVE_CLAMDBLAS #ifdef HAVE_CLAMDBLAS
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
InputArray matC, double beta, OutputArray matD, int flags ) InputArray matC, double beta, OutputArray matD, int flags )
{ {
int type = matA.type(), esz = CV_ELEM_SIZE(type); int type = matA.type(), esz = CV_ELEM_SIZE(type);
@ -775,6 +775,84 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
#endif #endif
#ifdef HAVE_OPENCL
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
InputArray matC, double beta, OutputArray matD, int flags )
{
int depth = matA.depth(), cn = matA.channels();
int type = CV_MAKETYPE(depth, cn);
CV_Assert( type == matB.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
const ocl::Device & dev = ocl::Device::getDefault();
bool doubleSupport = dev.doubleFPConfig() > 0;
if (!doubleSupport && depth == CV_64F)
return false;
bool haveC = matC.kind() != cv::_InputArray::NONE;
Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;
if (atrans)
sizeA = Size(sizeA.height, sizeA.width);
if (btrans)
sizeB = Size(sizeB.height, sizeB.width);
if (haveC && ctrans)
sizeC = Size(sizeC.height, sizeC.width);
Size sizeD(sizeB.width, sizeA.height);
CV_Assert( !haveC || matC.type() == type );
CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );
int max_wg_size = (int)dev.maxWorkGroupSize();
int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;
matD.create(sizeD, type);
UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
if (atrans)
A = A.t();
if (btrans)
B = B.t();
if (haveC)
ctrans ? transpose(matC, D) : matC.copyTo(D);
int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);
String opts = format("-D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d %s %s %s",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
cn, kercn, block_size,
(sizeA.width % block_size !=0) ? "-D NO_MULT" : "",
haveC ? "-D HAVE_C" : "",
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
if (k.empty())
return false;
if (depth == CV_64F)
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, alpha, beta);
else
k.args(ocl::KernelArg::ReadOnlyNoSize(A),
ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
ocl::KernelArg::ReadWrite(D, cn, kercn),
sizeA.width, (float)alpha, (float)beta);
size_t globalsize[2] = { sizeD.width * cn / kercn, sizeD.height};
size_t localsize[2] = { block_size, block_size};
return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
}
#endif
} }
void cv::gemm( InputArray matA, InputArray matB, double alpha, void cv::gemm( InputArray matA, InputArray matB, double alpha,
@ -783,6 +861,11 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
#ifdef HAVE_CLAMDBLAS #ifdef HAVE_CLAMDBLAS
CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() && CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() &&
matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes
ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags))
#endif
#ifdef HAVE_OPENCL
CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2,
ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags)) ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
#endif #endif
@ -2173,14 +2256,18 @@ typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, i
static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type ) static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{ {
const ocl::Device & d = ocl::Device::getDefault(); const ocl::Device & d = ocl::Device::getDefault();
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F),
kercn = ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = d.isIntel() ? 4 : 1;
bool doubleSupport = d.doubleFPConfig() > 0; bool doubleSupport = d.doubleFPConfig() > 0;
Size size = _src1.size(); Size size = _src1.size();
int depth = CV_MAT_DEPTH(type);
if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() ) if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
return false; return false;
_dst.create(size, type);
int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst),
rowsPerWI = d.isIntel() ? 4 : 1;
char cvt[2][50]; char cvt[2][50];
ocl::Kernel k("KF", ocl::core::arithm_oclsrc, ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s" format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
@ -2195,9 +2282,7 @@ static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, Outp
if (k.empty()) if (k.empty())
return false; return false;
UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(); UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();
_dst.create(size, type);
UMat dst = _dst.getUMat();
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),

View File

@ -3979,6 +3979,11 @@ public:
u->markDeviceMemMapped(false); u->markDeviceMemMapped(false);
CV_Assert( (retval = clEnqueueUnmapMemObject(q, CV_Assert( (retval = clEnqueueUnmapMemObject(q,
(cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS ); (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
if (Device::getDefault().isAMD())
{
// required for multithreaded applications (see stitching test)
CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
}
u->data = 0; u->data = 0;
} }
else if( u->copyOnMap() && u->deviceCopyObsolete() ) else if( u->copyOnMap() && u->deviceCopyObsolete() )
@ -4531,12 +4536,14 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat); return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
} }
int checkOptimalVectorWidth(int *vectorWidths, int checkOptimalVectorWidth(const int *vectorWidths,
InputArray src1, InputArray src2, InputArray src3, InputArray src1, InputArray src2, InputArray src3,
InputArray src4, InputArray src5, InputArray src6, InputArray src4, InputArray src5, InputArray src6,
InputArray src7, InputArray src8, InputArray src9, InputArray src7, InputArray src8, InputArray src9,
OclVectorStrategy strat) OclVectorStrategy strat)
{ {
CV_Assert(vectorWidths);
int ref_type = src1.type(); int ref_type = src1.type();
std::vector<size_t> offsets, steps, cols; std::vector<size_t> offsets, steps, cols;
@ -4624,6 +4631,9 @@ struct Image2D::Impl
static bool isFormatSupported(cl_image_format format) static bool isFormatSupported(cl_image_format format)
{ {
if (!haveOpenCL())
CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
cl_context context = (cl_context)Context::getDefault().ptr(); cl_context context = (cl_context)Context::getDefault().ptr();
// Figure out how many formats are supported by this context. // Figure out how many formats are supported by this context.
cl_uint numFormats = 0; cl_uint numFormats = 0;
@ -4647,6 +4657,10 @@ struct Image2D::Impl
void init(const UMat &src, bool norm, bool alias) void init(const UMat &src, bool norm, bool alias)
{ {
if (!haveOpenCL())
CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
CV_Assert(!src.empty());
CV_Assert(ocl::Device::getDefault().imageSupport()); CV_Assert(ocl::Device::getDefault().imageSupport());
int err, depth = src.depth(), cn = src.channels(); int err, depth = src.depth(), cn = src.channels();
@ -4656,6 +4670,9 @@ struct Image2D::Impl
if (!isFormatSupported(format)) if (!isFormatSupported(format))
CV_Error(Error::OpenCLApiCallError, "Image format is not supported"); CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
if (alias && !src.handle(ACCESS_RW))
CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
cl_context context = (cl_context)Context::getDefault().ptr(); cl_context context = (cl_context)Context::getDefault().ptr();
cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr(); cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
@ -4740,7 +4757,7 @@ bool Image2D::canCreateAlias(const UMat &m)
{ {
bool ret = false; bool ret = false;
const Device & d = ocl::Device::getDefault(); const Device & d = ocl::Device::getDefault();
if (d.imageFromBufferSupport()) if (d.imageFromBufferSupport() && !m.empty())
{ {
// This is the required pitch alignment in pixels // This is the required pitch alignment in pixels
uint pitchAlign = d.imagePitchAlignment(); uint pitchAlign = d.imagePitchAlignment();

View File

@ -0,0 +1,112 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#define TSIZE (int)sizeof(T)
#define WTSIZE (int)sizeof(WT)
#define IND_A mad24(y, A_step, A_offset)
#define IND_B mad24(x, WTSIZE, B_offset)
#define STEP_B B_step / WTSIZE
#define LOCAL_SIZE_ODD (LOCAL_SIZE + 1)
#if cn==2
#if kercn==2
#define MUL(a, b)\
{\
sum.x += fma(a.x, b.x, - a.y * b.y);\
sum.y += fma(a.x, b.y, a.y * b.x);\
}
#else
#define MUL(a, b)\
{\
sum.x += fma(a.x, b.x, - a.y * b.y);\
sum.y += fma(a.x, b.y, a.y * b.x);\
sum.z += fma(a.x, b.z, - a.y * b.w);\
sum.w += fma(a.x, b.w, a.y * b.z);\
}
#endif
#else
#define MUL(a, b) sum = fma(a, b, sum);
#endif
__kernel void gemm(__global const uchar * A_ptr, int A_step, int A_offset,
__global const uchar * B_ptr, int B_step, int B_offset,
__global uchar * D_ptr, int D_step, int D_offset, int D_rows, int D_cols,
int n, T1 alpha, T1 beta)
{
int x = get_global_id(0);
int y = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
__global const T* A = (__global const T*)(A_ptr + IND_A);
__global const WT* B = (__global const WT*)(B_ptr + IND_B);
WT sum = (WT)(0);
#if LOCAL_SIZE == 1
if (x < D_cols && y < D_rows)
{
for (int i = 0; i < n; ++i)
MUL(A[i], B[i*STEP_B]);
#else
__local T a_local[LOCAL_SIZE_ODD*LOCAL_SIZE];
__local WT b_local[LOCAL_SIZE_ODD*LOCAL_SIZE];
int reps;
#if NO_MULT
reps = (n + LOCAL_SIZE-1)/LOCAL_SIZE;
#else
reps = n/LOCAL_SIZE;
#endif
for (int p = 0; p < reps; ++p)
{
if (p * LOCAL_SIZE + lidx < n && y < D_rows)
a_local[mad24(lidy, LOCAL_SIZE_ODD, lidx)] = A[mad24(p, LOCAL_SIZE, lidx)];
if (p * LOCAL_SIZE + lidy < n && x < D_cols)
b_local[mad24(lidy, LOCAL_SIZE_ODD, lidx)] = B[mad24(p, LOCAL_SIZE, lidy)*STEP_B];
barrier(CLK_LOCAL_MEM_FENCE);
if (x < D_cols && y < D_rows)
{
#if NO_MULT
int ie = min(LOCAL_SIZE, n - p * LOCAL_SIZE);
for (int i = 0; i < ie; ++i)
#else
for (int i = 0; i < LOCAL_SIZE; ++i)
#endif
MUL(a_local[mad24(lidy, LOCAL_SIZE_ODD, i)], b_local[mad24(i, LOCAL_SIZE_ODD, lidx)]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (x < D_cols && y < D_rows)
{
#endif
__global WT* D = (__global WT*)(D_ptr + mad24(y, D_step, mad24(x, WTSIZE, D_offset)));
#if HAVE_C
D[0] = mad(alpha, sum, D[0]*beta);
#else
D[0] = alpha * sum;
#endif
}
}

View File

@ -0,0 +1,96 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#include "../test_precomp.hpp"
#include "opencv2/ts/ocl_test.hpp"
#ifdef HAVE_OPENCL
namespace cvtest {
namespace ocl {
TEST(Image2D, createAliasEmptyUMat)
{
if (cv::ocl::haveOpenCL())
{
UMat um;
EXPECT_FALSE(cv::ocl::Image2D::canCreateAlias(um));
}
else
std::cout << "OpenCL runtime not found. Test skipped." << std::endl;
}
TEST(Image2D, createImage2DWithEmptyUMat)
{
if (cv::ocl::haveOpenCL())
{
UMat um;
EXPECT_ANY_THROW(cv::ocl::Image2D image(um));
}
else
std::cout << "OpenCL runtime not found. Test skipped." << std::endl;
}
TEST(Image2D, createAlias)
{
if (cv::ocl::haveOpenCL())
{
const cv::ocl::Device & d = cv::ocl::Device::getDefault();
int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
// aliases is OpenCL 1.2 extension
if (1 < major || (1 == major && 2 <= minor))
{
UMat um(128, 128, CV_8UC1);
bool isFormatSupported = false, canCreateAlias = false;
EXPECT_NO_THROW(isFormatSupported = cv::ocl::Image2D::isFormatSupported(CV_8U, 1, false));
EXPECT_NO_THROW(canCreateAlias = cv::ocl::Image2D::canCreateAlias(um));
if (isFormatSupported && canCreateAlias)
{
EXPECT_NO_THROW(cv::ocl::Image2D image(um, false, true));
}
else
std::cout << "Impossible to create alias for selected image. Test skipped." << std::endl;
}
}
else
std::cout << "OpenCL runtime not found. Test skipped" << std::endl;
}
TEST(Image2D, turnOffOpenCL)
{
if (cv::ocl::haveOpenCL())
{
// save the current state
bool useOCL = cv::ocl::useOpenCL();
bool isFormatSupported = false;
cv::ocl::setUseOpenCL(true);
UMat um(128, 128, CV_8UC1);
cv::ocl::setUseOpenCL(false);
EXPECT_NO_THROW(isFormatSupported = cv::ocl::Image2D::isFormatSupported(CV_8U, 1, true));
if (isFormatSupported)
{
EXPECT_NO_THROW(cv::ocl::Image2D image(um));
}
else
std::cout << "CV_8UC1 is not supported for OpenCL images. Test skipped." << std::endl;
// reset state to the previous one
cv::ocl::setUseOpenCL(useOCL);
}
else
std::cout << "OpenCL runtime not found. Test skipped." << std::endl;
}
} } // namespace cvtest::ocl
#endif // HAVE_OPENCL

View File

@ -4848,7 +4848,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
bool ok = false; bool ok = false;
UMat src = _src.getUMat(), dst; UMat src = _src.getUMat(), dst;
Size sz = src.size(), dstSz = sz; Size sz = src.size(), dstSz = sz;
int scn = src.channels(), depth = src.depth(), bidx; int scn = src.channels(), depth = src.depth(), bidx, uidx, yidx;
int dims = 2, stripeSize = 1; int dims = 2, stripeSize = 1;
ocl::Kernel k; ocl::Kernel k;
@ -4857,6 +4857,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
ocl::Device dev = ocl::Device::getDefault(); ocl::Device dev = ocl::Device::getDefault();
int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1; int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
int pxPerWIx = 1;
size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy }; size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ", cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
@ -4960,17 +4961,107 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
opts + format("-D dcn=%d -D bidx=%d", dcn, bidx)); opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
break; break;
} }
case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV21:
case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV21:
{ {
CV_Assert( scn == 1 ); CV_Assert( scn == 1 );
CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U ); CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
dcn = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3; dcn = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ||
bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2; code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2RGBA_NV21 ? 4 : 3;
bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ||
code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 0 : 2;
uidx = code == COLOR_YUV2RGBA_NV21 || code == COLOR_YUV2RGB_NV21 ||
code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 1 : 0;
dstSz = Size(sz.width, sz.height * 2 / 3); dstSz = Size(sz.width, sz.height * 2 / 3);
k.create("YUV2RGB_NV12", ocl::imgproc::cvtcolor_oclsrc, globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
opts + format("-D dcn=%d -D bidx=%d", dcn, bidx)); k.create("YUV2RGB_NVx", ocl::imgproc::cvtcolor_oclsrc,
opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx));
break;
}
case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
{
CV_Assert( scn == 1 );
CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
dcn = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2RGBA_YV12 ||
code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2RGBA_IYUV ? 4 : 3;
bidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2BGR_IYUV ? 0 : 2;
uidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
code == COLOR_YUV2RGBA_YV12 || code == COLOR_YUV2RGB_YV12 ? 1 : 0;
dstSz = Size(sz.width, sz.height * 2 / 3);
globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
k.create("YUV2RGB_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
opts + format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
src.isContinuous() ? " -D SRC_CONT" : ""));
break;
}
case COLOR_YUV2GRAY_420:
{
if (dcn <= 0) dcn = 1;
CV_Assert( dcn == 1 );
CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
dstSz = Size(sz.width, sz.height * 2 / 3);
_dst.create(dstSz, CV_MAKETYPE(depth, dcn));
dst = _dst.getUMat();
src.rowRange(0, dstSz.height).copyTo(dst);
return true;
}
case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
{
if (dcn <= 0) dcn = 1;
bidx = code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ||
code == COLOR_BGRA2YUV_IYUV || code == COLOR_BGR2YUV_IYUV ? 0 : 2;
uidx = code == COLOR_RGBA2YUV_YV12 || code == COLOR_RGB2YUV_YV12 ||
code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ? 1 : 0;
CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
CV_Assert( dcn == 1 );
CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
dstSz = Size(sz.width, sz.height / 2 * 3);
_dst.create(dstSz, CV_MAKETYPE(depth, dcn));
dst = _dst.getUMat();
if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
dst.step % 4 == 0 && dst.offset % 4 == 0)
{
pxPerWIx = 2;
}
globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx));
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
return k.run(2, globalsize, NULL, false);
}
case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
{
if (dcn <= 0)
dcn = (code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2RGBA_YUY2 ||
code==COLOR_YUV2BGRA_YUY2 || code==COLOR_YUV2RGBA_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 4 : 3;
bidx = (code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2BGRA_YUY2 ||
code==COLOR_YUV2BGR_YUY2 || code==COLOR_YUV2BGRA_YVYU || code==COLOR_YUV2BGR_YVYU) ? 0 : 2;
yidx = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
uidx = (code==COLOR_YUV2RGB_YVYU || code==COLOR_YUV2RGBA_YVYU ||
code==COLOR_YUV2BGR_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 2 : 0;
uidx = 1 - yidx + uidx;
CV_Assert( dcn == 3 || dcn == 4 );
CV_Assert( scn == 2 && depth == CV_8U );
k.create("YUV2RGB_422", ocl::imgproc::cvtcolor_oclsrc,
opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
src.offset % 4 == 0 && src.step % 4 == 0 ? " -D USE_OPTIMIZED_LOAD" : ""));
break; break;
} }
case COLOR_BGR2YCrCb: case COLOR_BGR2YCrCb:

View File

@ -275,6 +275,12 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
_mask, blockSize, useHarrisDetector, harrisK)) _mask, blockSize, useHarrisDetector, harrisK))
Mat image = _image.getMat(), eig, tmp; Mat image = _image.getMat(), eig, tmp;
if (image.empty())
{
_corners.release();
return;
}
if( useHarrisDetector ) if( useHarrisDetector )
cornerHarris( image, eig, blockSize, 3, harrisK ); cornerHarris( image, eig, blockSize, 3, harrisK );
else else

View File

@ -111,6 +111,18 @@ enum
#define B_COMP w #define B_COMP w
#endif #endif
#ifndef uidx
#define uidx 0
#endif
#ifndef yidx
#define yidx 0
#endif
#ifndef PIX_PER_WI_X
#define PIX_PER_WI_X 1
#endif
#define __CAT(x, y) x##y #define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y) #define CAT(x, y) __CAT(x, y)
@ -141,7 +153,7 @@ __kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offs
#ifdef DEPTH_5 #ifdef DEPTH_5
dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f)); dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));
#else #else
dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, src_pix.R_COMP * R2Y)), yuv_shift); dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);
#endif #endif
++y; ++y;
src_index += src_step; src_index += src_step;
@ -222,7 +234,7 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset
#else #else
__constant int * coeffs = c_RGB2YUVCoeffs_i; __constant int * coeffs = c_RGB2YUVCoeffs_i;
const int delta = HALF_MAX * (1 << yuv_shift); const int delta = HALF_MAX * (1 << yuv_shift);
const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], r * coeffs[2])), yuv_shift); const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);
const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift); const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);
const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift); const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);
#endif #endif
@ -239,8 +251,8 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset
} }
} }
__constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; __constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };
__constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; __constant int c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };
__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset, __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dt_offset, __global uchar* dstptr, int dst_step, int dt_offset,
@ -271,9 +283,9 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset
float b = fma(U - HALF_MAX, coeffs[0], Y); float b = fma(U - HALF_MAX, coeffs[0], Y);
#else #else
__constant int * coeffs = c_YUV2RGBCoeffs_i; __constant int * coeffs = c_YUV2RGBCoeffs_i;
const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift); const int r = Y + CV_DESCALE(mul24(V - HALF_MAX, coeffs[3]), yuv_shift);
const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], (U - HALF_MAX) * coeffs[1]), yuv_shift); const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], mul24(U - HALF_MAX, coeffs[1])), yuv_shift);
const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift); const int b = Y + CV_DESCALE(mul24(U - HALF_MAX, coeffs[0]), yuv_shift);
#endif #endif
dst[bidx] = SAT_CAST( b ); dst[bidx] = SAT_CAST( b );
@ -289,15 +301,10 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset
} }
} }
} }
__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,
-0.812999725f, 1.5959997177f };
__constant int ITUR_BT_601_CY = 1220542; __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,
__constant int ITUR_BT_601_CUB = 2116026;
__constant int ITUR_BT_601_CUG = 409993;
__constant int ITUR_BT_601_CVG = 852492;
__constant int ITUR_BT_601_CVR = 1673527;
__constant int ITUR_BT_601_SHIFT = 20;
__kernel void YUV2RGB_NV12(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dt_offset, __global uchar* dstptr, int dst_step, int dt_offset,
int rows, int cols) int rows, int cols)
{ {
@ -313,49 +320,50 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int src_step, int src_o
{ {
__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset); __global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
__global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset); __global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset); __global uchar* dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
__global uchar* dst2 = dstptr + mad24((y << 1) + 1, dst_step, x * (dcn<<1) + dt_offset); __global uchar* dst2 = dst1 + dst_step;
int Y1 = ysrc[0]; float Y1 = ysrc[0];
int Y2 = ysrc[1]; float Y2 = ysrc[1];
int Y3 = ysrc[src_step]; float Y3 = ysrc[src_step];
int Y4 = ysrc[src_step + 1]; float Y4 = ysrc[src_step + 1];
int U = usrc[0] - 128; float U = ((float)usrc[uidx]) - HALF_MAX;
int V = usrc[1] - 128; float V = ((float)usrc[1-uidx]) - HALF_MAX;
int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V; __constant float* coeffs = c_YUV2RGBCoeffs_420;
int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U; float ruv = fma(coeffs[4], V, 0.5f);
int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U; float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
float buv = fma(coeffs[1], U, 0.5f);
Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY; Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT); dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT); dst1[1] = convert_uchar_sat(Y1 + guv);
dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT); dst1[bidx] = convert_uchar_sat(Y1 + buv);
#if dcn == 4 #if dcn == 4
dst1[3] = 255; dst1[3] = 255;
#endif #endif
Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY; Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT); dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT); dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT); dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
#if dcn == 4 #if dcn == 4
dst1[7] = 255; dst1[7] = 255;
#endif #endif
Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY; Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT); dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT); dst2[1] = convert_uchar_sat(Y3 + guv);
dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT); dst2[bidx] = convert_uchar_sat(Y3 + buv);
#if dcn == 4 #if dcn == 4
dst2[3] = 255; dst2[3] = 255;
#endif #endif
Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY; Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT); dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT); dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT); dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
#if dcn == 4 #if dcn == 4
dst2[7] = 255; dst2[7] = 255;
#endif #endif
@ -365,6 +373,247 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int src_step, int src_o
} }
} }
__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dt_offset,
int rows, int cols)
{
int x = get_global_id(0);
int y = get_global_id(1) * PIX_PER_WI_Y;
if (x < cols / 2)
{
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows / 2 )
{
__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
__global uchar* dst2 = dst1 + dst_step;
float Y1 = ysrc[0];
float Y2 = ysrc[1];
float Y3 = ysrc[src_step];
float Y4 = ysrc[src_step + 1];
#ifdef SRC_CONT
__global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);
int u_ind = mad24(y, cols >> 1, x);
float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX };
#else
int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};
__global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);
__global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
float uv[2] = { ((float)usrc[0]) - HALF_MAX, ((float)vsrc[0]) - HALF_MAX };
#endif
float U = uv[uidx];
float V = uv[1-uidx];
__constant float* coeffs = c_YUV2RGBCoeffs_420;
float ruv = fma(coeffs[4], V, 0.5f);
float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
float buv = fma(coeffs[1], U, 0.5f);
Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
dst1[1] = convert_uchar_sat(Y1 + guv);
dst1[bidx] = convert_uchar_sat(Y1 + buv);
#if dcn == 4
dst1[3] = 255;
#endif
Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
#if dcn == 4
dst1[7] = 255;
#endif
Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
dst2[1] = convert_uchar_sat(Y3 + guv);
dst2[bidx] = convert_uchar_sat(Y3 + buv);
#if dcn == 4
dst2[3] = 255;
#endif
Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
#if dcn == 4
dst2[7] = 255;
#endif
}
++y;
}
}
}
__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
0.438999176f, -0.3679990768f, -0.0709991455f };
__kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dst_offset,
int rows, int cols)
{
int x = get_global_id(0) * PIX_PER_WI_X;
int y = get_global_id(1) * PIX_PER_WI_Y;
if (x < cols/2)
{
int src_index = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
int y_rows = rows / 3 * 2;
int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
__constant float* coeffs = c_RGB2YUVCoeffs_420;
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows / 3)
{
__global const uchar* src1 = srcptr + src_index;
__global const uchar* src2 = src1 + src_step;
__global uchar* ydst1 = dstptr + ydst_index;
__global uchar* ydst2 = ydst1 + dst_step;
__global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);
__global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);
#if PIX_PER_WI_X == 2
int s11 = *((__global const int*) src1);
int s12 = *((__global const int*) src1 + 1);
int s13 = *((__global const int*) src1 + 2);
#if scn == 4
int s14 = *((__global const int*) src1 + 3);
#endif
int s21 = *((__global const int*) src2);
int s22 = *((__global const int*) src2 + 1);
int s23 = *((__global const int*) src2 + 2);
#if scn == 4
int s24 = *((__global const int*) src2 + 3);
#endif
float src_pix1[scn * 4], src_pix2[scn * 4];
*((float4*) src_pix1) = convert_float4(as_uchar4(s11));
*((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
*((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
#if scn == 4
*((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
#endif
*((float4*) src_pix2) = convert_float4(as_uchar4(s21));
*((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
*((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
#if scn == 4
*((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
#endif
uchar4 y1, y2;
y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-bidx], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ bidx], 16.5f))));
y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ scn+2-bidx], fma(coeffs[1], src_pix1[ scn+1], fma(coeffs[2], src_pix1[ scn+bidx], 16.5f))));
y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-bidx], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ bidx], 16.5f))));
y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ scn+2-bidx], fma(coeffs[1], src_pix2[ scn+1], fma(coeffs[2], src_pix2[ scn+bidx], 16.5f))));
y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
*((__global int*) ydst1) = as_int(y1);
*((__global int*) ydst2) = as_int(y2);
float uv[4] = { fma(coeffs[3], src_pix1[ 2-bidx], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ bidx], 128.5f))),
fma(coeffs[5], src_pix1[ 2-bidx], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ bidx], 128.5f))),
fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
udst[0] = convert_uchar_sat(uv[uidx] );
vdst[0] = convert_uchar_sat(uv[1 - uidx]);
udst[1] = convert_uchar_sat(uv[2 + uidx]);
vdst[1] = convert_uchar_sat(uv[3 - uidx]);
#else
float4 src_pix1 = convert_float4(vload4(0, src1));
float4 src_pix2 = convert_float4(vload4(0, src1+scn));
float4 src_pix3 = convert_float4(vload4(0, src2));
float4 src_pix4 = convert_float4(vload4(0, src2+scn));
ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));
ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));
float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };
udst[0] = convert_uchar_sat(uv[uidx] );
vdst[0] = convert_uchar_sat(uv[1-uidx]);
#endif
++y;
src_index += 2*src_step;
ydst_index += 2*dst_step;
}
}
}
}
__kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dst_offset,
int rows, int cols)
{
int x = get_global_id(0);
int y = get_global_id(1) * PIX_PER_WI_Y;
if (x < cols / 2)
{
__global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
__global uchar* dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
{
if (y < rows )
{
__constant float* coeffs = c_YUV2RGBCoeffs_420;
#ifndef USE_OPTIMIZED_LOAD
float U = ((float) src[uidx]) - HALF_MAX;
float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;
float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
#else
int load_src = *((__global int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[uidx] - HALF_MAX;
float V = vec_src[(2 + uidx) % 4] - HALF_MAX;
float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];
#endif
float ruv = fma(coeffs[4], V, 0.5f);
float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
float buv = fma(coeffs[1], U, 0.5f);
dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
dst[1] = convert_uchar_sat(y00 + guv);
dst[bidx] = convert_uchar_sat(y00 + buv);
#if dcn == 4
dst[3] = 255;
#endif
dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
dst[dcn + 1] = convert_uchar_sat(y01 + guv);
dst[dcn + bidx] = convert_uchar_sat(y01 + buv);
#if dcn == 4
dst[7] = 255;
#endif
}
++y;
src += src_step;
dst += dst_step;
}
}
}
///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
@ -400,7 +649,7 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offs
#else #else
__constant int * coeffs = c_RGB2YCrCbCoeffs_i; __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
int delta = HALF_MAX * (1 << yuv_shift); int delta = HALF_MAX * (1 << yuv_shift);
int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], r * coeffs[0])), yuv_shift); int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);
int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift); int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);
int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift); int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);
#endif #endif

View File

@ -320,9 +320,9 @@ OCL_TEST_P(CvtColor8u32f, Luv2RGBA) { performTest(3, 4, CVTCODE(Luv2RGB), depth
OCL_TEST_P(CvtColor8u32f, Luv2LBGRA) { performTest(3, 4, CVTCODE(Luv2LBGR), depth == CV_8U ? 1 : 1e-5); } OCL_TEST_P(CvtColor8u32f, Luv2LBGRA) { performTest(3, 4, CVTCODE(Luv2LBGR), depth == CV_8U ? 1 : 1e-5); }
OCL_TEST_P(CvtColor8u32f, Luv2LRGBA) { performTest(3, 4, CVTCODE(Luv2LRGB), depth == CV_8U ? 1 : 1e-5); } OCL_TEST_P(CvtColor8u32f, Luv2LRGBA) { performTest(3, 4, CVTCODE(Luv2LRGB), depth == CV_8U ? 1 : 1e-5); }
// YUV -> RGBA_NV12 // YUV420 -> RGBA
struct CvtColor_YUV420 : struct CvtColor_YUV2RGB_420 :
public CvtColor public CvtColor
{ {
void generateTestData(int channelsIn, int channelsOut) void generateTestData(int channelsIn, int channelsOut)
@ -344,10 +344,94 @@ struct CvtColor_YUV420 :
} }
}; };
OCL_TEST_P(CvtColor_YUV420, YUV2RGBA_NV12) { performTest(1, 4, COLOR_YUV2RGBA_NV12); } OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGBA_NV12) { performTest(1, 4, CVTCODE(YUV2RGBA_NV12)); }
OCL_TEST_P(CvtColor_YUV420, YUV2BGRA_NV12) { performTest(1, 4, COLOR_YUV2BGRA_NV12); } OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGRA_NV12) { performTest(1, 4, CVTCODE(YUV2BGRA_NV12)); }
OCL_TEST_P(CvtColor_YUV420, YUV2RGB_NV12) { performTest(1, 3, COLOR_YUV2RGB_NV12); } OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGB_NV12) { performTest(1, 3, CVTCODE(YUV2RGB_NV12)); }
OCL_TEST_P(CvtColor_YUV420, YUV2BGR_NV12) { performTest(1, 3, COLOR_YUV2BGR_NV12); } OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGR_NV12) { performTest(1, 3, CVTCODE(YUV2BGR_NV12)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGBA_NV21) { performTest(1, 4, CVTCODE(YUV2RGBA_NV21)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGRA_NV21) { performTest(1, 4, CVTCODE(YUV2BGRA_NV21)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGB_NV21) { performTest(1, 3, CVTCODE(YUV2RGB_NV21)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGR_NV21) { performTest(1, 3, CVTCODE(YUV2BGR_NV21)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGBA_YV12) { performTest(1, 4, CVTCODE(YUV2RGBA_YV12)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGRA_YV12) { performTest(1, 4, CVTCODE(YUV2BGRA_YV12)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGB_YV12) { performTest(1, 3, CVTCODE(YUV2RGB_YV12)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGR_YV12) { performTest(1, 3, CVTCODE(YUV2BGR_YV12)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGBA_IYUV) { performTest(1, 4, CVTCODE(YUV2RGBA_IYUV)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGRA_IYUV) { performTest(1, 4, CVTCODE(YUV2BGRA_IYUV)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2RGB_IYUV) { performTest(1, 3, CVTCODE(YUV2RGB_IYUV)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2BGR_IYUV) { performTest(1, 3, CVTCODE(YUV2BGR_IYUV)); }
OCL_TEST_P(CvtColor_YUV2RGB_420, YUV2GRAY_420) { performTest(1, 1, CVTCODE(YUV2GRAY_420)); }
// RGBA -> YUV420
struct CvtColor_RGB2YUV_420 :
public CvtColor
{
void generateTestData(int channelsIn, int channelsOut)
{
const int srcType = CV_MAKE_TYPE(depth, channelsIn);
const int dstType = CV_MAKE_TYPE(depth, channelsOut);
Size srcRoiSize = randomSize(1, MAX_VALUE);
srcRoiSize.width *= 2;
srcRoiSize.height *= 2;
Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(src, src_roi, srcRoiSize, srcBorder, srcType, 2, 100);
Size dstRoiSize(srcRoiSize.width, srcRoiSize.height / 2 * 3);
Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(dst, dst_roi, dstRoiSize, dstBorder, dstType, 5, 16);
UMAT_UPLOAD_INPUT_PARAMETER(src);
UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
}
};
OCL_TEST_P(CvtColor_RGB2YUV_420, RGBA2YUV_YV12) { performTest(4, 1, CVTCODE(RGBA2YUV_YV12), 1); }
OCL_TEST_P(CvtColor_RGB2YUV_420, BGRA2YUV_YV12) { performTest(4, 1, CVTCODE(BGRA2YUV_YV12), 1); }
OCL_TEST_P(CvtColor_RGB2YUV_420, RGB2YUV_YV12) { performTest(3, 1, CVTCODE(RGB2YUV_YV12), 1); }
OCL_TEST_P(CvtColor_RGB2YUV_420, BGR2YUV_YV12) { performTest(3, 1, CVTCODE(BGR2YUV_YV12), 1); }
OCL_TEST_P(CvtColor_RGB2YUV_420, RGBA2YUV_IYUV) { performTest(4, 1, CVTCODE(RGBA2YUV_IYUV), 1); }
OCL_TEST_P(CvtColor_RGB2YUV_420, BGRA2YUV_IYUV) { performTest(4, 1, CVTCODE(BGRA2YUV_IYUV), 1); }
OCL_TEST_P(CvtColor_RGB2YUV_420, RGB2YUV_IYUV) { performTest(3, 1, CVTCODE(RGB2YUV_IYUV), 1); }
OCL_TEST_P(CvtColor_RGB2YUV_420, BGR2YUV_IYUV) { performTest(3, 1, CVTCODE(BGR2YUV_IYUV), 1); }
// YUV422 -> RGBA
struct CvtColor_YUV2RGB_422 :
public CvtColor
{
void generateTestData(int channelsIn, int channelsOut)
{
const int srcType = CV_MAKE_TYPE(depth, channelsIn);
const int dstType = CV_MAKE_TYPE(depth, channelsOut);
Size roiSize = randomSize(1, MAX_VALUE);
roiSize.width *= 2;
Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(src, src_roi, roiSize, srcBorder, srcType, 2, 100);
Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);
UMAT_UPLOAD_INPUT_PARAMETER(src);
UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
}
};
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGB_UYVY) { performTest(2, 3, CVTCODE(YUV2RGB_UYVY)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGR_UYVY) { performTest(2, 3, CVTCODE(YUV2BGR_UYVY)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGBA_UYVY) { performTest(2, 4, CVTCODE(YUV2RGBA_UYVY)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGRA_UYVY) { performTest(2, 4, CVTCODE(YUV2BGRA_UYVY)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGB_YUY2) { performTest(2, 3, CVTCODE(YUV2RGB_YUY2)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGR_YUY2) { performTest(2, 3, CVTCODE(YUV2BGR_YUY2)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGBA_YUY2) { performTest(2, 4, CVTCODE(YUV2RGBA_YUY2)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGRA_YUY2) { performTest(2, 4, CVTCODE(YUV2BGRA_YUY2)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGB_YVYU) { performTest(2, 3, CVTCODE(YUV2RGB_YVYU)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGR_YVYU) { performTest(2, 3, CVTCODE(YUV2BGR_YVYU)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2RGBA_YVYU) { performTest(2, 4, CVTCODE(YUV2RGBA_YVYU)); }
OCL_TEST_P(CvtColor_YUV2RGB_422, YUV2BGRA_YVYU) { performTest(2, 4, CVTCODE(YUV2BGRA_YVYU)); }
OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor8u, OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor8u,
@ -361,7 +445,17 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)), testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
Bool())); Bool()));
OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_YUV420, OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_YUV2RGB_420,
testing::Combine(
testing::Values(MatDepth(CV_8U)),
Bool()));
OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_RGB2YUV_420,
testing::Combine(
testing::Values(MatDepth(CV_8U)),
Bool()));
OCL_INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor_YUV2RGB_422,
testing::Combine( testing::Combine(
testing::Values(MatDepth(CV_8U)), testing::Values(MatDepth(CV_8U)),
Bool())); Bool()));

View File

@ -135,6 +135,7 @@ public:
Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T); Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T);
virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap); virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode, virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
OutputArray dst); OutputArray dst);

View File

@ -56,25 +56,27 @@ __kernel void buildWarpPlaneMaps(__global uchar * xmapptr, int xmap_step, int xm
int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset)); int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset));
int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset)); int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset));
float u = tl_u + du;
float x_ = fma(u, scale, -ct[0]);
float ct1 = 1 - ct[2];
for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step, for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step,
ymap_index += ymap_step) ymap_index += ymap_step)
{ {
__global float * xmap = (__global float *)(xmapptr + xmap_index); __global float * xmap = (__global float *)(xmapptr + xmap_index);
__global float * ymap = (__global float *)(ymapptr + ymap_index); __global float * ymap = (__global float *)(ymapptr + ymap_index);
float u = tl_u + du;
float v = tl_v + dv; float v = tl_v + dv;
float y_ = fma(v, scale, -ct[1]);
float x_ = u / scale - ct[0];
float y_ = v / scale - ct[1];
float ct1 = 1 - ct[2];
float x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * ct1)); float x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * ct1));
float y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * ct1)); float y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * ct1));
float z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * ct1)); float z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * ct1));
x /= z; if (z != 0)
y /= z; x /= z, y /= z;
else
x = y = -1;
xmap[0] = x; xmap[0] = x;
ymap[0] = y; ymap[0] = y;
@ -94,22 +96,19 @@ __kernel void buildWarpCylindricalMaps(__global uchar * xmapptr, int xmap_step,
int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset)); int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset));
int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset)); int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset));
float u = (tl_u + du) * scale;
float x_, z_;
x_ = sincos(u, &z_);
for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step, for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step,
ymap_index += ymap_step) ymap_index += ymap_step)
{ {
__global float * xmap = (__global float *)(xmapptr + xmap_index); __global float * xmap = (__global float *)(xmapptr + xmap_index);
__global float * ymap = (__global float *)(ymapptr + ymap_index); __global float * ymap = (__global float *)(ymapptr + ymap_index);
float u = tl_u + du; float y_ = (tl_v + dv) * scale;
float v = tl_v + dv;
float x, y;
u /= scale; float x, y, z;
float x_, y_, z_;
x_ = sincos(u, &z_);
y_ = v / scale;
float z;
x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * z_)); x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * z_));
y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * z_)); y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * z_));
z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * z_)); z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * z_));
@ -137,25 +136,23 @@ __kernel void buildWarpSphericalMaps(__global uchar * xmapptr, int xmap_step, in
int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset)); int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset));
int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset)); int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset));
float u = (tl_u + du) * scale;
float cosu, sinu = sincos(u, &cosu);
for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step, for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step,
ymap_index += ymap_step) ymap_index += ymap_step)
{ {
__global float * xmap = (__global float *)(xmapptr + xmap_index); __global float * xmap = (__global float *)(xmapptr + xmap_index);
__global float * ymap = (__global float *)(ymapptr + ymap_index); __global float * ymap = (__global float *)(ymapptr + ymap_index);
float u = tl_u + du; float v = (tl_v + dv) * scale;
float v = tl_v + dv;
float x, y;
v /= scale; float cosv, sinv = sincos(v, &cosv);
u /= scale;
float cosv, sinv = sincos(v, &cosv), cosu, sinu = sincos(u, &cosu);
float x_ = sinv * sinu; float x_ = sinv * sinu;
float y_ = -cosv; float y_ = -cosv;
float z_ = sinv * cosu; float z_ = sinv * cosu;
float z; float x, y, z;
x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * z_)); x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * z_));
y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * z_)); y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * z_));
z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * z_)); z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * z_));

View File

@ -87,6 +87,11 @@ Point2f PlaneWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R, In
return uv; return uv;
} }
Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
{
return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32FC1), xmap, ymap);
}
Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray _xmap, OutputArray _ymap) Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray _xmap, OutputArray _ymap)
{ {
projector_.setCameraParams(K, R, T); projector_.setCameraParams(K, R, T);
@ -110,7 +115,7 @@ Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArra
k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
ocl::KernelArg::PtrReadOnly(uk_rinv), ocl::KernelArg::PtrReadOnly(ut), ocl::KernelArg::PtrReadOnly(uk_rinv), ocl::KernelArg::PtrReadOnly(ut),
dst_tl.x, dst_tl.y, projector_.scale, rowsPerWI); dst_tl.x, dst_tl.y, 1/projector_.scale, rowsPerWI);
size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI }; size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI };
if (k.run(2, globalsize, NULL, true)) if (k.run(2, globalsize, NULL, true))
@ -388,7 +393,7 @@ Rect SphericalWarper::buildMaps(Size src_size, InputArray K, InputArray R, Outpu
UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ);
k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale, rowsPerWI); ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, 1/projector_.scale, rowsPerWI);
size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI }; size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI };
if (k.run(2, globalsize, NULL, true)) if (k.run(2, globalsize, NULL, true))
@ -436,7 +441,7 @@ Rect CylindricalWarper::buildMaps(Size src_size, InputArray K, InputArray R, Out
UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ);
k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale, ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, 1/projector_.scale,
rowsPerWI); rowsPerWI);
size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI }; size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI };

View File

@ -419,9 +419,11 @@ private:
static int64 timeLimitDefault; static int64 timeLimitDefault;
static unsigned int iterationsLimitDefault; static unsigned int iterationsLimitDefault;
unsigned int minIters;
unsigned int nIters; unsigned int nIters;
unsigned int currentIter; unsigned int currentIter;
unsigned int runsPerIteration; unsigned int runsPerIteration;
unsigned int perfValidationStage;
performance_metrics metrics; performance_metrics metrics;
void validateMetrics(); void validateMetrics();

View File

@ -1,5 +1,16 @@
#include "precomp.hpp" #include "precomp.hpp"
#include <map>
#include <iostream>
#include <fstream>
#if defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
#include "opencv2/core/cuda.hpp" #include "opencv2/core/cuda.hpp"
#endif #endif
@ -35,11 +46,11 @@ static bool param_verify_sanity;
static bool param_collect_impl; static bool param_collect_impl;
#endif #endif
extern bool test_ipp_check; extern bool test_ipp_check;
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
static int param_cuda_device; static int param_cuda_device;
#endif #endif
#ifdef ANDROID #ifdef ANDROID
static int param_affinity_mask; static int param_affinity_mask;
static bool log_power_checkpoints; static bool log_power_checkpoints;
@ -59,6 +70,8 @@ static void setCurrentThreadAffinityMask(int mask)
} }
#endif #endif
static double perf_stability_criteria = 0.03; // 3%
namespace { namespace {
class PerfEnvironment: public ::testing::Environment class PerfEnvironment: public ::testing::Environment
@ -635,6 +648,82 @@ void performance_metrics::clear()
terminationReason = TERM_UNKNOWN; terminationReason = TERM_UNKNOWN;
} }
/*****************************************************************************************\
* Performance validation results
\*****************************************************************************************/
static bool perf_validation_enabled = false;
static std::string perf_validation_results_directory;
static std::map<std::string, float> perf_validation_results;
static std::string perf_validation_results_outfile;
static double perf_validation_criteria = 0.03; // 3 %
static double perf_validation_time_threshold_ms = 0.1;
static int perf_validation_idle_delay_ms = 3000; // 3 sec
static void loadPerfValidationResults(const std::string& fileName)
{
perf_validation_results.clear();
std::ifstream infile(fileName.c_str());
while (!infile.eof())
{
std::string name;
float value = 0;
if (!(infile >> value))
{
if (infile.eof())
break; // it is OK
std::cout << "ERROR: Can't load performance validation results from " << fileName << "!" << std::endl;
return;
}
infile.ignore(1);
if (!(std::getline(infile, name)))
{
std::cout << "ERROR: Can't load performance validation results from " << fileName << "!" << std::endl;
return;
}
if (!name.empty() && name[name.size() - 1] == '\r') // CRLF processing on Linux
name.resize(name.size() - 1);
perf_validation_results[name] = value;
}
std::cout << "Performance validation results loaded from " << fileName << " (" << perf_validation_results.size() << " entries)" << std::endl;
}
static void savePerfValidationResult(const std::string& name, float value)
{
perf_validation_results[name] = value;
}
static void savePerfValidationResults()
{
if (!perf_validation_results_outfile.empty())
{
std::ofstream outfile((perf_validation_results_directory + perf_validation_results_outfile).c_str());
std::map<std::string, float>::const_iterator i;
for (i = perf_validation_results.begin(); i != perf_validation_results.end(); ++i)
{
outfile << i->second << ';';
outfile << i->first << std::endl;
}
outfile.close();
std::cout << "Performance validation results saved (" << perf_validation_results.size() << " entries)" << std::endl;
}
}
class PerfValidationEnvironment : public ::testing::Environment
{
public:
virtual ~PerfValidationEnvironment() {}
virtual void SetUp() {}
virtual void TearDown()
{
savePerfValidationResults();
}
};
/*****************************************************************************************\ /*****************************************************************************************\
* ::perf::TestBase * ::perf::TestBase
@ -666,6 +755,8 @@ void TestBase::Init(const std::vector<std::string> & availableImpls,
"{ perf_list_impls |false |list available implementation variants and exit}" "{ perf_list_impls |false |list available implementation variants and exit}"
"{ perf_run_cpu |false |deprecated, equivalent to --perf_impl=plain}" "{ perf_run_cpu |false |deprecated, equivalent to --perf_impl=plain}"
"{ perf_strategy |default |specifies performance measuring strategy: default, base or simple (weak restrictions)}" "{ perf_strategy |default |specifies performance measuring strategy: default, base or simple (weak restrictions)}"
"{ perf_read_validation_results | |specifies file name with performance results from previous run}"
"{ perf_write_validation_results | |specifies file name to write performance validation results}"
#ifdef ANDROID #ifdef ANDROID
"{ perf_time_limit |6.0 |default time limit for a single test (in seconds)}" "{ perf_time_limit |6.0 |default time limit for a single test (in seconds)}"
"{ perf_affinity_mask |0 |set affinity mask for the main thread}" "{ perf_affinity_mask |0 |set affinity mask for the main thread}"
@ -789,6 +880,26 @@ void TestBase::Init(const std::vector<std::string> & availableImpls,
} }
#endif #endif
{
const char* path = getenv("OPENCV_PERF_VALIDATION_DIR");
if (path)
perf_validation_results_directory = path;
}
std::string fileName_perf_validation_results_src = args.get<std::string>("perf_read_validation_results");
if (!fileName_perf_validation_results_src.empty())
{
perf_validation_enabled = true;
loadPerfValidationResults(perf_validation_results_directory + fileName_perf_validation_results_src);
}
perf_validation_results_outfile = args.get<std::string>("perf_write_validation_results");
if (!perf_validation_results_outfile.empty())
{
perf_validation_enabled = true;
::testing::AddGlobalTestEnvironment(new PerfValidationEnvironment());
}
if (!args.check()) if (!args.check())
{ {
args.printErrors(); args.printErrors();
@ -878,7 +989,9 @@ TestBase::TestBase(): testStrategy(PERF_STRATEGY_DEFAULT), declare(this)
{ {
lastTime = totalTime = timeLimit = 0; lastTime = totalTime = timeLimit = 0;
nIters = currentIter = runsPerIteration = 0; nIters = currentIter = runsPerIteration = 0;
minIters = param_min_samples;
verified = false; verified = false;
perfValidationStage = 0;
} }
#ifdef _MSC_VER #ifdef _MSC_VER
# pragma warning(pop) # pragma warning(pop)
@ -1004,7 +1117,7 @@ bool TestBase::next()
has_next = false; has_next = false;
break; break;
} }
if (currentIter < param_min_samples) if (currentIter < minIters)
{ {
has_next = true; has_next = true;
break; break;
@ -1012,14 +1125,96 @@ bool TestBase::next()
calcMetrics(); calcMetrics();
double criteria = 0.03; // 3%
if (fabs(metrics.mean) > 1e-6) if (fabs(metrics.mean) > 1e-6)
has_next = metrics.stddev > criteria * fabs(metrics.mean); has_next = metrics.stddev > perf_stability_criteria * fabs(metrics.mean);
else else
has_next = true; has_next = true;
} }
} while (false); } while (false);
if (perf_validation_enabled && !has_next)
{
calcMetrics();
double median_ms = metrics.median * 1000.0f / metrics.frequency;
const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
std::string name = (test_info == 0) ? "" :
std::string(test_info->test_case_name()) + "--" + test_info->name();
if (!perf_validation_results.empty() && !name.empty())
{
std::map<std::string, float>::iterator i = perf_validation_results.find(name);
bool isSame = false;
bool found = false;
bool grow = false;
if (i != perf_validation_results.end())
{
found = true;
double prev_result = i->second;
grow = median_ms > prev_result;
isSame = fabs(median_ms - prev_result) <= perf_validation_criteria * fabs(median_ms);
if (!isSame)
{
if (perfValidationStage == 0)
{
printf("Performance is changed (samples = %d, median):\n %.2f ms (current)\n %.2f ms (previous)\n", (int)times.size(), median_ms, prev_result);
}
}
}
else
{
if (perfValidationStage == 0)
printf("New performance result is detected\n");
}
if (!isSame)
{
if (perfValidationStage < 2)
{
if (perfValidationStage == 0 && currentIter <= minIters * 3 && currentIter < nIters)
{
unsigned int new_minIters = std::max(minIters * 5, currentIter * 3);
printf("Increase minIters from %u to %u\n", minIters, new_minIters);
minIters = new_minIters;
has_next = true;
perfValidationStage++;
}
else if (found && currentIter >= nIters &&
median_ms > perf_validation_time_threshold_ms &&
(grow || metrics.stddev > perf_stability_criteria * fabs(metrics.mean)))
{
printf("Performance is unstable, it may be a result of overheat problems\n");
printf("Idle delay for %d ms... \n", perf_validation_idle_delay_ms);
#if defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64
Sleep(perf_validation_idle_delay_ms);
#else
usleep(perf_validation_idle_delay_ms * 1000);
#endif
has_next = true;
minIters = std::min(minIters * 5, nIters);
// reset collected samples
currentIter = 0;
times.clear();
metrics.clear();
perfValidationStage += 2;
}
if (!has_next)
{
printf("Assume that current result is valid\n");
}
}
else
{
printf("Re-measured performance result: %.2f ms\n", median_ms);
}
}
}
if (!has_next && !name.empty())
{
savePerfValidationResult(name, (float)median_ms);
}
}
#ifdef ANDROID #ifdef ANDROID
if (log_power_checkpoints) if (log_power_checkpoints)
{ {
@ -1223,9 +1418,10 @@ void TestBase::validateMetrics()
else if (getCurrentPerformanceStrategy() == PERF_STRATEGY_SIMPLE) else if (getCurrentPerformanceStrategy() == PERF_STRATEGY_SIMPLE)
{ {
double mean = metrics.mean * 1000.0f / metrics.frequency; double mean = metrics.mean * 1000.0f / metrics.frequency;
double median = metrics.median * 1000.0f / metrics.frequency;
double stddev = metrics.stddev * 1000.0f / metrics.frequency; double stddev = metrics.stddev * 1000.0f / metrics.frequency;
double percents = stddev / mean * 100.f; double percents = stddev / mean * 100.f;
printf("[ PERFSTAT ] (samples = %d, mean = %.2f, stddev = %.2f (%.1f%%))\n", (int)metrics.samples, mean, stddev, percents); printf("[ PERFSTAT ] (samples = %d, mean = %.2f, median = %.2f, stddev = %.2f (%.1f%%))\n", (int)metrics.samples, mean, median, stddev, percents);
} }
else else
{ {

View File

@ -213,7 +213,7 @@ void App::run()
// Perform HOG classification // Perform HOG classification
hogWorkBegin(); hogWorkBegin();
hog.detectMultiScale(img.getMat(ACCESS_READ), found, hit_threshold, win_stride, hog.detectMultiScale(img, found, hit_threshold, win_stride,
Size(0, 0), scale, gr_threshold); Size(0, 0), scale, gr_threshold);
hogWorkEnd(); hogWorkEnd();
@ -225,7 +225,7 @@ void App::run()
rectangle(img_to_show, r.tl(), r.br(), Scalar(0, 255, 0), 3); rectangle(img_to_show, r.tl(), r.br(), Scalar(0, 255, 0), 3);
} }
putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); putText(img_to_show, ocl::useOpenCL() ? "Mode: OpenCL" : "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2); putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
imshow("opencv_hog", img_to_show); imshow("opencv_hog", img_to_show);