added simd_functions.hpp to device layer

This commit is contained in:
Vladislav Vinogradov
2013-03-20 16:27:58 +04:00
parent 321070ccf0
commit 33ff3d6016
3 changed files with 1144 additions and 823 deletions

View File

@@ -263,11 +263,8 @@ namespace
namespace arithm
{
template <typename T, typename D>
void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -345,62 +342,6 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vadd4<unsigned int, unsigned int>,
vadd4<unsigned int, int>,
0,
0
},
{
vadd4<int, unsigned int>,
vadd4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vadd2<unsigned int, unsigned int>,
vadd2<unsigned int, int>
},
{
0,
0,
vadd2<int, unsigned int>,
vadd2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@@ -426,7 +367,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -434,31 +375,27 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -606,11 +543,8 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
namespace arithm
{
template <typename T, typename D>
void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T, typename D>
void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void subMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void subMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -688,62 +622,6 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
}
};
typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const vfunc_t vfuncs4[4][4] =
{
{
vsub4<unsigned int, unsigned int>,
vsub4<unsigned int, int>,
0,
0
},
{
vsub4<int, unsigned int>,
vsub4<int, int>,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
0,
0
}
};
static const vfunc_t vfuncs2[4][4] =
{
{
0,
0,
0,
0
},
{
0,
0,
0,
0
},
{
0,
0,
vsub2<unsigned int, unsigned int>,
vsub2<unsigned int, int>
},
{
0,
0,
vsub2<int, unsigned int>,
vsub2<int, int>
}
};
if (dtype < 0)
dtype = src1.depth();
@@ -769,7 +647,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -777,31 +655,27 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (sdepth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -1585,11 +1459,8 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
namespace arithm
{
template <typename T>
void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T>
void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void absDiffMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void absDiffMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1610,20 +1481,6 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
absDiffMat<float>,
absDiffMat<double>
};
static const func_t vfuncs4[] =
{
vabsDiff4<unsigned int>,
vabsDiff4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vabsDiff2<unsigned int>,
vabsDiff2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -1645,7 +1502,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -1653,31 +1510,27 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -1940,6 +1793,11 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1962,6 +1820,12 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
{cmpMatEq<double> , cmpMatNe<double> , cmpMatLt<double> , cmpMatLe<double> }
};
typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
static const func_v4_t funcs_v4[] =
{
cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -1997,6 +1861,27 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (isAllAligned)
{
const int vcols = src1_.cols >> 2;
funcs_v4[code](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
}
const func_t func = funcs[depth][code];
func(src1_, src2_, dst_, stream);
@@ -2532,13 +2417,13 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
namespace arithm
{
template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void minMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
void maxMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
void maxMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
@@ -2558,20 +2443,6 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
minMat<float>,
minMat<double>
};
static const func_t vfuncs4[] =
{
vmin4<unsigned int>,
vmin4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmin2<unsigned int>,
vmin2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -2593,7 +2464,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2601,31 +2472,27 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
@@ -2655,20 +2522,6 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
maxMat<float>,
maxMat<double>
};
static const func_t vfuncs4[] =
{
vmax4<unsigned int>,
vmax4<int>,
0,
0
};
static const func_t vfuncs2[] =
{
0,
0,
vmax2<unsigned int>,
vmax2<int>
};
const int depth = src1.depth();
const int cn = src1.channels();
@@ -2690,7 +2543,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
if (depth < CV_32S)
if (depth == CV_8U || depth == CV_16U)
{
const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2698,31 +2551,27 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
if (isAllAligned)
{
const func_t vfunc4 = vfuncs4[depth];
const func_t vfunc2 = vfuncs2[depth];
if (vfunc4 != 0 && (src1_.cols & 3) == 0)
if (depth == CV_8U && (src1_.cols & 3) == 0)
{
const int vcols = src1_.cols >> 2;
vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}
if (vfunc2 != 0 && (src1_.cols & 1) == 0)
else if (depth == CV_16U && (src1_.cols & 1) == 0)
{
const int vcols = src1_.cols >> 1;
vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
stream);
maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
stream);
return;
}