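Fix CUDA match template:

Use the correct element types for the integral/sum outputs: `int` (instead of `unsigned int`) for sums and `double` (instead of `unsigned long long`) for squared sums.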
This commit is contained in:
Vladislav Vinogradov 2014-12-25 15:41:14 +03:00
parent 9b8c3fd675
commit 26afa49d71
2 changed files with 135 additions and 135 deletions

View File

@ -218,7 +218,7 @@ namespace cv { namespace cuda { namespace device
// Prepared_SQDIFF // Prepared_SQDIFF
template <int cn> template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result) __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<double> image_sqsum, double templ_sqsum, PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -234,7 +234,7 @@ namespace cv { namespace cuda { namespace device
} }
template <int cn> template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream) void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream)
{ {
const dim3 threads(32, 8); const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -246,10 +246,10 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn, void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, int cn,
cudaStream_t stream) cudaStream_t stream)
{ {
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream); typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] = static const caller_t callers[] =
{ {
@ -287,8 +287,8 @@ namespace cv { namespace cuda { namespace device
template <int cn> template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U( __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum, int w, int h, const PtrStep<double> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result) double templ_sqsum, PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -305,7 +305,7 @@ namespace cv { namespace cuda { namespace device
} }
template <int cn> template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
const dim3 threads(32, 8); const dim3 threads(32, 8);
@ -319,10 +319,10 @@ namespace cv { namespace cuda { namespace device
} }
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
PtrStepSzf result, int cn, cudaStream_t stream) PtrStepSzf result, int cn, cudaStream_t stream)
{ {
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream); typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] = static const caller_t callers[] =
{ {
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4> 0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
@ -334,7 +334,7 @@ namespace cv { namespace cuda { namespace device
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Prepared_CCOFF // Prepared_CCOFF
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result) __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<int> image_sum, PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -349,7 +349,7 @@ namespace cv { namespace cuda { namespace device
} }
} }
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream) void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@ -365,8 +365,8 @@ namespace cv { namespace cuda { namespace device
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2( __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g, int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r, const PtrStep<int> image_sum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<int> image_sum_g,
PtrStepSzf result) PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -388,9 +388,9 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_8UC2( void matchTemplatePrepared_CCOFF_8UC2(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g, int templ_sum_r, int templ_sum_g,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
@ -412,9 +412,9 @@ namespace cv { namespace cuda { namespace device
float templ_sum_scale_r, float templ_sum_scale_r,
float templ_sum_scale_g, float templ_sum_scale_g,
float templ_sum_scale_b, float templ_sum_scale_b,
const PtrStep<unsigned int> image_sum_r, const PtrStep<int> image_sum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<int> image_sum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<int> image_sum_b,
PtrStepSzf result) PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -440,12 +440,12 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_8UC3( void matchTemplatePrepared_CCOFF_8UC3(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<int> image_sum_b,
unsigned int templ_sum_r, int templ_sum_r,
unsigned int templ_sum_g, int templ_sum_g,
unsigned int templ_sum_b, int templ_sum_b,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
@ -471,10 +471,10 @@ namespace cv { namespace cuda { namespace device
float templ_sum_scale_g, float templ_sum_scale_g,
float templ_sum_scale_b, float templ_sum_scale_b,
float templ_sum_scale_a, float templ_sum_scale_a,
const PtrStep<unsigned int> image_sum_r, const PtrStep<int> image_sum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<int> image_sum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<int> image_sum_b,
const PtrStep<unsigned int> image_sum_a, const PtrStep<int> image_sum_a,
PtrStepSzf result) PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -504,14 +504,14 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_8UC4( void matchTemplatePrepared_CCOFF_8UC4(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<int> image_sum_a,
unsigned int templ_sum_r, int templ_sum_r,
unsigned int templ_sum_g, int templ_sum_g,
unsigned int templ_sum_b, int templ_sum_b,
unsigned int templ_sum_a, int templ_sum_a,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
@ -537,8 +537,8 @@ namespace cv { namespace cuda { namespace device
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U( __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight, int w, int h, float weight,
float templ_sum_scale, float templ_sqsum_scale, float templ_sum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum, const PtrStep<int> image_sum,
const PtrStep<unsigned long long> image_sqsum, const PtrStep<double> image_sqsum,
PtrStepSzf result) PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -559,9 +559,9 @@ namespace cv { namespace cuda { namespace device
} }
void matchTemplatePrepared_CCOFF_NORMED_8U( void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum, int w, int h, const PtrStepSz<int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum, const PtrStepSz<double> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum, int templ_sum, double templ_sqsum,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
@ -586,8 +586,8 @@ namespace cv { namespace cuda { namespace device
int w, int h, float weight, int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_r, float templ_sum_scale_g,
float templ_sqsum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r, const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g, const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
PtrStepSzf result) PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -618,10 +618,10 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_NORMED_8UC2( void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r, int templ_sum_r, double templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g, int templ_sum_g, double templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
@ -652,9 +652,9 @@ namespace cv { namespace cuda { namespace device
int w, int h, float weight, int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sqsum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r, const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g, const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b, const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
PtrStepSzf result) PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -693,12 +693,12 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_NORMED_8UC3( void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b, const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r, int templ_sum_r, double templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g, int templ_sum_g, double templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b, int templ_sum_b, double templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
@ -732,10 +732,10 @@ namespace cv { namespace cuda { namespace device
int w, int h, float weight, int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b, float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sum_scale_a, float templ_sqsum_scale, float templ_sum_scale_a, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r, const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g, const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b, const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a, const PtrStep<int> image_sum_a, const PtrStep<double> image_sqsum_a,
PtrStepSzf result) PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -780,14 +780,14 @@ namespace cv { namespace cuda { namespace device
void matchTemplatePrepared_CCOFF_NORMED_8UC4( void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b, const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a, const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r, int templ_sum_r, double templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g, int templ_sum_g, double templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b, int templ_sum_b, double templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a, int templ_sum_a, double templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream) PtrStepSzf result, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
@ -823,8 +823,8 @@ namespace cv { namespace cuda { namespace device
template <int cn> template <int cn>
__global__ void normalizeKernel_8U( __global__ void normalizeKernel_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum, int w, int h, const PtrStep<double> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result) double templ_sqsum, PtrStepSzf result)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -838,8 +838,8 @@ namespace cv { namespace cuda { namespace device
} }
} }
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream) double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

View File

@ -61,77 +61,77 @@ namespace cv { namespace cuda { namespace device
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream); void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream); void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream); int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream); int cn, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC2( void matchTemplatePrepared_CCOFF_8UC2(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<int> image_sum_g,
unsigned int templ_sum_r, int templ_sum_r,
unsigned int templ_sum_g, int templ_sum_g,
PtrStepSzf result, cudaStream_t stream); PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC3( void matchTemplatePrepared_CCOFF_8UC3(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<int> image_sum_b,
unsigned int templ_sum_r, int templ_sum_r,
unsigned int templ_sum_g, int templ_sum_g,
unsigned int templ_sum_b, int templ_sum_b,
PtrStepSzf result, cudaStream_t stream); PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC4( void matchTemplatePrepared_CCOFF_8UC4(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<int> image_sum_a,
unsigned int templ_sum_r, int templ_sum_r,
unsigned int templ_sum_g, int templ_sum_g,
unsigned int templ_sum_b, int templ_sum_b,
unsigned int templ_sum_a, int templ_sum_a,
PtrStepSzf result, cudaStream_t stream); PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8U( void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum, int w, int h, const PtrStepSz<int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum, const PtrStepSz<double> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum, int templ_sum, double templ_sqsum,
PtrStepSzf result, cudaStream_t stream); PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC2( void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r, int templ_sum_r, double templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g, int templ_sum_g, double templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream); PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC3( void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b, const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r, int templ_sum_r, double templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g, int templ_sum_g, double templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b, int templ_sum_b, double templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream); PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC4( void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h, int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r, const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g, const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b, const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a, const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r, int templ_sum_r, double templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g, int templ_sum_g, double templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b, int templ_sum_b, double templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a, int templ_sum_a, double templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream); PtrStepSzf result, cudaStream_t stream);
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream); double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream); void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
} }
@ -290,7 +290,7 @@ namespace
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream); cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0]; double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
} }
@ -361,7 +361,7 @@ namespace
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream); cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0]; double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
match_CCORR_.match(image, templ, _result, stream); match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat(); GpuMat result = _result.getGpuMat();
@ -400,7 +400,7 @@ namespace
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream); cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0]; double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
match_CCORR_.match(image, templ, _result, stream); match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat(); GpuMat result = _result.getGpuMat();
@ -446,7 +446,7 @@ namespace
image_sums_.resize(1); image_sums_.resize(1);
cuda::integral(image, image_sums_[0], intBuffer_, stream); cuda::integral(image, image_sums_[0], intBuffer_, stream);
unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0]; int templ_sum = (int) cuda::sum(templ)[0];
matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream)); matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
} }
@ -465,19 +465,19 @@ namespace
case 2: case 2:
matchTemplatePrepared_CCOFF_8UC2( matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, image_sums_[0], image_sums_[1], templ.cols, templ.rows, image_sums_[0], image_sums_[1],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (int) templ_sum[0], (int) templ_sum[1],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 3: case 3:
matchTemplatePrepared_CCOFF_8UC3( matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (int) templ_sum[0], (int) templ_sum[1], (int) templ_sum[2],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 4: case 4:
matchTemplatePrepared_CCOFF_8UC4( matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3], templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3], (int) templ_sum[0], (int) templ_sum[1], (int) templ_sum[2], (int) templ_sum[3],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
default: default:
@ -532,8 +532,8 @@ namespace
image_sqsums_.resize(1); image_sqsums_.resize(1);
cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream); cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0]; int templ_sum = (int) cuda::sum(templ)[0];
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ)[0]; double templ_sqsum = cuda::sqrSum(templ)[0];
matchTemplatePrepared_CCOFF_NORMED_8U( matchTemplatePrepared_CCOFF_NORMED_8U(
templ.cols, templ.rows, image_sums_[0], image_sqsums_[0], templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
@ -561,8 +561,8 @@ namespace
templ.cols, templ.rows, templ.cols, templ.rows,
image_sums_[0], image_sqsums_[0], image_sums_[0], image_sqsums_[0],
image_sums_[1], image_sqsums_[1], image_sums_[1], image_sqsums_[1],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0], (int)templ_sum[0], templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1], (int)templ_sum[1], templ_sqsum[1],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 3: case 3:
@ -571,9 +571,9 @@ namespace
image_sums_[0], image_sqsums_[0], image_sums_[0], image_sqsums_[0],
image_sums_[1], image_sqsums_[1], image_sums_[1], image_sqsums_[1],
image_sums_[2], image_sqsums_[2], image_sums_[2], image_sqsums_[2],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0], (int)templ_sum[0], templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1], (int)templ_sum[1], templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2], (int)templ_sum[2], templ_sqsum[2],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 4: case 4:
@ -583,10 +583,10 @@ namespace
image_sums_[1], image_sqsums_[1], image_sums_[1], image_sqsums_[1],
image_sums_[2], image_sqsums_[2], image_sums_[2], image_sqsums_[2],
image_sums_[3], image_sqsums_[3], image_sums_[3], image_sqsums_[3],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0], (int)templ_sum[0], templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1], (int)templ_sum[1], templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2], (int)templ_sum[2], templ_sqsum[2],
(unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3], (int)templ_sum[3], templ_sqsum[3],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
default: default: