#1713 Added the possibility of setting user_block_size manually for the gpu::matchTemplate function (and gpu::convolve). Added a buffer parameter to these functions. Stopped using 2^n block sizes when they are not necessary.

This commit is contained in:
Alexey Spizhevoy
2012-03-28 07:11:07 +00:00
parent d1423adbc7
commit c776bff95b
5 changed files with 223 additions and 189 deletions

View File

@@ -559,7 +559,7 @@ namespace cv { namespace gpu { namespace device
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const DevMem2D_<unsigned int> image_sum,
const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned int templ_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
@@ -618,8 +618,8 @@ namespace cv { namespace gpu { namespace device
int w, int h,
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
@@ -694,9 +694,9 @@ namespace cv { namespace gpu { namespace device
const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
@@ -782,10 +782,10 @@ namespace cv { namespace gpu { namespace device
const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
const DevMem2D_<unsigned int> image_sum_a, const DevMem2D_<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned int templ_sqsum_r,
unsigned int templ_sum_g, unsigned int templ_sqsum_g,
unsigned int templ_sum_b, unsigned int templ_sqsum_b,
unsigned int templ_sum_a, unsigned int templ_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
DevMem2Df result, cudaStream_t stream)
{
dim3 threads(32, 8);
@@ -822,7 +822,7 @@ namespace cv { namespace gpu { namespace device
template <int cn>
__global__ void normalizeKernel_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result)
unsigned long long templ_sqsum, DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -837,7 +837,7 @@ namespace cv { namespace gpu { namespace device
}
void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));