From e75ca4b662afe0bff4b7ba6f332a80aafd0d5a13 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Fri, 3 Dec 2010 13:11:14 +0000 Subject: [PATCH] replaced global memory reads with texture memory reads in GPU's corner detectors --- modules/gpu/src/cuda/imgproc.cu | 49 +++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index 0f4fe4181..f964d2b1f 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -495,10 +495,12 @@ namespace cv { namespace gpu { namespace imgproc /////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// + texture harrisDxTex; + texture harrisDyTex; + template __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k, - const PtrStep Dx, const PtrStep Dy, PtrStep dst, B border_row, - B border_col) + PtrStep dst, B border_row, B border_col) { const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -517,13 +519,11 @@ namespace cv { namespace gpu { namespace imgproc for (int i = ibegin; i < iend; ++i) { int y = border_col.idx(i); - const float* dx_row = (const float*)Dx.ptr(y); - const float* dy_row = (const float*)Dy.ptr(y); for (int j = jbegin; j < jend; ++j) { int x = border_row.idx(j); - float dx = dx_row[x]; - float dy = dy_row[x]; + float dx = tex2D(harrisDxTex, x, y); + float dy = tex2D(harrisDyTex, x, y); a += dx * dx; b += dx * dy; c += dy * dy; @@ -543,22 +543,33 @@ namespace cv { namespace gpu { namespace imgproc dim3 threads(32, 8); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step); + cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step); + harrisDxTex.filterMode = cudaFilterModePoint; + harrisDyTex.filterMode = cudaFilterModePoint; + switch (border_type) { case BORDER_REFLECT101: cornerHarris_kernel<<>>( - cols, rows, block_size, k, Dx, Dy, dst, - BrdReflect101(cols), BrdReflect101(rows)); + cols, rows, block_size, k, dst, BrdReflect101(cols), BrdReflect101(rows)); break; } + cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaUnbindTexture(harrisDxTex)); + cudaSafeCall(cudaUnbindTexture(harrisDyTex)); } /////////////////////////////////////////// Corner Min Eigen Val ///////////////////////////////////////////////// + texture minEigenValDxTex; + texture minEigenValDyTex; + template - __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, const PtrStep Dx, - const PtrStep Dy, PtrStep dst, B border_row, B border_col) + __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, + PtrStep dst, B border_row, B border_col) { const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -577,13 +588,11 @@ namespace cv { namespace gpu { namespace imgproc for (int i = ibegin; i < iend; ++i) { int y = border_col.idx(i); - const float* dx_row = (const float*)Dx.ptr(y); - const float* dy_row = (const float*)Dy.ptr(y); for (int j = jbegin; j < jend; ++j) { int x = border_row.idx(j); - float dx = dx_row[x]; - float dy = dy_row[x]; + float dx = tex2D(minEigenValDxTex, x, y); + float dy = tex2D(minEigenValDyTex, x, y); a += dx * dx; b += dx * dy; c += dy * dy; @@ -605,14 +614,24 @@ namespace cv { namespace gpu { namespace imgproc dim3 threads(32, 8); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step); + cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step); + minEigenValDxTex.filterMode = cudaFilterModePoint; + minEigenValDyTex.filterMode = cudaFilterModePoint; + switch (border_type) { case BORDER_REFLECT101: cornerMinEigenVal_kernel<<>>( - cols, rows, block_size, Dx, Dy, dst, + cols, rows, block_size, dst, BrdReflect101(cols), BrdReflect101(rows)); break; } + cudaSafeCall(cudaThreadSynchronize()); + cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); + cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); } }}} +