added gpu::solvePnpRansac

2011-02-28 12:44:19 +00:00
parent 518ed29480
commit cae59a7caf
4 changed files with 260 additions and 0 deletions
--- a/modules/gpu/src/cuda/project_points.cu
+++ b/modules/gpu/src/cuda/project_points.cu
@@ -43,6 +43,8 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/transform.hpp"

+#define SOLVE_PNP_RANSAC_NUM_ITERS 200
+
 namespace cv { namespace gpu
 {
    namespace transform_points
@@ -75,6 +77,7 @@ namespace cv { namespace gpu
        }
    } // namespace transform_points

+
    namespace project_points
    {
        __constant__ float3 crot0;
@@ -114,4 +117,75 @@ namespace cv { namespace gpu
        }
    } // namespace project_points

+
+    namespace solve_pnp_ransac
+    {
+        __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_NUM_ITERS * 3];
+        __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_NUM_ITERS];
+        __constant__ float3 ccamera_mat[2];
+
+        __device__ float sqr(float x)
+        {
+            return x * x;
+        }
+
+        __global__ void computeHypothesisScoresKernel(
+                const int num_points, const float3* object, const float2* image,
+                const float dist_threshold, int* g_num_inliers)
+        {
+            const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
+            const float3 &transl_vec = ctransl_vectors[blockIdx.x];
+            int num_inliers = 0;
+
+            for (int i = threadIdx.x; i < num_points; i += blockDim.x)
+            {
+                float3 p = object[i];
+                p = make_float3(
+                        rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
+                        rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
+                        rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
+                if (p.z > 0)
+                {
+                    p.x = ccamera_mat[0].x * p.x / p.z + ccamera_mat[0].z;
+                    p.y = ccamera_mat[1].y * p.y / p.z + ccamera_mat[1].z;
+                    float2 image_p = image[i];
+                    if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
+                        ++num_inliers;
+                }
+            }
+
+            extern __shared__ float s_num_inliers[];
+            s_num_inliers[threadIdx.x] = num_inliers;
+            __syncthreads();
+
+            for (int step = blockDim.x / 2; step > 0; step >>= 1)
+            {
+                if (threadIdx.x < step)
+                    s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
+                __syncthreads();
+            }
+
+            if (threadIdx.x == 0)
+                g_num_inliers[blockIdx.x] = s_num_inliers[0];
+        }
+
+        void computeHypothesisScores(
+                const int num_hypotheses, const int num_points, const float* rot_matrices,
+                const float3* transl_vectors, const float3* object, const float2* image,
+                const float3* camera_mat, const float dist_threshold, int* hypothesis_scores)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
+            cudaSafeCall(cudaMemcpyToSymbol(ccamera_mat, camera_mat, 2 * sizeof(float3)));
+
+            dim3 threads(256);
+            dim3 grid(num_hypotheses);
+            int smem_size = threads.x * sizeof(float);
+
+            computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
+                    num_points, object, image, dist_threshold, hypothesis_scores);
+            cudaSafeCall(cudaThreadSynchronize());
+        }
+    } // namespace solvepnp_ransac
+
 }} // namespace cv { namespace gpu