Tensor reconstruction system upgrade: make fragments (#5181)

- Information matrix computation for RGB-D odometry - Easier-to-use pose graph wrapper in t-reconstruction system - Explicit Long sequence splitter (into fragments) - Module: RGB-D odometry, pose graph construction, and optimization - Module: RGB-D integration For the copyroom scene (5490 frames), on RTX 3060 & i7-11700 @ 2.5Hz - Pose graph generation and optimization takes 107.961s for 55 fragments. (~ avg 50 FPS) - TSDF integration takes 47.590s for 55 fragments. (~ avg 120 FPS) To run the system: `python examples/python/t_reconstruction_system/run_system.py --split_fragments --fragment_odometry --fragment_integration --path_dataset /path/to/dataset`
isl-org · Jan 20, 2023 · 450a053 · 450a053
1 parent dca613d
commit 450a053
Show file tree

Hide file tree

Showing 17 changed files with 1,044 additions and 102 deletions.
diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp
@@ -211,6 +211,41 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth,
     }
 }
 
+void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map,
+                                      const core::Tensor &target_vertex_map,
+                                      const core::Tensor &intrinsic,
+                                      const core::Tensor &source_to_target,
+                                      const float square_dist_thr,
+                                      core::Tensor &information) {
+    core::AssertTensorDtypes(source_vertex_map, {core::Float32});
+
+    const core::Dtype supported_dtype = source_vertex_map.GetDtype();
+    const core::Device device = source_vertex_map.GetDevice();
+
+    core::AssertTensorDtype(target_vertex_map, supported_dtype);
+    core::AssertTensorDevice(target_vertex_map, device);
+
+    core::AssertTensorShape(intrinsic, {3, 3});
+    core::AssertTensorShape(source_to_target, {4, 4});
+
+    static const core::Device host("CPU:0");
+    core::Tensor intrinsics_d = intrinsic.To(host, core::Float64).Contiguous();
+    core::Tensor trans_d =
+            source_to_target.To(host, core::Float64).Contiguous();
+
+    if (device.GetType() == core::Device::DeviceType::CPU) {
+        ComputeOdometryInformationMatrixCPU(
+                source_vertex_map, target_vertex_map, intrinsic,
+                source_to_target, square_dist_thr, information);
+    } else if (device.GetType() == core::Device::DeviceType::CUDA) {
+        CUDA_CALL(ComputeOdometryInformationMatrixCUDA, source_vertex_map,
+                  target_vertex_map, intrinsic, source_to_target,
+                  square_dist_thr, information);
+    } else {
+        utility::LogError("Unimplemented device.");
+    }
+}
+
 }  // namespace odometry
 }  // namespace kernel
 }  // namespace pipelines

diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometry.h b/cpp/open3d/t/pipelines/kernel/RGBDOdometry.h
@@ -79,6 +79,13 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth,
                                  const float depth_huber_delta,
                                  const float intensity_huber_delta);
 
+void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map,
+                                      const core::Tensor &target_vertex_map,
+                                      const core::Tensor &intrinsic,
+                                      const core::Tensor &source_to_target,
+                                      const float square_dist_thr,
+                                      core::Tensor &information);
+
 }  // namespace odometry
 }  // namespace kernel
 }  // namespace pipelines

diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
@@ -42,23 +42,17 @@ namespace pipelines {
 namespace kernel {
 namespace odometry {
 
-void ComputeOdometryResultPointToPlaneCPU(
-        const core::Tensor& source_vertex_map,
-        const core::Tensor& target_vertex_map,
-        const core::Tensor& target_normal_map,
-        const core::Tensor& intrinsics,
-        const core::Tensor& init_source_to_target,
-        core::Tensor& delta,
-        float& inlier_residual,
-        int& inlier_count,
-        const float depth_outlier_trunc,
-        const float depth_huber_delta) {
+void ComputeOdometryInformationMatrixCPU(const core::Tensor& source_vertex_map,
+                                         const core::Tensor& target_vertex_map,
+                                         const core::Tensor& intrinsic,
+                                         const core::Tensor& source_to_target,
+                                         const float square_dist_thr,
+                                         core::Tensor& information) {
     NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
     NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
-    NDArrayIndexer target_normal_indexer(target_normal_map, 2);
 
-    core::Tensor trans = init_source_to_target;
-    t::geometry::kernel::TransformIndexer ti(intrinsics, trans);
+    core::Tensor trans = source_to_target;
+    t::geometry::kernel::TransformIndexer ti(intrinsic, trans);
 
     // Output
     int64_t rows = source_vertex_indexer.GetShape(0);
@@ -68,59 +62,67 @@ void ComputeOdometryResultPointToPlaneCPU(
 
     int64_t n = rows * cols;
 
-    std::vector<float> A_1x29(29, 0.0);
+    std::vector<float> A_1x21(21, 0.0);
 
 #ifdef _MSC_VER
-    std::vector<float> zeros_29(29, 0.0);
-    A_1x29 = tbb::parallel_reduce(
-            tbb::blocked_range<int>(0, n), zeros_29,
+    std::vector<float> zeros_21(21, 0.0);
+    A_1x21 = tbb::parallel_reduce(
+            tbb::blocked_range<int>(0, n), zeros_21,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
                 for (int workload_idx = r.begin(); workload_idx < r.end();
                      workload_idx++) {
 #else
-    float* A_reduction = A_1x29.data();
-#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
+    float* A_reduction = A_1x21.data();
+#pragma omp parallel for reduction(+ : A_reduction[:21]) schedule(static) num_threads(utility::EstimateMaxThreads())
     for (int workload_idx = 0; workload_idx < n; workload_idx++) {
 #endif
                     int y = workload_idx / cols;
                     int x = workload_idx % cols;
 
-                    float J_ij[6];
-                    float r;
+                    float J_x[6], J_y[6], J_z[6];
+                    float rx, ry, rz;
 
-                    bool valid = GetJacobianPointToPlane(
-                            x, y, depth_outlier_trunc, source_vertex_indexer,
-                            target_vertex_indexer, target_normal_indexer, ti,
-                            J_ij, r);
+                    bool valid = GetJacobianPointToPoint(
+                            x, y, square_dist_thr, source_vertex_indexer,
+                            target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry,
+                            rz);
 
                     if (valid) {
-                        float d_huber = HuberDeriv(r, depth_huber_delta);
-                        float r_huber = HuberLoss(r, depth_huber_delta);
                         for (int i = 0, j = 0; j < 6; j++) {
                             for (int k = 0; k <= j; k++) {
-                                A_reduction[i] += J_ij[j] * J_ij[k];
+                                A_reduction[i] += J_x[j] * J_x[k];
+                                A_reduction[i] += J_y[j] * J_y[k];
+                                A_reduction[i] += J_z[j] * J_z[k];
                                 i++;
                             }
-                            A_reduction[21 + j] += J_ij[j] * d_huber;
                         }
-                        A_reduction[27] += r_huber;
-                        A_reduction[28] += 1;
                     }
                 }
 #ifdef _MSC_VER
                 return A_reduction;
             },
             // TBB: Defining reduction operation.
             [&](std::vector<float> a, std::vector<float> b) {
-                std::vector<float> result(29);
-                for (int j = 0; j < 29; j++) {
+                std::vector<float> result(21);
+                for (int j = 0; j < 21; j++) {
                     result[j] = a[j] + b[j];
                 }
                 return result;
             });
 #endif
-    core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
-    DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
+    core::Tensor A_reduction_tensor(A_1x21, {21}, core::Float32, device);
+    float* reduction_ptr = A_reduction_tensor.GetDataPtr<float>();
+
+    information = core::Tensor::Empty({6, 6}, core::Float64, device);
+    double* info_ptr = information.GetDataPtr<double>();
+
+    for (int j = 0; j < 6; j++) {
+        const int64_t reduction_idx = ((j * (j + 1)) / 2);
+        for (int k = 0; k <= j; k++) {
+            info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k];
+            info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k];
+        }
+    }
 }
 
 void ComputeOdometryResultIntensityCPU(
@@ -326,6 +328,87 @@ void ComputeOdometryResultHybridCPU(const core::Tensor& source_depth,
     DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
 }
 
+void ComputeOdometryResultPointToPlaneCPU(
+        const core::Tensor& source_vertex_map,
+        const core::Tensor& target_vertex_map,
+        const core::Tensor& target_normal_map,
+        const core::Tensor& intrinsics,
+        const core::Tensor& init_source_to_target,
+        core::Tensor& delta,
+        float& inlier_residual,
+        int& inlier_count,
+        const float depth_outlier_trunc,
+        const float depth_huber_delta) {
+    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
+    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
+    NDArrayIndexer target_normal_indexer(target_normal_map, 2);
+
+    core::Tensor trans = init_source_to_target;
+    t::geometry::kernel::TransformIndexer ti(intrinsics, trans);
+
+    // Output
+    int64_t rows = source_vertex_indexer.GetShape(0);
+    int64_t cols = source_vertex_indexer.GetShape(1);
+
+    core::Device device = source_vertex_map.GetDevice();
+
+    int64_t n = rows * cols;
+
+    std::vector<float> A_1x29(29, 0.0);
+
+#ifdef _MSC_VER
+    std::vector<float> zeros_29(29, 0.0);
+    A_1x29 = tbb::parallel_reduce(
+            tbb::blocked_range<int>(0, n), zeros_29,
+            [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
+                for (int workload_idx = r.begin(); workload_idx < r.end();
+                     workload_idx++) {
+#else
+    float* A_reduction = A_1x29.data();
+#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
+    for (int workload_idx = 0; workload_idx < n; workload_idx++) {
+#endif
+                    int y = workload_idx / cols;
+                    int x = workload_idx % cols;
+
+                    float J_ij[6];
+                    float r;
+
+                    bool valid = GetJacobianPointToPlane(
+                            x, y, depth_outlier_trunc, source_vertex_indexer,
+                            target_vertex_indexer, target_normal_indexer, ti,
+                            J_ij, r);
+
+                    if (valid) {
+                        float d_huber = HuberDeriv(r, depth_huber_delta);
+                        float r_huber = HuberLoss(r, depth_huber_delta);
+                        for (int i = 0, j = 0; j < 6; j++) {
+                            for (int k = 0; k <= j; k++) {
+                                A_reduction[i] += J_ij[j] * J_ij[k];
+                                i++;
+                            }
+                            A_reduction[21 + j] += J_ij[j] * d_huber;
+                        }
+                        A_reduction[27] += r_huber;
+                        A_reduction[28] += 1;
+                    }
+                }
+#ifdef _MSC_VER
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<float> a, std::vector<float> b) {
+                std::vector<float> result(29);
+                for (int j = 0; j < 29; j++) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
+#endif
+    core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
+    DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
+}
+
 }  // namespace odometry
 }  // namespace kernel
 }  // namespace pipelines

diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryCUDA.cu b/cpp/open3d/t/pipelines/kernel/RGBDOdometryCUDA.cu
@@ -46,7 +46,10 @@ namespace kernel {
 namespace odometry {
 
 const int kBlockSize = 256;
-const int kReduceDim = 29;  // 21 (JtJ) + 6 (Jtr) + 1 (inlier) + 1 (r)
+const int kJtJDim = 21;
+const int kJtrDim = 6;
+const int kReduceDim =
+        kJtJDim + kJtrDim + 1 + 1;  // 21 (JtJ) + 6 (Jtr) + 1 (inlier) + 1 (r)
 typedef utility::MiniVec<float, kReduceDim> ReduceVec;
 typedef cub::BlockReduce<ReduceVec, kBlockSize> BlockReduce;
 
@@ -67,7 +70,7 @@ __global__ void ComputeOdometryResultPointToPlaneCUDAKernel(
     int x = workload % cols;
     const int tid = threadIdx.x;
 
-    ReduceVec local_sum;
+    ReduceVec local_sum(0.0f);
     if (workload < rows * cols) {
         float J[6] = {0};
         float r = 0;
@@ -162,7 +165,7 @@ __global__ void ComputeOdometryResultIntensityCUDAKernel(
     int x = workload % cols;
     const int tid = threadIdx.x;
 
-    ReduceVec local_sum;
+    ReduceVec local_sum(0.0f);
     if (workload < rows * cols) {
         float J[6] = {0};
         float r = 0;
@@ -274,7 +277,7 @@ __global__ void ComputeOdometryResultHybridCUDAKernel(
     int x = workload % cols;
     const int tid = threadIdx.x;
 
-    ReduceVec local_sum;
+    ReduceVec local_sum(0.0f);
     if (workload < rows * cols) {
         float J_I[6] = {0}, J_D[6] = {0};
         float r_I = 0, r_D = 0;
@@ -372,6 +375,93 @@ void ComputeOdometryResultHybridCUDA(const core::Tensor& source_depth,
     DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
 }
 
+__global__ void ComputeOdometryInformationMatrixCUDAKernel(
+        NDArrayIndexer source_vertex_indexer,
+        NDArrayIndexer target_vertex_indexer,
+        TransformIndexer ti,
+        float* global_sum,
+        int rows,
+        int cols,
+        const float square_dist_thr) {
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+
+    const int workload = threadIdx.x + blockIdx.x * blockDim.x;
+    int y = workload / cols;
+    int x = workload % cols;
+    const int tid = threadIdx.x;
+
+    ReduceVec local_sum(0.0f);
+    if (workload < rows * cols) {
+        float J_x[6], J_y[6], J_z[6];
+        float rx = 0, ry = 0, rz = 0;
+        bool valid = GetJacobianPointToPoint(
+                x, y, square_dist_thr, source_vertex_indexer,
+                target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz);
+
+        // Dump J, r into JtJ and Jtr
+        int offset = 0;
+        for (int i = 0; i < 6; ++i) {
+            for (int j = 0; j <= i; ++j) {
+                local_sum[offset] = valid ? J_x[i] * J_x[j] : 0;
+                local_sum[offset] += valid ? J_y[i] * J_y[j] : 0;
+                local_sum[offset] += valid ? J_z[i] * J_z[j] : 0;
+                offset++;
+            }
+        }
+    }
+
+    auto result = BlockReduce(temp_storage).Sum(local_sum);
+    if (tid == 0) {
+#pragma unroll
+        for (int i = 0; i < kJtJDim; ++i) {
+            atomicAdd(&global_sum[i], result[i]);
+        }
+    }
+}
+
+void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_vertex_map,
+                                          const core::Tensor& target_vertex_map,
+                                          const core::Tensor& intrinsic,
+                                          const core::Tensor& source_to_target,
+                                          const float square_dist_thr,
+                                          core::Tensor& information) {
+    NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
+    NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
+
+    core::Device device = source_vertex_map.GetDevice();
+    core::Tensor trans = source_to_target;
+    t::geometry::kernel::TransformIndexer ti(intrinsic, trans);
+
+    const int64_t rows = source_vertex_indexer.GetShape(0);
+    const int64_t cols = source_vertex_indexer.GetShape(1);
+
+    core::Tensor global_sum =
+            core::Tensor::Zeros({kJtJDim}, core::Float32, device);
+    float* global_sum_ptr = global_sum.GetDataPtr<float>();
+
+    const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
+    const dim3 threads(kBlockSize);
+    ComputeOdometryInformationMatrixCUDAKernel<<<blocks, threads, 0,
+                                                 core::cuda::GetStream()>>>(
+            source_vertex_indexer, target_vertex_indexer, ti, global_sum_ptr,
+            rows, cols, square_dist_thr);
+    core::cuda::Synchronize();
+
+    // 21 => 6x6
+    const core::Device host(core::Device("CPU:0"));
+    information = core::Tensor::Empty({6, 6}, core::Float64, host);
+    global_sum = global_sum.To(host, core::Float64);
+
+    double* info_ptr = information.GetDataPtr<double>();
+    double* reduction_ptr = global_sum.GetDataPtr<double>();
+    for (int j = 0; j < 6; j++) {
+        const int64_t reduction_idx = ((j * (j + 1)) / 2);
+        for (int k = 0; k <= j; k++) {
+            info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k];
+            info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k];
+        }
+    }
+}
 }  // namespace odometry
 }  // namespace kernel
 }  // namespace pipelines