Skip to content

Commit

Permalink
Tensor reconstruction system upgrade: make fragments (#5181)
Browse files Browse the repository at this point in the history
- Information matrix computation for RGB-D odometry
- Easier-to-use pose graph wrapper in t-reconstruction system
- Explicit Long sequence splitter (into fragments)
- Module: RGB-D odometry, pose graph construction, and optimization
- Module: RGB-D integration

For the copyroom scene (5490 frames), on RTX 3060 & i7-11700 @ 2.5Hz
- Pose graph generation and optimization takes 107.961s for 55 fragments. (~ avg 50 FPS)
- TSDF integration takes 47.590s for 55 fragments. (~ avg 120 FPS)

To run the system:
`python examples/python/t_reconstruction_system/run_system.py --split_fragments --fragment_odometry --fragment_integration --path_dataset /path/to/dataset`
  • Loading branch information
theNded authored Jan 20, 2023
1 parent dca613d commit 450a053
Show file tree
Hide file tree
Showing 17 changed files with 1,044 additions and 102 deletions.
35 changes: 35 additions & 0 deletions cpp/open3d/t/pipelines/kernel/RGBDOdometry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,41 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth,
}
}

void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map,
const core::Tensor &target_vertex_map,
const core::Tensor &intrinsic,
const core::Tensor &source_to_target,
const float square_dist_thr,
core::Tensor &information) {
core::AssertTensorDtypes(source_vertex_map, {core::Float32});

const core::Dtype supported_dtype = source_vertex_map.GetDtype();
const core::Device device = source_vertex_map.GetDevice();

core::AssertTensorDtype(target_vertex_map, supported_dtype);
core::AssertTensorDevice(target_vertex_map, device);

core::AssertTensorShape(intrinsic, {3, 3});
core::AssertTensorShape(source_to_target, {4, 4});

static const core::Device host("CPU:0");
core::Tensor intrinsics_d = intrinsic.To(host, core::Float64).Contiguous();
core::Tensor trans_d =
source_to_target.To(host, core::Float64).Contiguous();

if (device.GetType() == core::Device::DeviceType::CPU) {
ComputeOdometryInformationMatrixCPU(
source_vertex_map, target_vertex_map, intrinsic,
source_to_target, square_dist_thr, information);
} else if (device.GetType() == core::Device::DeviceType::CUDA) {
CUDA_CALL(ComputeOdometryInformationMatrixCUDA, source_vertex_map,
target_vertex_map, intrinsic, source_to_target,
square_dist_thr, information);
} else {
utility::LogError("Unimplemented device.");
}
}

} // namespace odometry
} // namespace kernel
} // namespace pipelines
Expand Down
7 changes: 7 additions & 0 deletions cpp/open3d/t/pipelines/kernel/RGBDOdometry.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,13 @@ void ComputeOdometryResultHybrid(const core::Tensor &source_depth,
const float depth_huber_delta,
const float intensity_huber_delta);

void ComputeOdometryInformationMatrix(const core::Tensor &source_vertex_map,
const core::Tensor &target_vertex_map,
const core::Tensor &intrinsic,
const core::Tensor &source_to_target,
const float square_dist_thr,
core::Tensor &information);

} // namespace odometry
} // namespace kernel
} // namespace pipelines
Expand Down
155 changes: 119 additions & 36 deletions cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,17 @@ namespace pipelines {
namespace kernel {
namespace odometry {

void ComputeOdometryResultPointToPlaneCPU(
const core::Tensor& source_vertex_map,
const core::Tensor& target_vertex_map,
const core::Tensor& target_normal_map,
const core::Tensor& intrinsics,
const core::Tensor& init_source_to_target,
core::Tensor& delta,
float& inlier_residual,
int& inlier_count,
const float depth_outlier_trunc,
const float depth_huber_delta) {
void ComputeOdometryInformationMatrixCPU(const core::Tensor& source_vertex_map,
const core::Tensor& target_vertex_map,
const core::Tensor& intrinsic,
const core::Tensor& source_to_target,
const float square_dist_thr,
core::Tensor& information) {
NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
NDArrayIndexer target_normal_indexer(target_normal_map, 2);

core::Tensor trans = init_source_to_target;
t::geometry::kernel::TransformIndexer ti(intrinsics, trans);
core::Tensor trans = source_to_target;
t::geometry::kernel::TransformIndexer ti(intrinsic, trans);

// Output
int64_t rows = source_vertex_indexer.GetShape(0);
Expand All @@ -68,59 +62,67 @@ void ComputeOdometryResultPointToPlaneCPU(

int64_t n = rows * cols;

std::vector<float> A_1x29(29, 0.0);
std::vector<float> A_1x21(21, 0.0);

#ifdef _MSC_VER
std::vector<float> zeros_29(29, 0.0);
A_1x29 = tbb::parallel_reduce(
tbb::blocked_range<int>(0, n), zeros_29,
std::vector<float> zeros_21(21, 0.0);
A_1x21 = tbb::parallel_reduce(
tbb::blocked_range<int>(0, n), zeros_21,
[&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
for (int workload_idx = r.begin(); workload_idx < r.end();
workload_idx++) {
#else
float* A_reduction = A_1x29.data();
#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
float* A_reduction = A_1x21.data();
#pragma omp parallel for reduction(+ : A_reduction[:21]) schedule(static) num_threads(utility::EstimateMaxThreads())
for (int workload_idx = 0; workload_idx < n; workload_idx++) {
#endif
int y = workload_idx / cols;
int x = workload_idx % cols;

float J_ij[6];
float r;
float J_x[6], J_y[6], J_z[6];
float rx, ry, rz;

bool valid = GetJacobianPointToPlane(
x, y, depth_outlier_trunc, source_vertex_indexer,
target_vertex_indexer, target_normal_indexer, ti,
J_ij, r);
bool valid = GetJacobianPointToPoint(
x, y, square_dist_thr, source_vertex_indexer,
target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry,
rz);

if (valid) {
float d_huber = HuberDeriv(r, depth_huber_delta);
float r_huber = HuberLoss(r, depth_huber_delta);
for (int i = 0, j = 0; j < 6; j++) {
for (int k = 0; k <= j; k++) {
A_reduction[i] += J_ij[j] * J_ij[k];
A_reduction[i] += J_x[j] * J_x[k];
A_reduction[i] += J_y[j] * J_y[k];
A_reduction[i] += J_z[j] * J_z[k];
i++;
}
A_reduction[21 + j] += J_ij[j] * d_huber;
}
A_reduction[27] += r_huber;
A_reduction[28] += 1;
}
}
#ifdef _MSC_VER
return A_reduction;
},
// TBB: Defining reduction operation.
[&](std::vector<float> a, std::vector<float> b) {
std::vector<float> result(29);
for (int j = 0; j < 29; j++) {
std::vector<float> result(21);
for (int j = 0; j < 21; j++) {
result[j] = a[j] + b[j];
}
return result;
});
#endif
core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
core::Tensor A_reduction_tensor(A_1x21, {21}, core::Float32, device);
float* reduction_ptr = A_reduction_tensor.GetDataPtr<float>();

information = core::Tensor::Empty({6, 6}, core::Float64, device);
double* info_ptr = information.GetDataPtr<double>();

for (int j = 0; j < 6; j++) {
const int64_t reduction_idx = ((j * (j + 1)) / 2);
for (int k = 0; k <= j; k++) {
info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k];
info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k];
}
}
}

void ComputeOdometryResultIntensityCPU(
Expand Down Expand Up @@ -326,6 +328,87 @@ void ComputeOdometryResultHybridCPU(const core::Tensor& source_depth,
DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
}

void ComputeOdometryResultPointToPlaneCPU(
const core::Tensor& source_vertex_map,
const core::Tensor& target_vertex_map,
const core::Tensor& target_normal_map,
const core::Tensor& intrinsics,
const core::Tensor& init_source_to_target,
core::Tensor& delta,
float& inlier_residual,
int& inlier_count,
const float depth_outlier_trunc,
const float depth_huber_delta) {
NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
NDArrayIndexer target_normal_indexer(target_normal_map, 2);

core::Tensor trans = init_source_to_target;
t::geometry::kernel::TransformIndexer ti(intrinsics, trans);

// Output
int64_t rows = source_vertex_indexer.GetShape(0);
int64_t cols = source_vertex_indexer.GetShape(1);

core::Device device = source_vertex_map.GetDevice();

int64_t n = rows * cols;

std::vector<float> A_1x29(29, 0.0);

#ifdef _MSC_VER
std::vector<float> zeros_29(29, 0.0);
A_1x29 = tbb::parallel_reduce(
tbb::blocked_range<int>(0, n), zeros_29,
[&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
for (int workload_idx = r.begin(); workload_idx < r.end();
workload_idx++) {
#else
float* A_reduction = A_1x29.data();
#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
for (int workload_idx = 0; workload_idx < n; workload_idx++) {
#endif
int y = workload_idx / cols;
int x = workload_idx % cols;

float J_ij[6];
float r;

bool valid = GetJacobianPointToPlane(
x, y, depth_outlier_trunc, source_vertex_indexer,
target_vertex_indexer, target_normal_indexer, ti,
J_ij, r);

if (valid) {
float d_huber = HuberDeriv(r, depth_huber_delta);
float r_huber = HuberLoss(r, depth_huber_delta);
for (int i = 0, j = 0; j < 6; j++) {
for (int k = 0; k <= j; k++) {
A_reduction[i] += J_ij[j] * J_ij[k];
i++;
}
A_reduction[21 + j] += J_ij[j] * d_huber;
}
A_reduction[27] += r_huber;
A_reduction[28] += 1;
}
}
#ifdef _MSC_VER
return A_reduction;
},
// TBB: Defining reduction operation.
[&](std::vector<float> a, std::vector<float> b) {
std::vector<float> result(29);
for (int j = 0; j < 29; j++) {
result[j] = a[j] + b[j];
}
return result;
});
#endif
core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
}

} // namespace odometry
} // namespace kernel
} // namespace pipelines
Expand Down
98 changes: 94 additions & 4 deletions cpp/open3d/t/pipelines/kernel/RGBDOdometryCUDA.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@ namespace kernel {
namespace odometry {

const int kBlockSize = 256;
const int kReduceDim = 29; // 21 (JtJ) + 6 (Jtr) + 1 (inlier) + 1 (r)
const int kJtJDim = 21;
const int kJtrDim = 6;
const int kReduceDim =
kJtJDim + kJtrDim + 1 + 1; // 21 (JtJ) + 6 (Jtr) + 1 (inlier) + 1 (r)
typedef utility::MiniVec<float, kReduceDim> ReduceVec;
typedef cub::BlockReduce<ReduceVec, kBlockSize> BlockReduce;

Expand All @@ -67,7 +70,7 @@ __global__ void ComputeOdometryResultPointToPlaneCUDAKernel(
int x = workload % cols;
const int tid = threadIdx.x;

ReduceVec local_sum;
ReduceVec local_sum(0.0f);
if (workload < rows * cols) {
float J[6] = {0};
float r = 0;
Expand Down Expand Up @@ -162,7 +165,7 @@ __global__ void ComputeOdometryResultIntensityCUDAKernel(
int x = workload % cols;
const int tid = threadIdx.x;
ReduceVec local_sum;
ReduceVec local_sum(0.0f);
if (workload < rows * cols) {
float J[6] = {0};
float r = 0;
Expand Down Expand Up @@ -274,7 +277,7 @@ __global__ void ComputeOdometryResultHybridCUDAKernel(
int x = workload % cols;
const int tid = threadIdx.x;
ReduceVec local_sum;
ReduceVec local_sum(0.0f);
if (workload < rows * cols) {
float J_I[6] = {0}, J_D[6] = {0};
float r_I = 0, r_D = 0;
Expand Down Expand Up @@ -372,6 +375,93 @@ void ComputeOdometryResultHybridCUDA(const core::Tensor& source_depth,
DecodeAndSolve6x6(global_sum, delta, inlier_residual, inlier_count);
}
__global__ void ComputeOdometryInformationMatrixCUDAKernel(
NDArrayIndexer source_vertex_indexer,
NDArrayIndexer target_vertex_indexer,
TransformIndexer ti,
float* global_sum,
int rows,
int cols,
const float square_dist_thr) {
__shared__ typename BlockReduce::TempStorage temp_storage;
const int workload = threadIdx.x + blockIdx.x * blockDim.x;
int y = workload / cols;
int x = workload % cols;
const int tid = threadIdx.x;
ReduceVec local_sum(0.0f);
if (workload < rows * cols) {
float J_x[6], J_y[6], J_z[6];
float rx = 0, ry = 0, rz = 0;
bool valid = GetJacobianPointToPoint(
x, y, square_dist_thr, source_vertex_indexer,
target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry, rz);
// Dump J, r into JtJ and Jtr
int offset = 0;
for (int i = 0; i < 6; ++i) {
for (int j = 0; j <= i; ++j) {
local_sum[offset] = valid ? J_x[i] * J_x[j] : 0;
local_sum[offset] += valid ? J_y[i] * J_y[j] : 0;
local_sum[offset] += valid ? J_z[i] * J_z[j] : 0;
offset++;
}
}
}
auto result = BlockReduce(temp_storage).Sum(local_sum);
if (tid == 0) {
#pragma unroll
for (int i = 0; i < kJtJDim; ++i) {
atomicAdd(&global_sum[i], result[i]);
}
}
}
void ComputeOdometryInformationMatrixCUDA(const core::Tensor& source_vertex_map,
const core::Tensor& target_vertex_map,
const core::Tensor& intrinsic,
const core::Tensor& source_to_target,
const float square_dist_thr,
core::Tensor& information) {
NDArrayIndexer source_vertex_indexer(source_vertex_map, 2);
NDArrayIndexer target_vertex_indexer(target_vertex_map, 2);
core::Device device = source_vertex_map.GetDevice();
core::Tensor trans = source_to_target;
t::geometry::kernel::TransformIndexer ti(intrinsic, trans);
const int64_t rows = source_vertex_indexer.GetShape(0);
const int64_t cols = source_vertex_indexer.GetShape(1);
core::Tensor global_sum =
core::Tensor::Zeros({kJtJDim}, core::Float32, device);
float* global_sum_ptr = global_sum.GetDataPtr<float>();
const dim3 blocks((cols * rows + kBlockSize - 1) / kBlockSize);
const dim3 threads(kBlockSize);
ComputeOdometryInformationMatrixCUDAKernel<<<blocks, threads, 0,
core::cuda::GetStream()>>>(
source_vertex_indexer, target_vertex_indexer, ti, global_sum_ptr,
rows, cols, square_dist_thr);
core::cuda::Synchronize();
// 21 => 6x6
const core::Device host(core::Device("CPU:0"));
information = core::Tensor::Empty({6, 6}, core::Float64, host);
global_sum = global_sum.To(host, core::Float64);
double* info_ptr = information.GetDataPtr<double>();
double* reduction_ptr = global_sum.GetDataPtr<double>();
for (int j = 0; j < 6; j++) {
const int64_t reduction_idx = ((j * (j + 1)) / 2);
for (int k = 0; k <= j; k++) {
info_ptr[j * 6 + k] = reduction_ptr[reduction_idx + k];
info_ptr[k * 6 + j] = reduction_ptr[reduction_idx + k];
}
}
}
} // namespace odometry
} // namespace kernel
} // namespace pipelines
Expand Down
Loading

0 comments on commit 450a053

Please sign in to comment.