Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update cmake to use CUDA #141

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 44 additions & 17 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.10.0)
project(fast_gicp)

option(BUILD_VGICP_CUDA "Build GPU-powered VGICP" OFF)
option(BUILD_VGICP_CUDA "Build GPU-powered VGICP" ON)
option(BUILD_apps "Build application programs" ON)
option(BUILD_test "Build test programs" OFF)
option(BUILD_PYTHON_BINDINGS "Build python bindings" OFF)
Expand All @@ -28,14 +28,30 @@ endif()

find_package(OpenMP)
if (OPENMP_FOUND)
message("OPENMP is found! Flags: ${OpenMP_C_FLAGS}" )
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()

if(BUILD_VGICP_CUDA)
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
message("BUILDING WITH CUDA ENABLED!")
enable_language(CUDA)
cmake_policy(SET CMP0104 OLD)
#set_property(TARGET fast_gicp PROPERTY CUDA_ARCHITECTURES OFF)

# find_package(CUDA REQUIRED)
# include_directories(${CUDA_INCLUDE_DIRS})
# link_directories(${CUDA_LIBRARY_DIRS})
message("CUDA include dirs: ${CUDA_INCLUDE_DIRS}")
message("CUDA_LIBRARY_DIRS: ${CUDA_LIBRARY_DIRS}")

find_package(CUDAToolkit REQUIRED)
include_directories(${CUDAToolkit_INCLUDE_DIRS})
link_directories(${CUDAToolkit_LIBRARY_DIRS})

message("CUDAToolkit_INCLUDE_DIRS: ${CUDAToolkit_INCLUDE_DIRS}")
message("CUDAToolkit_LIBRARY_DIRS: ${CUDAToolkit_LIBRARY_DIRS}")

endif()

###################################
Expand All @@ -58,14 +74,18 @@ add_library(fast_gicp SHARED
src/fast_gicp/gicp/fast_gicp_st.cpp
src/fast_gicp/gicp/fast_vgicp.cpp
)

set_property(TARGET fast_gicp PROPERTY CUDA_STANDARD 17)

target_link_libraries(fast_gicp
${PCL_LIBRARIES}
dl
)
if (OPENMP_FOUND)
if (TARGET OpenMP::OpenMP_CXX)
target_link_libraries(fast_gicp OpenMP::OpenMP_CXX)
endif ()
endif ()
# if (OPENMP_FOUND)
# if (TARGET OpenMP::OpenMP_CXX)
# target_link_libraries(fast_gicp OpenMP::OpenMP_CXX)
# endif ()
# endif ()
target_include_directories(fast_gicp PUBLIC
include
${PCL_INCLUDE_DIRS}
Expand All @@ -76,17 +96,20 @@ target_include_directories(fast_gicp PUBLIC
if(BUILD_apps)
add_executable(gicp_align src/align.cpp)
add_dependencies(gicp_align fast_gicp)
set_property(TARGET gicp_align PROPERTY CUDA_STANDARD 17)
target_link_libraries(gicp_align
${PCL_LIBRARIES}
fast_gicp
dl
)

add_executable(gicp_kitti src/kitti.cpp)
add_dependencies(gicp_kitti fast_gicp)
target_link_libraries(gicp_kitti
${PCL_LIBRARIES}
fast_gicp
)
# add_executable(gicp_kitti src/kitti.cpp)
# add_dependencies(gicp_kitti fast_gicp)
# set_property(TARGET gicp_kitti PROPERTY CUDA_STANDARD 17)
# target_link_libraries(gicp_kitti
# ${PCL_LIBRARIES}
# fast_gicp
# )
endif()

### Python bindings ###
Expand All @@ -102,6 +125,7 @@ if(BUILD_PYTHON_BINDINGS)
)
target_link_libraries(pygicp PRIVATE
fast_gicp
dl
)
endif()

Expand All @@ -110,7 +134,7 @@ if(BUILD_VGICP_CUDA)
set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
add_definitions(-DUSE_VGICP_CUDA)

cuda_add_library(fast_vgicp_cuda SHARED
add_library(fast_vgicp_cuda SHARED
src/fast_gicp/cuda/fast_vgicp_cuda.cu
src/fast_gicp/cuda/brute_force_knn.cu
src/fast_gicp/cuda/covariance_estimation.cu
Expand All @@ -123,6 +147,7 @@ if(BUILD_VGICP_CUDA)
src/fast_gicp/cuda/ndt_cuda.cu
src/fast_gicp/cuda/ndt_compute_derivatives.cu
)
set_property(TARGET fast_vgicp_cuda PROPERTY CUDA_STANDARD 17)
target_include_directories(fast_vgicp_cuda PRIVATE
include
thirdparty/Eigen
Expand All @@ -131,8 +156,9 @@ if(BUILD_VGICP_CUDA)
)
target_link_libraries(fast_vgicp_cuda
${catkin_LIBRARIES}
dl
)
cuda_add_cublas_to_target(fast_vgicp_cuda)
# cuda_add_cublas_to_target(fast_vgicp_cuda)

# add vgicp_cuda to libfast_gicp
target_sources(fast_gicp PRIVATE
Expand All @@ -141,6 +167,7 @@ if(BUILD_VGICP_CUDA)
)
target_link_libraries(fast_gicp
fast_vgicp_cuda
dl
)
add_dependencies(fast_gicp fast_vgicp_cuda)
if(catkin_FOUND)
Expand Down
2 changes: 1 addition & 1 deletion include/fast_gicp/ndt/ndt_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,4 @@ class NDTCuda : public LsqRegistration<PointSource, PointTarget> {
};
} // namespace fast_gicp

#endif
#endif
52 changes: 28 additions & 24 deletions src/align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ void test_pcl(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr&

// 100 times
t1 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; i++) {
for (int i = 0; i < 10; i++) {
reg.setInputTarget(target);
reg.setInputSource(source);
reg.align(*aligned);
}
t2 = std::chrono::high_resolution_clock::now();
double multi = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() / 1e6;
std::cout << "100times:" << multi << "[msec] fitness_score:" << fitness_score << std::endl;
std::cout << "Avg over 10 runs:" << multi / 10.0 << "[msec] fitness_score:" << fitness_score << std::endl;
}

// benchmark for fast_gicp registration methods
Expand All @@ -71,7 +71,7 @@ void test(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr& tar

// 100 times
t1 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; i++) {
for (int i = 0; i < 10; i++) {
reg.clearTarget();
reg.clearSource();
reg.setInputTarget(target);
Expand All @@ -80,14 +80,14 @@ void test(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr& tar
}
t2 = std::chrono::high_resolution_clock::now();
double multi = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() / 1e6;
std::cout << "100times:" << multi << "[msec] " << std::flush;
std::cout << "Avg over 10 runs:" << multi / 10.0 << "[msec] " << std::flush;

// for some tasks like odometry calculation,
// you can reuse the covariances of a source point cloud in the next registration
t1 = std::chrono::high_resolution_clock::now();
pcl::PointCloud<pcl::PointXYZ>::ConstPtr target_ = target;
pcl::PointCloud<pcl::PointXYZ>::ConstPtr source_ = source;
for (int i = 0; i < 100; i++) {
for (int i = 0; i < 10; i++) {
reg.swapSourceAndTarget();
reg.clearSource();

Expand All @@ -100,7 +100,7 @@ void test(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr& tar
t2 = std::chrono::high_resolution_clock::now();
double reuse = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() / 1e6;

std::cout << "100times_reuse:" << reuse << "[msec] fitness_score:" << fitness_score << std::endl;
std::cout << "Avg over 10times_reuse:" << reuse / 10.0 << "[msec] fitness_score:" << fitness_score << std::endl;
}

/**
Expand Down Expand Up @@ -157,25 +157,27 @@ int main(int argc, char** argv) {
pcl_ndt.setResolution(1.0);
test_pcl(pcl_ndt, target_cloud, source_cloud);

std::cout << "--- fgicp_st ---" << std::endl;
fast_gicp::FastGICPSingleThread<pcl::PointXYZ, pcl::PointXYZ> fgicp_st;
test(fgicp_st, target_cloud, source_cloud);
/*
std::cout << "--- fgicp_st ---" << std::endl;
fast_gicp::FastGICPSingleThread<pcl::PointXYZ, pcl::PointXYZ> fgicp_st;
test(fgicp_st, target_cloud, source_cloud);

std::cout << "--- fgicp_mt ---" << std::endl;
fast_gicp::FastGICP<pcl::PointXYZ, pcl::PointXYZ> fgicp_mt;
// fast_gicp uses all the CPU cores by default
// fgicp_mt.setNumThreads(8);
test(fgicp_mt, target_cloud, source_cloud);
std::cout << "--- fgicp_mt ---" << std::endl;
fast_gicp::FastGICP<pcl::PointXYZ, pcl::PointXYZ> fgicp_mt;
// fast_gicp uses all the CPU cores by default
// fgicp_mt.setNumThreads(8);
test(fgicp_mt, target_cloud, source_cloud);

std::cout << "--- vgicp_st ---" << std::endl;
fast_gicp::FastVGICP<pcl::PointXYZ, pcl::PointXYZ> vgicp;
vgicp.setResolution(1.0);
vgicp.setNumThreads(1);
test(vgicp, target_cloud, source_cloud);
std::cout << "--- vgicp_st ---" << std::endl;
fast_gicp::FastVGICP<pcl::PointXYZ, pcl::PointXYZ> vgicp;
vgicp.setResolution(1.0);
vgicp.setNumThreads(1);
test(vgicp, target_cloud, source_cloud);

std::cout << "--- vgicp_mt ---" << std::endl;
vgicp.setNumThreads(omp_get_max_threads());
test(vgicp, target_cloud, source_cloud);
std::cout << "--- vgicp_mt ---" << std::endl;
vgicp.setNumThreads(omp_get_max_threads());
test(vgicp, target_cloud, source_cloud);
*/

#ifdef USE_VGICP_CUDA
std::cout << "--- ndt_cuda (P2D) ---" << std::endl;
Expand All @@ -188,13 +190,15 @@ int main(int argc, char** argv) {
ndt_cuda.setDistanceMode(fast_gicp::NDTDistanceMode::D2D);
test(ndt_cuda, target_cloud, source_cloud);

std::cout << "--- vgicp_cuda (parallel_kdtree) ---" << std::endl;
fast_gicp::FastVGICPCuda<pcl::PointXYZ, pcl::PointXYZ> vgicp_cuda;
vgicp_cuda.setResolution(1.0);
/*
// vgicp_cuda uses CPU-based parallel KDTree in covariance estimation by default
// on a modern CPU, it is faster than GPU_BRUTEFORCE
// vgicp_cuda.setNearestNeighborSearchMethod(fast_gicp::NearestNeighborMethod::CPU_PARALLEL_KDTREE);
std::cout << "--- vgicp_cuda (parallel_kdtree) ---" << std::endl;
vgicp_cuda.setNearestNeighborSearchMethod(fast_gicp::NearestNeighborMethod::CPU_PARALLEL_KDTREE);
test(vgicp_cuda, target_cloud, source_cloud);
*/

std::cout << "--- vgicp_cuda (gpu_bruteforce) ---" << std::endl;
// use GPU-based bruteforce nearest neighbor search for covariance estimation
Expand Down