From 14fc12ba95e78f6b285d59720ff4f317921b221e Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Fri, 15 Dec 2023 17:15:17 -0700 Subject: [PATCH 1/3] Updated the CMakeLists.txt to use 'enable_language(CUDA)' instead of find_package(CUDA) --- CMakeLists.txt | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02d722d..7ae366a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.10.0) project(fast_gicp) -option(BUILD_VGICP_CUDA "Build GPU-powered VGICP" OFF) +option(BUILD_VGICP_CUDA "Build GPU-powered VGICP" ON) option(BUILD_apps "Build application programs" ON) option(BUILD_test "Build test programs" OFF) option(BUILD_PYTHON_BINDINGS "Build python bindings" OFF) @@ -28,14 +28,30 @@ endif() find_package(OpenMP) if (OPENMP_FOUND) + message("OPENMP is found! Flags: ${OpenMP_C_FLAGS}" ) set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() if(BUILD_VGICP_CUDA) - find_package(CUDA REQUIRED) - include_directories(${CUDA_INCLUDE_DIRS}) - link_directories(${CUDA_LIBRARY_DIRS}) + message("BUILDING WITH CUDA ENABLED!") + enable_language(CUDA) + cmake_policy(SET CMP0104 OLD) + #set_property(TARGET fast_gicp PROPERTY CUDA_ARCHITECTURES OFF) + + # find_package(CUDA REQUIRED) + # include_directories(${CUDA_INCLUDE_DIRS}) + # link_directories(${CUDA_LIBRARY_DIRS}) + message("CUDA include dirs: ${CUDA_INCLUDE_DIRS}") + message("CUDA_LIBRARY_DIRS: ${CUDA_LIBRARY_DIRS}") + + find_package(CUDAToolkit REQUIRED) + include_directories(${CUDAToolkit_INCLUDE_DIRS}) + link_directories(${CUDAToolkit_LIBRARY_DIRS}) + + message("CUDAToolkit_INCLUDE_DIRS: ${CUDAToolkit_INCLUDE_DIRS}") + message("CUDAToolkit_LIBRARY_DIRS: ${CUDAToolkit_LIBRARY_DIRS}") + endif() ################################### @@ -110,7 +126,7 @@ if(BUILD_VGICP_CUDA) set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") add_definitions(-DUSE_VGICP_CUDA) - cuda_add_library(fast_vgicp_cuda SHARED + add_library(fast_vgicp_cuda SHARED src/fast_gicp/cuda/fast_vgicp_cuda.cu src/fast_gicp/cuda/brute_force_knn.cu src/fast_gicp/cuda/covariance_estimation.cu @@ -132,7 +148,7 @@ if(BUILD_VGICP_CUDA) target_link_libraries(fast_vgicp_cuda ${catkin_LIBRARIES} ) - cuda_add_cublas_to_target(fast_vgicp_cuda) + # cuda_add_cublas_to_target(fast_vgicp_cuda) # add vgicp_cuda to libfast_gicp target_sources(fast_gicp PRIVATE From 3008292c5b260eef5a56ad27e0eeb4b2c28d511b Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Mon, 18 Dec 2023 10:28:57 -0700 Subject: [PATCH 2/3] Speed up align.cpp so that only the fast algos are run, and just 10 times each instead of 100 --- include/fast_gicp/ndt/ndt_cuda.hpp | 2 +- src/align.cpp | 52 ++++++++++++++++-------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/include/fast_gicp/ndt/ndt_cuda.hpp b/include/fast_gicp/ndt/ndt_cuda.hpp index bf3a2d5..5ca07f6 100644 --- a/include/fast_gicp/ndt/ndt_cuda.hpp +++ b/include/fast_gicp/ndt/ndt_cuda.hpp @@ -71,4 +71,4 @@ class NDTCuda : public LsqRegistration { }; } // namespace fast_gicp -#endif \ No newline at end of file +#endif diff --git a/src/align.cpp b/src/align.cpp index 4006778..8b41898 100644 --- a/src/align.cpp +++ b/src/align.cpp @@ -37,14 +37,14 @@ void test_pcl(Registration& reg, const pcl::PointCloud::ConstPtr& // 100 times t1 = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < 100; i++) { + for (int i = 0; i < 10; i++) { reg.setInputTarget(target); reg.setInputSource(source); reg.align(*aligned); } t2 = std::chrono::high_resolution_clock::now(); double multi = std::chrono::duration_cast(t2 - t1).count() / 1e6; - std::cout << "100times:" << multi << "[msec] fitness_score:" << fitness_score << std::endl; + std::cout << "Avg over 10 runs:" << multi / 10.0 << "[msec] fitness_score:" << fitness_score << std::endl; } // benchmark for fast_gicp registration methods @@ -71,7 +71,7 @@ void test(Registration& reg, const pcl::PointCloud::ConstPtr& tar // 100 times t1 = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < 100; i++) { + for (int i = 0; i < 10; i++) { reg.clearTarget(); reg.clearSource(); reg.setInputTarget(target); @@ -80,14 +80,14 @@ void test(Registration& reg, const pcl::PointCloud::ConstPtr& tar } t2 = std::chrono::high_resolution_clock::now(); double multi = std::chrono::duration_cast(t2 - t1).count() / 1e6; - std::cout << "100times:" << multi << "[msec] " << std::flush; + std::cout << "Avg over 10 runs:" << multi / 10.0 << "[msec] " << std::flush; // for some tasks like odometry calculation, // you can reuse the covariances of a source point cloud in the next registration t1 = std::chrono::high_resolution_clock::now(); pcl::PointCloud::ConstPtr target_ = target; pcl::PointCloud::ConstPtr source_ = source; - for (int i = 0; i < 100; i++) { + for (int i = 0; i < 10; i++) { reg.swapSourceAndTarget(); reg.clearSource(); @@ -100,7 +100,7 @@ void test(Registration& reg, const pcl::PointCloud::ConstPtr& tar t2 = std::chrono::high_resolution_clock::now(); double reuse = std::chrono::duration_cast(t2 - t1).count() / 1e6; - std::cout << "100times_reuse:" << reuse << "[msec] fitness_score:" << fitness_score << std::endl; + std::cout << "Avg over 10times_reuse:" << reuse / 10.0 << "[msec] fitness_score:" << fitness_score << std::endl; } /** @@ -157,25 +157,27 @@ int main(int argc, char** argv) { pcl_ndt.setResolution(1.0); test_pcl(pcl_ndt, target_cloud, source_cloud); - std::cout << "--- fgicp_st ---" << std::endl; - fast_gicp::FastGICPSingleThread fgicp_st; - test(fgicp_st, target_cloud, source_cloud); + /* + std::cout << "--- fgicp_st ---" << std::endl; + fast_gicp::FastGICPSingleThread fgicp_st; + test(fgicp_st, target_cloud, source_cloud); - std::cout << "--- fgicp_mt ---" << std::endl; - fast_gicp::FastGICP fgicp_mt; - // fast_gicp uses all the CPU cores by default - // fgicp_mt.setNumThreads(8); - test(fgicp_mt, target_cloud, source_cloud); + std::cout << "--- fgicp_mt ---" << std::endl; + fast_gicp::FastGICP fgicp_mt; + // fast_gicp uses all the CPU cores by default + // fgicp_mt.setNumThreads(8); + test(fgicp_mt, target_cloud, source_cloud); - std::cout << "--- vgicp_st ---" << std::endl; - fast_gicp::FastVGICP vgicp; - vgicp.setResolution(1.0); - vgicp.setNumThreads(1); - test(vgicp, target_cloud, source_cloud); + std::cout << "--- vgicp_st ---" << std::endl; + fast_gicp::FastVGICP vgicp; + vgicp.setResolution(1.0); + vgicp.setNumThreads(1); + test(vgicp, target_cloud, source_cloud); - std::cout << "--- vgicp_mt ---" << std::endl; - vgicp.setNumThreads(omp_get_max_threads()); - test(vgicp, target_cloud, source_cloud); + std::cout << "--- vgicp_mt ---" << std::endl; + vgicp.setNumThreads(omp_get_max_threads()); + test(vgicp, target_cloud, source_cloud); + */ #ifdef USE_VGICP_CUDA std::cout << "--- ndt_cuda (P2D) ---" << std::endl; @@ -188,13 +190,15 @@ int main(int argc, char** argv) { ndt_cuda.setDistanceMode(fast_gicp::NDTDistanceMode::D2D); test(ndt_cuda, target_cloud, source_cloud); - std::cout << "--- vgicp_cuda (parallel_kdtree) ---" << std::endl; fast_gicp::FastVGICPCuda vgicp_cuda; vgicp_cuda.setResolution(1.0); + /* // vgicp_cuda uses CPU-based parallel KDTree in covariance estimation by default // on a modern CPU, it is faster than GPU_BRUTEFORCE - // vgicp_cuda.setNearestNeighborSearchMethod(fast_gicp::NearestNeighborMethod::CPU_PARALLEL_KDTREE); + std::cout << "--- vgicp_cuda (parallel_kdtree) ---" << std::endl; + vgicp_cuda.setNearestNeighborSearchMethod(fast_gicp::NearestNeighborMethod::CPU_PARALLEL_KDTREE); test(vgicp_cuda, target_cloud, source_cloud); + */ std::cout << "--- vgicp_cuda (gpu_bruteforce) ---" << std::endl; // use GPU-based bruteforce nearest neighbor search for covariance estimation From 80daf35cf7223c7adbb1801c93dcdb602ea2656b Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Mon, 18 Dec 2023 15:03:42 -0700 Subject: [PATCH 3/3] Updated CMakeLists.txt. Getting linker errors in glyd. --- CMakeLists.txt | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ae366a..ad7c869 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,14 +74,18 @@ add_library(fast_gicp SHARED src/fast_gicp/gicp/fast_gicp_st.cpp src/fast_gicp/gicp/fast_vgicp.cpp ) + +set_property(TARGET fast_gicp PROPERTY CUDA_STANDARD 17) + target_link_libraries(fast_gicp ${PCL_LIBRARIES} + dl ) -if (OPENMP_FOUND) - if (TARGET OpenMP::OpenMP_CXX) - target_link_libraries(fast_gicp OpenMP::OpenMP_CXX) - endif () -endif () +# if (OPENMP_FOUND) +# if (TARGET OpenMP::OpenMP_CXX) +# target_link_libraries(fast_gicp OpenMP::OpenMP_CXX) +# endif () +# endif () target_include_directories(fast_gicp PUBLIC include ${PCL_INCLUDE_DIRS} @@ -92,17 +96,20 @@ target_include_directories(fast_gicp PUBLIC if(BUILD_apps) add_executable(gicp_align src/align.cpp) add_dependencies(gicp_align fast_gicp) + set_property(TARGET gicp_align PROPERTY CUDA_STANDARD 17) target_link_libraries(gicp_align ${PCL_LIBRARIES} fast_gicp + dl ) - add_executable(gicp_kitti src/kitti.cpp) - add_dependencies(gicp_kitti fast_gicp) - target_link_libraries(gicp_kitti - ${PCL_LIBRARIES} - fast_gicp - ) + # add_executable(gicp_kitti src/kitti.cpp) + # add_dependencies(gicp_kitti fast_gicp) + # set_property(TARGET gicp_kitti PROPERTY CUDA_STANDARD 17) + # target_link_libraries(gicp_kitti + # ${PCL_LIBRARIES} + # fast_gicp + # ) endif() ### Python bindings ### @@ -118,6 +125,7 @@ if(BUILD_PYTHON_BINDINGS) ) target_link_libraries(pygicp PRIVATE fast_gicp + dl ) endif() @@ -139,6 +147,7 @@ if(BUILD_VGICP_CUDA) src/fast_gicp/cuda/ndt_cuda.cu src/fast_gicp/cuda/ndt_compute_derivatives.cu ) + set_property(TARGET fast_vgicp_cuda PROPERTY CUDA_STANDARD 17) target_include_directories(fast_vgicp_cuda PRIVATE include thirdparty/Eigen @@ -147,6 +156,7 @@ if(BUILD_VGICP_CUDA) ) target_link_libraries(fast_vgicp_cuda ${catkin_LIBRARIES} + dl ) # cuda_add_cublas_to_target(fast_vgicp_cuda) @@ -157,6 +167,7 @@ if(BUILD_VGICP_CUDA) ) target_link_libraries(fast_gicp fast_vgicp_cuda + dl ) add_dependencies(fast_gicp fast_vgicp_cuda) if(catkin_FOUND)