koide3 · neil-glydways · Dec 16, 2023 · Dec 18, 2023 · Dec 18, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.10.0)
 project(fast_gicp)
 
-option(BUILD_VGICP_CUDA "Build GPU-powered VGICP" OFF)
+option(BUILD_VGICP_CUDA "Build GPU-powered VGICP" ON)
 option(BUILD_apps "Build application programs" ON)
 option(BUILD_test "Build test programs" OFF)
 option(BUILD_PYTHON_BINDINGS "Build python bindings" OFF)
@@ -28,14 +28,30 @@ endif()
 
 find_package(OpenMP)
 if (OPENMP_FOUND)
+  message("OPENMP is found!  Flags: ${OpenMP_C_FLAGS}" )
   set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 endif()
 
 if(BUILD_VGICP_CUDA)
-  find_package(CUDA REQUIRED)
-  include_directories(${CUDA_INCLUDE_DIRS})
-  link_directories(${CUDA_LIBRARY_DIRS})
+  message("BUILDING WITH CUDA ENABLED!")
+  enable_language(CUDA)
+  cmake_policy(SET CMP0104 OLD)
+  #set_property(TARGET fast_gicp PROPERTY CUDA_ARCHITECTURES OFF)
+
+  # find_package(CUDA REQUIRED)
+  # include_directories(${CUDA_INCLUDE_DIRS})
+  # link_directories(${CUDA_LIBRARY_DIRS})
+  message("CUDA include dirs: ${CUDA_INCLUDE_DIRS}")
+  message("CUDA_LIBRARY_DIRS: ${CUDA_LIBRARY_DIRS}")
+
+  find_package(CUDAToolkit REQUIRED)
+  include_directories(${CUDAToolkit_INCLUDE_DIRS})
+  link_directories(${CUDAToolkit_LIBRARY_DIRS})
+
+  message("CUDAToolkit_INCLUDE_DIRS: ${CUDAToolkit_INCLUDE_DIRS}")
+  message("CUDAToolkit_LIBRARY_DIRS: ${CUDAToolkit_LIBRARY_DIRS}")
+
 endif()
 
 ###################################
@@ -58,14 +74,18 @@ add_library(fast_gicp SHARED
   src/fast_gicp/gicp/fast_gicp_st.cpp
   src/fast_gicp/gicp/fast_vgicp.cpp
 )
+
+set_property(TARGET fast_gicp PROPERTY CUDA_STANDARD 17)
+
 target_link_libraries(fast_gicp
   ${PCL_LIBRARIES}
+  dl
 )
-if (OPENMP_FOUND)
-    if (TARGET OpenMP::OpenMP_CXX)
-        target_link_libraries(fast_gicp OpenMP::OpenMP_CXX)
-    endif ()
-endif ()
+# if (OPENMP_FOUND)
+#     if (TARGET OpenMP::OpenMP_CXX)
+#         target_link_libraries(fast_gicp OpenMP::OpenMP_CXX)
+#     endif ()
+# endif ()
 target_include_directories(fast_gicp PUBLIC
   include
   ${PCL_INCLUDE_DIRS}
@@ -76,17 +96,20 @@ target_include_directories(fast_gicp PUBLIC
 if(BUILD_apps)
   add_executable(gicp_align src/align.cpp)
   add_dependencies(gicp_align fast_gicp)
+  set_property(TARGET gicp_align PROPERTY CUDA_STANDARD 17)
   target_link_libraries(gicp_align
     ${PCL_LIBRARIES}
     fast_gicp
+    dl
   )
 
-  add_executable(gicp_kitti src/kitti.cpp)
-  add_dependencies(gicp_kitti fast_gicp)
-  target_link_libraries(gicp_kitti
-    ${PCL_LIBRARIES}
-    fast_gicp
-  )
+  # add_executable(gicp_kitti src/kitti.cpp)
+  # add_dependencies(gicp_kitti fast_gicp)
+  # set_property(TARGET gicp_kitti PROPERTY CUDA_STANDARD 17)
+  # target_link_libraries(gicp_kitti
+  #   ${PCL_LIBRARIES}
+  #   fast_gicp
+  # )
 endif()
 
 ### Python bindings ###
@@ -102,6 +125,7 @@ if(BUILD_PYTHON_BINDINGS)
   )
   target_link_libraries(pygicp PRIVATE
     fast_gicp
+    dl
   )
 endif()
 
@@ -110,7 +134,7 @@ if(BUILD_VGICP_CUDA)
   set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
   add_definitions(-DUSE_VGICP_CUDA)
 
-  cuda_add_library(fast_vgicp_cuda SHARED
+  add_library(fast_vgicp_cuda SHARED
     src/fast_gicp/cuda/fast_vgicp_cuda.cu
     src/fast_gicp/cuda/brute_force_knn.cu
     src/fast_gicp/cuda/covariance_estimation.cu
@@ -123,6 +147,7 @@ if(BUILD_VGICP_CUDA)
     src/fast_gicp/cuda/ndt_cuda.cu
     src/fast_gicp/cuda/ndt_compute_derivatives.cu
   )
+  set_property(TARGET fast_vgicp_cuda PROPERTY CUDA_STANDARD 17)
   target_include_directories(fast_vgicp_cuda PRIVATE
     include
     thirdparty/Eigen
@@ -131,8 +156,9 @@ if(BUILD_VGICP_CUDA)
   )
   target_link_libraries(fast_vgicp_cuda
     ${catkin_LIBRARIES}
+    dl
   )
-  cuda_add_cublas_to_target(fast_vgicp_cuda)
+  # cuda_add_cublas_to_target(fast_vgicp_cuda)
 
   # add vgicp_cuda to libfast_gicp
   target_sources(fast_gicp PRIVATE
@@ -141,6 +167,7 @@ if(BUILD_VGICP_CUDA)
   )
   target_link_libraries(fast_gicp
     fast_vgicp_cuda
+    dl
   )
   add_dependencies(fast_gicp fast_vgicp_cuda)
   if(catkin_FOUND)

diff --git a/include/fast_gicp/ndt/ndt_cuda.hpp b/include/fast_gicp/ndt/ndt_cuda.hpp
@@ -71,4 +71,4 @@ class NDTCuda : public LsqRegistration<PointSource, PointTarget> {
 };
 }  // namespace fast_gicp
 
-#endif
+#endif
diff --git a/src/align.cpp b/src/align.cpp
@@ -37,14 +37,14 @@ void test_pcl(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr&
 
   // 100 times
   t1 = std::chrono::high_resolution_clock::now();
-  for (int i = 0; i < 100; i++) {
+  for (int i = 0; i < 10; i++) {
     reg.setInputTarget(target);
     reg.setInputSource(source);
     reg.align(*aligned);
   }
   t2 = std::chrono::high_resolution_clock::now();
   double multi = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() / 1e6;
-  std::cout << "100times:" << multi << "[msec] fitness_score:" << fitness_score << std::endl;
+  std::cout << "Avg over 10 runs:" << multi / 10.0 << "[msec] fitness_score:" << fitness_score << std::endl;
 }
 
 // benchmark for fast_gicp registration methods
@@ -71,7 +71,7 @@ void test(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr& tar
 
   // 100 times
   t1 = std::chrono::high_resolution_clock::now();
-  for (int i = 0; i < 100; i++) {
+  for (int i = 0; i < 10; i++) {
     reg.clearTarget();
     reg.clearSource();
     reg.setInputTarget(target);
@@ -80,14 +80,14 @@ void test(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr& tar
   }
   t2 = std::chrono::high_resolution_clock::now();
   double multi = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() / 1e6;
-  std::cout << "100times:" << multi << "[msec] " << std::flush;
+  std::cout << "Avg over 10 runs:" << multi / 10.0 << "[msec] " << std::flush;
 
   // for some tasks like odometry calculation,
   // you can reuse the covariances of a source point cloud in the next registration
   t1 = std::chrono::high_resolution_clock::now();
   pcl::PointCloud<pcl::PointXYZ>::ConstPtr target_ = target;
   pcl::PointCloud<pcl::PointXYZ>::ConstPtr source_ = source;
-  for (int i = 0; i < 100; i++) {
+  for (int i = 0; i < 10; i++) {
     reg.swapSourceAndTarget();
     reg.clearSource();
 
@@ -100,7 +100,7 @@ void test(Registration& reg, const pcl::PointCloud<pcl::PointXYZ>::ConstPtr& tar
   t2 = std::chrono::high_resolution_clock::now();
   double reuse = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() / 1e6;
 
-  std::cout << "100times_reuse:" << reuse << "[msec] fitness_score:" << fitness_score << std::endl;
+  std::cout << "Avg over 10times_reuse:" << reuse / 10.0 << "[msec] fitness_score:" << fitness_score << std::endl;
 }
 
 /**
@@ -157,25 +157,27 @@ int main(int argc, char** argv) {
   pcl_ndt.setResolution(1.0);
   test_pcl(pcl_ndt, target_cloud, source_cloud);
 
-  std::cout << "--- fgicp_st ---" << std::endl;
-  fast_gicp::FastGICPSingleThread<pcl::PointXYZ, pcl::PointXYZ> fgicp_st;
-  test(fgicp_st, target_cloud, source_cloud);
+  /*
+    std::cout << "--- fgicp_st ---" << std::endl;
+    fast_gicp::FastGICPSingleThread<pcl::PointXYZ, pcl::PointXYZ> fgicp_st;
+    test(fgicp_st, target_cloud, source_cloud);
 
-  std::cout << "--- fgicp_mt ---" << std::endl;
-  fast_gicp::FastGICP<pcl::PointXYZ, pcl::PointXYZ> fgicp_mt;
-  // fast_gicp uses all the CPU cores by default
-  // fgicp_mt.setNumThreads(8);
-  test(fgicp_mt, target_cloud, source_cloud);
+    std::cout << "--- fgicp_mt ---" << std::endl;
+    fast_gicp::FastGICP<pcl::PointXYZ, pcl::PointXYZ> fgicp_mt;
+    // fast_gicp uses all the CPU cores by default
+    // fgicp_mt.setNumThreads(8);
+    test(fgicp_mt, target_cloud, source_cloud);
 
-  std::cout << "--- vgicp_st ---" << std::endl;
-  fast_gicp::FastVGICP<pcl::PointXYZ, pcl::PointXYZ> vgicp;
-  vgicp.setResolution(1.0);
-  vgicp.setNumThreads(1);
-  test(vgicp, target_cloud, source_cloud);
+    std::cout << "--- vgicp_st ---" << std::endl;
+    fast_gicp::FastVGICP<pcl::PointXYZ, pcl::PointXYZ> vgicp;
+    vgicp.setResolution(1.0);
+    vgicp.setNumThreads(1);
+    test(vgicp, target_cloud, source_cloud);
 
-  std::cout << "--- vgicp_mt ---" << std::endl;
-  vgicp.setNumThreads(omp_get_max_threads());
-  test(vgicp, target_cloud, source_cloud);
+    std::cout << "--- vgicp_mt ---" << std::endl;
+    vgicp.setNumThreads(omp_get_max_threads());
+    test(vgicp, target_cloud, source_cloud);
+  */
 
 #ifdef USE_VGICP_CUDA
   std::cout << "--- ndt_cuda (P2D) ---" << std::endl;
@@ -188,13 +190,15 @@ int main(int argc, char** argv) {
   ndt_cuda.setDistanceMode(fast_gicp::NDTDistanceMode::D2D);
   test(ndt_cuda, target_cloud, source_cloud);
 
-  std::cout << "--- vgicp_cuda (parallel_kdtree) ---" << std::endl;
   fast_gicp::FastVGICPCuda<pcl::PointXYZ, pcl::PointXYZ> vgicp_cuda;
   vgicp_cuda.setResolution(1.0);
+  /*
   // vgicp_cuda uses CPU-based parallel KDTree in covariance estimation by default
   // on a modern CPU, it is faster than GPU_BRUTEFORCE
-  // vgicp_cuda.setNearestNeighborSearchMethod(fast_gicp::NearestNeighborMethod::CPU_PARALLEL_KDTREE);
+  std::cout << "--- vgicp_cuda (parallel_kdtree) ---" << std::endl;
+  vgicp_cuda.setNearestNeighborSearchMethod(fast_gicp::NearestNeighborMethod::CPU_PARALLEL_KDTREE);
   test(vgicp_cuda, target_cloud, source_cloud);
+  */
 
   std::cout << "--- vgicp_cuda (gpu_bruteforce) ---" << std::endl;
   // use GPU-based bruteforce nearest neighbor search for covariance estimation