From 0e51ef9f17bf911883f5c8f07821207654983084 Mon Sep 17 00:00:00 2001
From: "Micah Chambers (eos)" <mchambers@anduril.com>
Date: Thu, 7 Aug 2025 00:18:36 -0700
Subject: [PATCH 1/7] add pci stuff

---
 src/ffi/device.rs | 165 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 165 insertions(+)
diff --git a/src/ffi/device.rs b/src/ffi/device.rs
index 4044b6b..3074dee 100644
--- a/src/ffi/device.rs
+++ b/src/ffi/device.rs
@@ -1,3 +1,6 @@
+use std::ffi::c_char;
+use std::ffi::c_int;
+
 use cpp::cpp;
 
 use crate::device::DeviceId;
@@ -6,6 +9,70 @@ use crate::ffi::result;
 
 type Result<T> = std::result::Result<T, crate::error::Error>;
 
+#[derive(Clone, Debug)]
+#[repr(C)]
+pub struct CudaDeviceProperties {
+    pub can_map_host_memory: i32,
+    pub clock_rate: i32,
+    pub device_overlap: i32,
+    pub kernel_exec_timeout_enabled: i32,
+    pub major: i32,
+    pub max_grid_size: [i32; 3],
+    pub max_threads_dim: [i32; 3],
+    pub max_threads_per_block: i32,
+    pub mem_pitch: usize,
+    pub memory_bus_width: i32,
+    pub memory_clock_rate: i32,
+    pub minor: i32,
+    pub multi_processor_count: i32,
+    pub name: [c_char; 256],
+    pub regs_per_block: i32,
+    pub shared_mem_per_block: usize,
+    pub texture_alignment: usize,
+    pub total_const_mem: usize,
+    pub total_global_mem: usize,
+    pub warp_size: i32,
+    pub pci_bus_id: i32,
+    pub pci_device_id: i32,
+    pub pci_domain_id: i32,
+}
+
+impl Default for CudaDeviceProperties {
+    fn default() -> Self {
+        Self {
+            clock_rate: Default::default(),
+            device_overlap: Default::default(),
+            kernel_exec_timeout_enabled: Default::default(),
+            major: Default::default(),
+            max_grid_size: Default::default(),
+            can_map_host_memory: Default::default(),
+            max_threads_dim: Default::default(),
+            max_threads_per_block: Default::default(),
+            mem_pitch: Default::default(),
+            memory_bus_width: Default::default(),
+            memory_clock_rate: Default::default(),
+            minor: Default::default(),
+            multi_processor_count: Default::default(),
+            name: [0; 256],
+            regs_per_block: Default::default(),
+            shared_mem_per_block: Default::default(),
+            texture_alignment: Default::default(),
+            total_const_mem: Default::default(),
+            total_global_mem: Default::default(),
+            warp_size: Default::default(),
+            pci_bus_id: 0,
+            pci_device_id: 0,
+            pci_domain_id: 0,
+        }
+    }
+}
+
+#[derive(Clone, Debug, Default)]
+#[repr(C)]
+pub struct CudaUUID {
+    pub bytes: [u8; 16],
+}
+
 /// Synchronous implementation of [`crate::num_devices`].
 ///
 /// Refer to [`crate::num_devices`] for documentation.
@@ -54,6 +121,98 @@ impl Device {
         result!(ret)
     }
 
+    #[inline]
+    pub fn get_properties(id: DeviceId) -> Result<CudaDeviceProperties> {
+        let mut props = CudaDeviceProperties::default();
+        cpp! {{
+            #include <cuda_runtime_api.h>
+            #include <cstring>
+        }}
+        let can_map_host_memory_ptr = std::ptr::addr_of_mut!(props.can_map_host_memory);
+        let clock_rate_ptr = std::ptr::addr_of_mut!(props.clock_rate);
+        let device_overlap_ptr = std::ptr::addr_of_mut!(props.device_overlap);
+        let kernel_exec_timeout_enabled_ptr =
+            std::ptr::addr_of_mut!(props.kernel_exec_timeout_enabled);
+        let major_ptr = std::ptr::addr_of_mut!(props.major);
+        let max_grid_size_ptr = std::ptr::addr_of_mut!(props.max_grid_size);
+        let max_threads_dim_ptr = std::ptr::addr_of_mut!(props.max_threads_dim);
+        let max_threads_per_block_ptr = std::ptr::addr_of_mut!(props.max_threads_per_block);
+        let mem_pitch_ptr = std::ptr::addr_of_mut!(props.mem_pitch);
+        let memory_bus_width_ptr = std::ptr::addr_of_mut!(props.memory_bus_width);
+        let memory_clock_rate_ptr = std::ptr::addr_of_mut!(props.memory_clock_rate);
+        let minor_ptr = std::ptr::addr_of_mut!(props.minor);
+        let multi_processor_count_ptr = std::ptr::addr_of_mut!(props.multi_processor_count);
+        let name_ptr = std::ptr::addr_of_mut!(props.name);
+        let regs_per_block_ptr = std::ptr::addr_of_mut!(props.regs_per_block);
+        let shared_mem_per_block_ptr = std::ptr::addr_of_mut!(props.shared_mem_per_block);
+        let texture_alignment_ptr = std::ptr::addr_of_mut!(props.texture_alignment);
+        let total_const_mem_ptr = std::ptr::addr_of_mut!(props.total_const_mem);
+        let total_global_mem_ptr = std::ptr::addr_of_mut!(props.total_global_mem);
+        let warp_size_ptr = std::ptr::addr_of_mut!(props.warp_size);
+        let pci_bus_id_ptr = std::ptr::addr_of_mut!(props.pci_bus_id);
+        let pci_device_id_ptr = std::ptr::addr_of_mut!(props.pci_device_id);
+        let pci_domain_id_ptr = std::ptr::addr_of_mut!(props.pci_domain_id);
+
+        let ret = cpp!(unsafe [
+            id as "int",
+            can_map_host_memory_ptr as "int*",
+            clock_rate_ptr as "int*",
+            device_overlap_ptr as "int*",
+            kernel_exec_timeout_enabled_ptr as "int*",
+            major_ptr as "int*",
+            max_grid_size_ptr as "int*",
+            max_threads_dim_ptr as "int*",
+            max_threads_per_block_ptr as "int*",
+            mem_pitch_ptr as "unsigned long long*",
+            memory_bus_width_ptr as "int*",
+            memory_clock_rate_ptr as "int*",
+            minor_ptr as "int*",
+            multi_processor_count_ptr as "int*",
+            name_ptr as "char*",
+            regs_per_block_ptr as "int*",
+            shared_mem_per_block_ptr as "unsigned long long*",
+            texture_alignment_ptr as "unsigned long long*",
+            total_const_mem_ptr as "unsigned long long*",
+            total_global_mem_ptr as "unsigned long long*",
+            warp_size_ptr as "int*",
+            pci_bus_id_ptr as "int*",
+            pci_domain_id_ptr as "int*",
+            pci_device_id_ptr as "int*"
+        ] -> i32 as "int" {
+            cudaDeviceProp props;
+            const auto ret = cudaGetDeviceProperties(&props, id);
+            *can_map_host_memory_ptr = props.canMapHostMemory;
+            *clock_rate_ptr = props.clockRate;
+            *device_overlap_ptr = props.deviceOverlap;
+            *kernel_exec_timeout_enabled_ptr = props.kernelExecTimeoutEnabled;
+            *major_ptr = props.major;
+
+            for(int i = 0; i < 3; ++i) {
+                max_grid_size_ptr[i] = props.maxGridSize[i];
+                max_threads_dim_ptr[i] = props.maxThreadsDim[i];
+            }
+            *max_threads_per_block_ptr = props.maxThreadsPerBlock;
+            *mem_pitch_ptr = props.memPitch;
+            *memory_bus_width_ptr = props.memoryBusWidth;
+            *memory_clock_rate_ptr = props.memoryClockRate;
+            *minor_ptr = props.minor;
+            *multi_processor_count_ptr = props.multiProcessorCount;
+            *regs_per_block_ptr = props.regsPerBlock;
+            *shared_mem_per_block_ptr = props.sharedMemPerBlock;
+            *texture_alignment_ptr = props.textureAlignment;
+            *total_const_mem_ptr = props.totalConstMem;
+            *total_global_mem_ptr = props.totalGlobalMem;
+            *warp_size_ptr = props.warpSize;
+            *pci_bus_id_ptr = props.pciBusID;
+            *pci_device_id_ptr = props.pciDeviceID;
+            *pci_domain_id_ptr = props.pciDomainID;
+            memcpy(name_ptr, props.name, 256);
+            return ret;
+        });
+
+        result!(ret, props)
+    }
+
     #[inline(always)]
     pub fn set_or_panic(id: DeviceId) {
         Device::set(id).unwrap_or_else(|err| panic!("failed to set device {id}: {err}"));
@@ -107,6 +266,12 @@ mod tests {
         assert!(Device::synchronize().is_ok());
     }
 
+    #[test]
+    fn test_device_properties() {
+        assert!(Device::get_properties(0).is_ok());
+        eprintln!("{:?}", Device::get_properties(0));
+    }
+
     #[test]
     fn test_memory_info() {
         let memory_info = Device::memory_info().unwrap();

From 6c1010c298b738996fd6d31df21fbe790bdc3c92 Mon Sep 17 00:00:00 2001
From: "Micah Chambers (eos)" <mchambers@anduril.com>
Date: Thu, 7 Aug 2025 00:35:43 -0700
Subject: [PATCH 2/7] this is better

---
 src/ffi/device.rs | 499 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 378 insertions(+), 121 deletions(-)

diff --git a/src/ffi/device.rs b/src/ffi/device.rs
index 3074dee..5672c99 100644
--- a/src/ffi/device.rs
+++ b/src/ffi/device.rs
@@ -11,68 +11,390 @@ type Result<T> = std::result::Result<T, crate::error::Error>;
 
 #[derive(Clone, Debug)]
 #[repr(C)]
-pub struct CudaDeviceProperties {
-    pub can_map_host_memory: i32,
+pub struct CudaDeviceProp {
+    // ASCII string identifying the device.
+    // original name: name[256]
+    pub name: [i8; 256],
+
+    // 16-byte unique identifier.
+    // original name: uuid
+    pub uuid: [u8; 16],
+
+    // Total amount of global memory available on the device in bytes.
+    // original name: totalGlobalMem
+    pub total_global_mem: usize,
+
+    // Maximum amount of shared memory available to a thread block in bytes.
+    // original name: sharedMemPerBlock
+    pub shared_mem_per_block: usize,
+
+    // Maximum number of 32-bit registers available to a thread block.
+    // original name: regsPerBlock
+    pub regs_per_block: i32,
+
+    // Warp size in threads.
+    // original name: warpSize
+    pub warp_size: i32,
+
+    // Maximum pitch in bytes allowed by the memory copy functions that involve memory regions allocated through cudaMallocPitch().
+    // original name: memPitch
+    pub mem_pitch: usize,
+
+    // Maximum number of threads per block.
+    // original name: maxThreadsPerBlock
+    pub max_threads_per_block: i32,
+
+    // Maximum size of each dimension of a block.
+    // original name: maxThreadsDim[3]
+    pub max_threads_dim: [i32; 3],
+
+    // Maximum size of each dimension of a grid.
+    // original name: maxGridSize[3]
+    pub max_grid_size: [i32; 3],
+
+    // Clock frequency in kilohertz.
+    // original name: clockRate
     pub clock_rate: i32,
-    pub device_overlap: i32,
-    pub kernel_exec_timeout_enabled: i32,
+
+    // Total amount of constant memory available on the device in bytes.
+    // original name: totalConstMem
+    pub total_const_mem: usize,
+
+    // Major revision number defining the device's compute capability.
+    // original name: major
     pub major: i32,
-    pub max_grid_size: [i32; 3],
-    pub max_threads_dim: [i32; 3],
-    pub max_threads_per_block: i32,
-    pub mem_pitch: usize,
-    pub memory_bus_width: i32,
-    pub memory_clock_rate: i32,
+
+    // Minor revision number defining the device's compute capability.
+    // original name: minor
     pub minor: i32,
-    pub multi_processor_count: i32,
-    pub name: [c_char; 256],
-    pub regs_per_block: i32,
-    pub shared_mem_per_block: usize,
+
+    // Alignment requirement; texture base addresses that are aligned to texture_alignment bytes do not need an offset applied to texture fetches.
+    // original name: textureAlignment
     pub texture_alignment: usize,
-    pub total_const_mem: usize,
-    pub total_global_mem: usize,
-    pub warp_size: i32,
+
+    // Pitch alignment requirement for 2D texture references that are bound to pitched memory.
+    // original name: texturePitchAlignment
+    pub texture_pitch_alignment: usize,
+
+    // 1 if the device can concurrently copy memory between host and device while executing a kernel, or 0 if not.
+    // original name: deviceOverlap
+    pub device_overlap: i32,
+
+    // Number of multiprocessors on the device.
+    // original name: multiProcessorCount
+    pub multi_processor_count: i32,
+
+    // 1 if there is a run time limit for kernels executed on the device, or 0 if not.
+    // original name: kernelExecTimeoutEnabled
+    pub kernel_exec_timeout_enabled: i32,
+
+    // 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component.
+    // original name: integrated
+    pub integrated: i32,
+
+    // 1 if the device can map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer(), or 0 if not.
+    // original name: canMapHostMemory
+    pub can_map_host_memory: i32,
+
+    // Compute mode that the device is currently in.
+    // original name: computeMode
+    pub compute_mode: i32,
+
+    // Maximum 1D texture size.
+    // original name: maxTexture1D
+    pub max_texture1d: i32,
+
+    // Maximum 1D mipmapped texture size.
+    // original name: maxTexture1DMipmap
+    pub max_texture1d_mipmap: i32,
+
+    // Maximum 1D texture size for textures bound to linear memory.
+    // original name: maxTexture1DLinear
+    pub max_texture1d_linear: i32,
+
+    // Maximum 2D texture dimensions.
+    // original name: maxTexture2D[2]
+    pub max_texture2d: [i32; 2],
+
+    // Maximum 2D mipmapped texture dimensions.
+    // original name: maxTexture2DMipmap[2]
+    pub max_texture2d_mipmap: [i32; 2],
+
+    // Maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
+    // original name: maxTexture2DLinear[3]
+    pub max_texture2d_linear: [i32; 3],
+
+    // Maximum 2D texture dimensions if texture gather operations have to be performed.
+    // original name: maxTexture2DGather[2]
+    pub max_texture2d_gather: [i32; 2],
+
+    // Maximum 3D texture dimensions.
+    // original name: maxTexture3D[3]
+    pub max_texture3d: [i32; 3],
+
+    // Maximum alternate 3D texture dimensions.
+    // original name: maxTexture3DAlt[3]
+    pub max_texture3d_alt: [i32; 3],
+
+    // Maximum cubemap texture width or height.
+    // original name: maxTextureCubemap
+    pub max_texture_cubemap: i32,
+
+    // Maximum 1D layered texture dimensions.
+    // original name: maxTexture1DLayered[2]
+    pub max_texture1d_layered: [i32; 2],
+
+    // Maximum 2D layered texture dimensions.
+    // original name: maxTexture2DLayered[3]
+    pub max_texture2d_layered: [i32; 3],
+
+    // Maximum cubemap layered texture dimensions.
+    // original name: maxTextureCubemapLayered[2]
+    pub max_texture_cubemap_layered: [i32; 2],
+
+    // Maximum 1D surface size.
+    // original name: maxSurface1D
+    pub max_surface1d: i32,
+
+    // Maximum 2D surface dimensions.
+    // original name: maxSurface2D[2]
+    pub max_surface2d: [i32; 2],
+
+    // Maximum 3D surface dimensions.
+    // original name: maxSurface3D[3]
+    pub max_surface3d: [i32; 3],
+
+    // Maximum 1D layered surface dimensions.
+    // original name: maxSurface1DLayered[2]
+    pub max_surface1d_layered: [i32; 2],
+
+    // Maximum 2D layered surface dimensions.
+    // original name: maxSurface2DLayered[3]
+    pub max_surface2d_layered: [i32; 3],
+
+    // Maximum cubemap surface width or height.
+    // original name: maxSurfaceCubemap
+    pub max_surface_cubemap: i32,
+
+    // Maximum cubemap layered surface dimensions.
+    // original name: maxSurfaceCubemapLayered[2]
+    pub max_surface_cubemap_layered: [i32; 2],
+
+    // Alignment requirements for surfaces.
+    // original name: surfaceAlignment
+    pub surface_alignment: usize,
+
+    // 1 if the device supports executing multiple kernels within the same context simultaneously, or 0 if not.
+    // original name: concurrentKernels
+    pub concurrent_kernels: i32,
+
+    // 1 if the device has ECC support turned on, or 0 if not.
+    // original name: ECCEnabled
+    pub ecc_enabled: i32,
+
+    // PCI bus identifier of the device.
+    // original name: pciBusID
     pub pci_bus_id: i32,
+
+    // PCI device (sometimes called slot) identifier of the device.
+    // original name: pciDeviceID
     pub pci_device_id: i32,
+
+    // PCI domain identifier of the device.
+    // original name: pciDomainID
     pub pci_domain_id: i32,
+
+    // 1 if the device is using a TCC driver or 0 if not.
+    // original name: tccDriver
+    pub tcc_driver: i32,
+
+    // 1 when the device can concurrently copy memory between host and device while executing a kernel.
+    // original name: asyncEngineCount
+    pub async_engine_count: i32,
+
+    // 1 if the device shares a unified address space with the host and 0 otherwise.
+    // original name: unifiedAddressing
+    pub unified_addressing: i32,
+
+    // Peak memory clock frequency in kilohertz.
+    // original name: memoryClockRate
+    pub memory_clock_rate: i32,
+
+    // Memory bus width in bits.
+    // original name: memoryBusWidth
+    pub memory_bus_width: i32,
+
+    // L2 cache size in bytes.
+    // original name: l2CacheSize
+    pub l2_cache_size: i32,
+
+    // L2 cache's maximum persisting lines size in bytes.
+    // original name: persistingL2CacheMaxSize
+    pub persisting_l2_cache_max_size: i32,
+
+    // Number of maximum resident threads per multiprocessor.
+    // original name: maxThreadsPerMultiProcessor
+    pub max_threads_per_multi_processor: i32,
+
+    // 1 if the device supports stream priorities, or 0 if it is not supported.
+    // original name: streamPrioritiesSupported
+    pub stream_priorities_supported: i32,
+
+    // 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
+    // original name: globalL1CacheSupported
+    pub global_l1_cache_supported: i32,
+
+    // 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
+    // original name: localL1CacheSupported
+    pub local_l1_cache_supported: i32,
+
+    // Maximum amount of shared memory available to a multiprocessor in bytes; this amount is shared by all thread blocks simultaneously resident on a multiprocessor.
+    // original name: sharedMemPerMultiprocessor
+    pub shared_mem_per_multiprocessor: usize,
+
+    // Maximum number of 32-bit registers available to a multiprocessor; this number is shared by all thread blocks simultaneously resident on a multiprocessor.
+    // original name: regsPerMultiprocessor
+    pub regs_per_multiprocessor: i32,
+
+    // 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
+    // original name: managedMemory
+    pub managed_memory: i32,
+
+    // 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not.
+    // original name: isMultiGpuBoard
+    pub is_multi_gpu_board: i32,
+
+    // Unique identifier for a group of devices associated with the same board. Devices on the same multi-GPU board will share the same identifier.
+    // original name: multiGpuBoardGroupID
+    pub multi_gpu_board_group_id: i32,
+
+    // Ratio of single precision performance (in floating-point operations per second) to double precision performance.
+    // original name: singleToDoublePrecisionPerfRatio
+    pub single_to_double_precision_perf_ratio: i32,
+
+    // 1 if the device supports coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise.
+    // original name: pageableMemoryAccess
+    pub pageable_memory_access: i32,
+
+    // 1 if the device can coherently access managed memory concurrently with the CPU, and 0 otherwise.
+    // original name: concurrentManagedAccess
+    pub concurrent_managed_access: i32,
+
+    // 1 if the device supports Compute Preemption, and 0 otherwise.
+    // original name: computePreemptionSupported
+    pub compute_preemption_supported: i32,
+
+    // 1 if the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise.
+    // original name: canUseHostPointerForRegisteredMem
+    pub can_use_host_pointer_for_registered_mem: i32,
+
+    // 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernel, and 0 otherwise.
+    // original name: cooperativeLaunch
+    pub cooperative_launch: i32,
+
+    // 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
+    // original name: cooperativeMultiDeviceLaunch
+    pub cooperative_multi_device_launch: i32,
+
+    // 1 if the device accesses pageable memory via the host's page tables, and 0 otherwise.
+    // original name: pageableMemoryAccessUsesHostPageTables
+    pub pageable_memory_access_uses_host_page_tables: i32,
+
+    // 1 if the host can directly access managed memory on the device without migration, and 0 otherwise.
+    // original name: directManagedMemAccessFromHost
+    pub direct_managed_mem_access_from_host: i32,
+
+    // Maximum number of thread blocks that can reside on a multiprocessor.
+    // original name: maxBlocksPerMultiProcessor
+    pub max_blocks_per_multi_processor: i32,
+
+    // Maximum value of cudaAccessPolicyWindow::num_bytes.
+    // original name: accessPolicyMaxWindowSize
+    pub access_policy_max_window_size: i32,
 }
 
-impl Default for CudaDeviceProperties {
+impl Default for CudaDeviceProp {
     fn default() -> Self {
-        Self {
-            clock_rate: Default::default(),
-            device_overlap: Default::default(),
-            kernel_exec_timeout_enabled: Default::default(),
-            major: Default::default(),
-            max_grid_size: Default::default(),
-            can_map_host_memory: Default::default(),
-            max_threads_dim: Default::default(),
-            max_threads_per_block: Default::default(),
-            mem_pitch: Default::default(),
-            memory_bus_width: Default::default(),
-            memory_clock_rate: Default::default(),
-            minor: Default::default(),
-            multi_processor_count: Default::default(),
+        CudaDeviceProp {
             name: [0; 256],
-            regs_per_block: Default::default(),
-            shared_mem_per_block: Default::default(),
-            texture_alignment: Default::default(),
-            total_const_mem: Default::default(),
-            total_global_mem: Default::default(),
-            warp_size: Default::default(),
+            uuid: [0; 16],
+            total_global_mem: 0,
+            shared_mem_per_block: 0,
+            regs_per_block: 0,
+            warp_size: 0,
+            mem_pitch: 0,
+            max_threads_per_block: 0,
+            max_threads_dim: [0; 3],
+            max_grid_size: [0; 3],
+            clock_rate: 0,
+            total_const_mem: 0,
+            major: 0,
+            minor: 0,
+            texture_alignment: 0,
+            texture_pitch_alignment: 0,
+            device_overlap: 0,
+            multi_processor_count: 0,
+            kernel_exec_timeout_enabled: 0,
+            integrated: 0,
+            can_map_host_memory: 0,
+            compute_mode: 0,
+            max_texture1d: 0,
+            max_texture1d_mipmap: 0,
+            max_texture1d_linear: 0,
+            max_texture2d: [0; 2],
+            max_texture2d_mipmap: [0; 2],
+            max_texture2d_linear: [0; 3],
+            max_texture2d_gather: [0; 2],
+            max_texture3d: [0; 3],
+            max_texture3d_alt: [0; 3],
+            max_texture_cubemap: 0,
+            max_texture1d_layered: [0; 2],
+            max_texture2d_layered: [0; 3],
+            max_texture_cubemap_layered: [0; 2],
+            max_surface1d: 0,
+            max_surface2d: [0; 2],
+            max_surface3d: [0; 3],
+            max_surface1d_layered: [0; 2],
+            max_surface2d_layered: [0; 3],
+            max_surface_cubemap: 0,
+            max_surface_cubemap_layered: [0; 2],
+            surface_alignment: 0,
+            concurrent_kernels: 0,
+            ecc_enabled: 0,
             pci_bus_id: 0,
             pci_device_id: 0,
             pci_domain_id: 0,
+            tcc_driver: 0,
+            async_engine_count: 0,
+            unified_addressing: 0,
+            memory_clock_rate: 0,
+            memory_bus_width: 0,
+            l2_cache_size: 0,
+            persisting_l2_cache_max_size: 0,
+            max_threads_per_multi_processor: 0,
+            stream_priorities_supported: 0,
+            global_l1_cache_supported: 0,
+            local_l1_cache_supported: 0,
+            shared_mem_per_multiprocessor: 0,
+            regs_per_multiprocessor: 0,
+            managed_memory: 0,
+            is_multi_gpu_board: 0,
+            multi_gpu_board_group_id: 0,
+            single_to_double_precision_perf_ratio: 0,
+            pageable_memory_access: 0,
+            concurrent_managed_access: 0,
+            compute_preemption_supported: 0,
+            can_use_host_pointer_for_registered_mem: 0,
+            cooperative_launch: 0,
+            cooperative_multi_device_launch: 0,
+            pageable_memory_access_uses_host_page_tables: 0,
+            direct_managed_mem_access_from_host: 0,
+            max_blocks_per_multi_processor: 0,
+            access_policy_max_window_size: 0,
         }
     }
 }
 
-#[derive(Clone, Debug, Default)]
-#[repr(C)]
-pub struct CudaUUID {
-    pub bytes: [u8; 16],
-}
-
 /// Synchronous implementation of [`crate::num_devices`].
 ///
 /// Refer to [`crate::num_devices`] for documentation.
@@ -122,91 +444,26 @@ impl Device {
     }
 
     #[inline]
-    pub fn get_properties(id: DeviceId) -> Result<CudaDeviceProperties> {
-        let mut props = CudaDeviceProperties::default();
+    pub fn get_properties(id: DeviceId) -> Result<CudaDeviceProp> {
+        let mut props = CudaDeviceProp::default();
+        let props_len = std::mem::size_of::<CudaDeviceProp>();
         cpp! {{
             #include <cuda_runtime_api.h>
             #include <cstring>
+            #include <algorithm>
         }}
-        let can_map_host_memory_ptr = std::ptr::addr_of_mut!(props.can_map_host_memory);
-        let clock_rate_ptr = std::ptr::addr_of_mut!(props.clock_rate);
-        let device_overlap_ptr = std::ptr::addr_of_mut!(props.device_overlap);
-        let kernel_exec_timeout_enabled_ptr =
-            std::ptr::addr_of_mut!(props.kernel_exec_timeout_enabled);
-        let major_ptr = std::ptr::addr_of_mut!(props.major);
-        let max_grid_size_ptr = std::ptr::addr_of_mut!(props.max_grid_size);
-        let max_threads_dim_ptr = std::ptr::addr_of_mut!(props.max_threads_dim);
-        let max_threads_per_block_ptr = std::ptr::addr_of_mut!(props.max_threads_per_block);
-        let mem_pitch_ptr = std::ptr::addr_of_mut!(props.mem_pitch);
-        let memory_bus_width_ptr = std::ptr::addr_of_mut!(props.memory_bus_width);
-        let memory_clock_rate_ptr = std::ptr::addr_of_mut!(props.memory_clock_rate);
-        let minor_ptr = std::ptr::addr_of_mut!(props.minor);
-        let multi_processor_count_ptr = std::ptr::addr_of_mut!(props.multi_processor_count);
-        let name_ptr = std::ptr::addr_of_mut!(props.name);
-        let regs_per_block_ptr = std::ptr::addr_of_mut!(props.regs_per_block);
-        let shared_mem_per_block_ptr = std::ptr::addr_of_mut!(props.shared_mem_per_block);
-        let texture_alignment_ptr = std::ptr::addr_of_mut!(props.texture_alignment);
-        let total_const_mem_ptr = std::ptr::addr_of_mut!(props.total_const_mem);
-        let total_global_mem_ptr = std::ptr::addr_of_mut!(props.total_global_mem);
-        let warp_size_ptr = std::ptr::addr_of_mut!(props.warp_size);
-        let pci_bus_id_ptr = std::ptr::addr_of_mut!(props.pci_bus_id);
-        let pci_device_id_ptr = std::ptr::addr_of_mut!(props.pci_device_id);
-        let pci_domain_id_ptr = std::ptr::addr_of_mut!(props.pci_domain_id);
+        let props_ptr = std::ptr::addr_of_mut!(props);
 
         let ret = cpp!(unsafe [
             id as "int",
-            can_map_host_memory_ptr as "int*",
-            clock_rate_ptr as "int*",
-            device_overlap_ptr as "int*",
-            kernel_exec_timeout_enabled_ptr as "int*",
-            major_ptr as "int*",
-            max_grid_size_ptr as "int*",
-            max_threads_dim_ptr as "int*",
-            max_threads_per_block_ptr as "int*",
-            mem_pitch_ptr as "unsigned long long*",
-            memory_bus_width_ptr as "int*",
-            memory_clock_rate_ptr as "int*",
-            minor_ptr as "int*",
-            multi_processor_count_ptr as "int*",
-            name_ptr as "char*",
-            regs_per_block_ptr as "int*",
-            shared_mem_per_block_ptr as "unsigned long long*",
-            texture_alignment_ptr as "unsigned long long*",
-            total_const_mem_ptr as "unsigned long long*",
-            total_global_mem_ptr as "unsigned long long*",
-            warp_size_ptr as "int*",
-            pci_bus_id_ptr as "int*",
-            pci_domain_id_ptr as "int*",
-            pci_device_id_ptr as "int*"
+            props_len as "uint64_t",
+            props_ptr as "cudaDeviceProp*"
         ] -> i32 as "int" {
-            cudaDeviceProp props;
-            const auto ret = cudaGetDeviceProperties(&props, id);
-            *can_map_host_memory_ptr = props.canMapHostMemory;
-            *clock_rate_ptr = props.clockRate;
-            *device_overlap_ptr = props.deviceOverlap;
-            *kernel_exec_timeout_enabled_ptr = props.kernelExecTimeoutEnabled;
-            *major_ptr = props.major;
-
-            for(int i = 0; i < 3; ++i) {
-                max_grid_size_ptr[i] = props.maxGridSize[i];
-                max_threads_dim_ptr[i] = props.maxThreadsDim[i];
-            }
-            *max_threads_per_block_ptr = props.maxThreadsPerBlock;
-            *mem_pitch_ptr = props.memPitch;
-            *memory_bus_width_ptr = props.memoryBusWidth;
-            *memory_clock_rate_ptr = props.memoryClockRate;
-            *minor_ptr = props.minor;
-            *multi_processor_count_ptr = props.multiProcessorCount;
-            *regs_per_block_ptr = props.regsPerBlock;
-            *shared_mem_per_block_ptr = props.sharedMemPerBlock;
-            *texture_alignment_ptr = props.textureAlignment;
-            *total_const_mem_ptr = props.totalConstMem;
-            *total_global_mem_ptr = props.totalGlobalMem;
-            *warp_size_ptr = props.warpSize;
-            *pci_bus_id_ptr = props.pciBusID;
-            *pci_device_id_ptr = props.pciDeviceID;
-            *pci_domain_id_ptr = props.pciDomainID;
-            memcpy(name_ptr, props.name, 256);
+            cudaDeviceProp prop;
+            const auto ret = cudaGetDeviceProperties(&prop, id);
+
+            // Don't copy past the end, just in case cudaDeviceProp grows
+            memcpy(props_ptr, &prop, std::min(size_t(props_len), sizeof(prop)));
             return ret;
         });
 

From 9478726aa7268c58bea80a80cdc88a624a19f418 Mon Sep 17 00:00:00 2001
From: "Micah Chambers (minerva)" <mchambers@anduril.com>
Date: Tue, 26 Aug 2025 09:25:53 -0700
Subject: [PATCH 3/7] make name u8

---
 src/ffi/device.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ffi/device.rs b/src/ffi/device.rs
index 5672c99..0a3650d 100644
--- a/src/ffi/device.rs
+++ b/src/ffi/device.rs
@@ -14,7 +14,7 @@ type Result<T> = std::result::Result<T, crate::error::Error>;
 pub struct CudaDeviceProp {
     // ASCII string identifying the device.
     // original name: name[256]
-    pub name: [i8; 256],
+    pub name: [u8; 256],
 
     // 16-byte unique identifier.
     // original name: uuid

From e245e6f9175707078e1264490bc72df6c02523df Mon Sep 17 00:00:00 2001
From: "Micah Chambers (minerva)" <mchambers@anduril.com>
Date: Tue, 26 Aug 2025 09:33:32 -0700
Subject: [PATCH 4/7] Revert "make name u8"

This reverts commit 9478726aa7268c58bea80a80cdc88a624a19f418.
---
 src/ffi/device.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ffi/device.rs b/src/ffi/device.rs
index 0a3650d..5672c99 100644
--- a/src/ffi/device.rs
+++ b/src/ffi/device.rs
@@ -14,7 +14,7 @@ type Result<T> = std::result::Result<T, crate::error::Error>;
 pub struct CudaDeviceProp {
     // ASCII string identifying the device.
     // original name: name[256]
-    pub name: [u8; 256],
+    pub name: [i8; 256],
 
     // 16-byte unique identifier.
     // original name: uuid

From 30c4932079595857b44a50eeeca67e3d6706d70f Mon Sep 17 00:00:00 2001
From: "Micah Chambers (minerva)" <mchambers@anduril.com>
Date: Tue, 26 Aug 2025 09:34:28 -0700
Subject: [PATCH 5/7] Reapply "make name u8"

This reverts commit e245e6f9175707078e1264490bc72df6c02523df.
---
 src/ffi/device.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ffi/device.rs b/src/ffi/device.rs
index 5672c99..0a3650d 100644
--- a/src/ffi/device.rs
+++ b/src/ffi/device.rs
@@ -14,7 +14,7 @@ type Result<T> = std::result::Result<T, crate::error::Error>;
 pub struct CudaDeviceProp {
     // ASCII string identifying the device.
     // original name: name[256]
-    pub name: [i8; 256],
+    pub name: [u8; 256],
 
     // 16-byte unique identifier.
     // original name: uuid

From 402e79bf47e6c5c49935312d1b783ee50c2028f4 Mon Sep 17 00:00:00 2001
From: "Micah Chambers (minerva)" <mchambers@anduril.com>
Date: Tue, 26 Aug 2025 09:48:38 -0700
Subject: [PATCH 6/7] add missing

---
 src/ffi/device.rs | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/ffi/device.rs b/src/ffi/device.rs
index 0a3650d..7632fbc 100644
--- a/src/ffi/device.rs
+++ b/src/ffi/device.rs
@@ -13,13 +13,18 @@ type Result<T> = std::result::Result<T, crate::error::Error>;
 #[repr(C)]
 pub struct CudaDeviceProp {
     // ASCII string identifying the device.
-    // original name: name[256]
     pub name: [u8; 256],
 
     // 16-byte unique identifier.
-    // original name: uuid
     pub uuid: [u8; 16],
 
+    // 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms
+    pub luid: [u8; 8],
+
+    //  LUID device node mask. Value is undefined on TCC and non-Windows platforms
+    //  original name: luidDeviceNodeMask
+    pub luid_device_node_mask: u32,
+
     // Total amount of global memory available on the device in bytes.
     // original name: totalGlobalMem
     pub total_global_mem: usize,
@@ -268,6 +273,10 @@ pub struct CudaDeviceProp {
     // original name: multiGpuBoardGroupID
     pub multi_gpu_board_group_id: i32,
 
+    // Link between the device and the host supports native atomic operations
+    // original name: hostNativeAtomicSupported
+    host_native_atomic_supported: i32,
+
     // Ratio of single precision performance (in floating-point operations per second) to double precision performance.
     // original name: singleToDoublePrecisionPerfRatio
     pub single_to_double_precision_perf_ratio: i32,
@@ -296,6 +305,10 @@ pub struct CudaDeviceProp {
     // original name: cooperativeMultiDeviceLaunch
     pub cooperative_multi_device_launch: i32,
 
+    // Per device maximum shared memory per block usable by special opt in
+    // original name: sharedMemPerBlockOptin
+    pub shared_mem_per_block_optin: usize,
+
     // 1 if the device accesses pageable memory via the host's page tables, and 0 otherwise.
     // original name: pageableMemoryAccessUsesHostPageTables
     pub pageable_memory_access_uses_host_page_tables: i32,
@@ -311,6 +324,10 @@ pub struct CudaDeviceProp {
     // Maximum value of cudaAccessPolicyWindow::num_bytes.
     // original name: accessPolicyMaxWindowSize
     pub access_policy_max_window_size: i32,
+
+    // Shared memory reserved by CUDA driver per block in bytes
+    // original name: reservedSharedMemPerBlock
+    pub reserved_shared_mem_per_block: usize,
 }
 
 impl Default for CudaDeviceProp {

From df7073d4d78e84eb53b19a09b941b4ccc5622b79 Mon Sep 17 00:00:00 2001
From: "Micah Chambers (minerva)" <mchambers@anduril.com>
Date: Tue, 26 Aug 2025 10:48:21 -0700
Subject: [PATCH 7/7] add missing

---
 src/ffi/device.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/ffi/device.rs b/src/ffi/device.rs
index 7632fbc..6f6334f 100644
--- a/src/ffi/device.rs
+++ b/src/ffi/device.rs
@@ -1,6 +1,3 @@
-use std::ffi::c_char;
-use std::ffi::c_int;
-
 use cpp::cpp;
 
 use crate::device::DeviceId;
@@ -9,6 +6,7 @@ use crate::ffi::result;
 
 type Result<T> = std::result::Result<T, crate::error::Error>;
 
+// Post CUDA 11, NVIDIA has only appended to this struct
 #[derive(Clone, Debug)]
 #[repr(C)]
 pub struct CudaDeviceProp {
@@ -335,6 +333,8 @@ impl Default for CudaDeviceProp {
         CudaDeviceProp {
             name: [0; 256],
             uuid: [0; 16],
+            luid: [0; 8],
+            luid_device_node_mask: 0,
             total_global_mem: 0,
             shared_mem_per_block: 0,
             regs_per_block: 0,
@@ -408,6 +408,9 @@ impl Default for CudaDeviceProp {
             direct_managed_mem_access_from_host: 0,
             max_blocks_per_multi_processor: 0,
             access_policy_max_window_size: 0,
+            host_native_atomic_supported: 0,
+            reserved_shared_mem_per_block: 0,
+            shared_mem_per_block_optin: 0,
         }
     }
 }