From 0e51ef9f17bf911883f5c8f07821207654983084 Mon Sep 17 00:00:00 2001 From: "Micah Chambers (eos)" Date: Thu, 7 Aug 2025 00:18:36 -0700 Subject: [PATCH 1/7] add pci stuff --- src/ffi/device.rs | 165 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) diff --git a/src/ffi/device.rs b/src/ffi/device.rs index 4044b6b..3074dee 100644 --- a/src/ffi/device.rs +++ b/src/ffi/device.rs @@ -1,3 +1,6 @@ +use std::ffi::c_char; +use std::ffi::c_int; + use cpp::cpp; use crate::device::DeviceId; @@ -6,6 +9,70 @@ use crate::ffi::result; type Result = std::result::Result; +#[derive(Clone, Debug)] +#[repr(C)] +pub struct CudaDeviceProperties { + pub can_map_host_memory: i32, + pub clock_rate: i32, + pub device_overlap: i32, + pub kernel_exec_timeout_enabled: i32, + pub major: i32, + pub max_grid_size: [i32; 3], + pub max_threads_dim: [i32; 3], + pub max_threads_per_block: i32, + pub mem_pitch: usize, + pub memory_bus_width: i32, + pub memory_clock_rate: i32, + pub minor: i32, + pub multi_processor_count: i32, + pub name: [c_char; 256], + pub regs_per_block: i32, + pub shared_mem_per_block: usize, + pub texture_alignment: usize, + pub total_const_mem: usize, + pub total_global_mem: usize, + pub warp_size: i32, + pub pci_bus_id: i32, + pub pci_device_id: i32, + pub pci_domain_id: i32, +} + +impl Default for CudaDeviceProperties { + fn default() -> Self { + Self { + clock_rate: Default::default(), + device_overlap: Default::default(), + kernel_exec_timeout_enabled: Default::default(), + major: Default::default(), + max_grid_size: Default::default(), + can_map_host_memory: Default::default(), + max_threads_dim: Default::default(), + max_threads_per_block: Default::default(), + mem_pitch: Default::default(), + memory_bus_width: Default::default(), + memory_clock_rate: Default::default(), + minor: Default::default(), + multi_processor_count: Default::default(), + name: [0; 256], + regs_per_block: Default::default(), + shared_mem_per_block: Default::default(), + texture_alignment: Default::default(), + total_const_mem: Default::default(), + total_global_mem: Default::default(), + warp_size: Default::default(), + pci_bus_id: 0, + pci_device_id: 0, + pci_domain_id: 0, + } + } +} + +#[derive(Clone, Debug, Default)] +#[repr(C)] +pub struct CudaUUID { + pub bytes: [u8; 16], +} + /// Synchronous implementation of [`crate::num_devices`]. /// /// Refer to [`crate::num_devices`] for documentation. @@ -54,6 +121,98 @@ impl Device { result!(ret) } + #[inline] + pub fn get_properties(id: DeviceId) -> Result { + let mut props = CudaDeviceProperties::default(); + cpp! {{ + #include + #include + }} + let can_map_host_memory_ptr = std::ptr::addr_of_mut!(props.can_map_host_memory); + let clock_rate_ptr = std::ptr::addr_of_mut!(props.clock_rate); + let device_overlap_ptr = std::ptr::addr_of_mut!(props.device_overlap); + let kernel_exec_timeout_enabled_ptr = + std::ptr::addr_of_mut!(props.kernel_exec_timeout_enabled); + let major_ptr = std::ptr::addr_of_mut!(props.major); + let max_grid_size_ptr = std::ptr::addr_of_mut!(props.max_grid_size); + let max_threads_dim_ptr = std::ptr::addr_of_mut!(props.max_threads_dim); + let max_threads_per_block_ptr = std::ptr::addr_of_mut!(props.max_threads_per_block); + let mem_pitch_ptr = std::ptr::addr_of_mut!(props.mem_pitch); + let memory_bus_width_ptr = std::ptr::addr_of_mut!(props.memory_bus_width); + let memory_clock_rate_ptr = std::ptr::addr_of_mut!(props.memory_clock_rate); + let minor_ptr = std::ptr::addr_of_mut!(props.minor); + let multi_processor_count_ptr = std::ptr::addr_of_mut!(props.multi_processor_count); + let name_ptr = std::ptr::addr_of_mut!(props.name); + let regs_per_block_ptr = std::ptr::addr_of_mut!(props.regs_per_block); + let shared_mem_per_block_ptr = std::ptr::addr_of_mut!(props.shared_mem_per_block); + let texture_alignment_ptr = std::ptr::addr_of_mut!(props.texture_alignment); + let total_const_mem_ptr = std::ptr::addr_of_mut!(props.total_const_mem); + let total_global_mem_ptr = std::ptr::addr_of_mut!(props.total_global_mem); + let warp_size_ptr = std::ptr::addr_of_mut!(props.warp_size); + let pci_bus_id_ptr = std::ptr::addr_of_mut!(props.pci_bus_id); + let pci_device_id_ptr = std::ptr::addr_of_mut!(props.pci_device_id); + let pci_domain_id_ptr = std::ptr::addr_of_mut!(props.pci_domain_id); + + let ret = cpp!(unsafe [ + id as "int", + can_map_host_memory_ptr as "int*", + clock_rate_ptr as "int*", + device_overlap_ptr as "int*", + kernel_exec_timeout_enabled_ptr as "int*", + major_ptr as "int*", + max_grid_size_ptr as "int*", + max_threads_dim_ptr as "int*", + max_threads_per_block_ptr as "int*", + mem_pitch_ptr as "unsigned long long*", + memory_bus_width_ptr as "int*", + memory_clock_rate_ptr as "int*", + minor_ptr as "int*", + multi_processor_count_ptr as "int*", + name_ptr as "char*", + regs_per_block_ptr as "int*", + shared_mem_per_block_ptr as "unsigned long long*", + texture_alignment_ptr as "unsigned long long*", + total_const_mem_ptr as "unsigned long long*", + total_global_mem_ptr as "unsigned long long*", + warp_size_ptr as "int*", + pci_bus_id_ptr as "int*", + pci_domain_id_ptr as "int*", + pci_device_id_ptr as "int*" + ] -> i32 as "int" { + cudaDeviceProp props; + const auto ret = cudaGetDeviceProperties(&props, id); + *can_map_host_memory_ptr = props.canMapHostMemory; + *clock_rate_ptr = props.clockRate; + *device_overlap_ptr = props.deviceOverlap; + *kernel_exec_timeout_enabled_ptr = props.kernelExecTimeoutEnabled; + *major_ptr = props.major; + + for(int i = 0; i < 3; ++i) { + max_grid_size_ptr[i] = props.maxGridSize[i]; + max_threads_dim_ptr[i] = props.maxThreadsDim[i]; + } + *max_threads_per_block_ptr = props.maxThreadsPerBlock; + *mem_pitch_ptr = props.memPitch; + *memory_bus_width_ptr = props.memoryBusWidth; + *memory_clock_rate_ptr = props.memoryClockRate; + *minor_ptr = props.minor; + *multi_processor_count_ptr = props.multiProcessorCount; + *regs_per_block_ptr = props.regsPerBlock; + *shared_mem_per_block_ptr = props.sharedMemPerBlock; + *texture_alignment_ptr = props.textureAlignment; + *total_const_mem_ptr = props.totalConstMem; + *total_global_mem_ptr = props.totalGlobalMem; + *warp_size_ptr = props.warpSize; + *pci_bus_id_ptr = props.pciBusID; + *pci_device_id_ptr = props.pciDeviceID; + *pci_domain_id_ptr = props.pciDomainID; + memcpy(name_ptr, props.name, 256); + return ret; + }); + + result!(ret, props) + } + #[inline(always)] pub fn set_or_panic(id: DeviceId) { Device::set(id).unwrap_or_else(|err| panic!("failed to set device {id}: {err}")); @@ -107,6 +266,12 @@ mod tests { assert!(Device::synchronize().is_ok()); } + #[test] + fn test_device_properties() { + assert!(Device::get_properties(0).is_ok()); + eprintln!("{:?}", Device::get_properties(0)); + } + #[test] fn test_memory_info() { let memory_info = Device::memory_info().unwrap(); From 6c1010c298b738996fd6d31df21fbe790bdc3c92 Mon Sep 17 00:00:00 2001 From: "Micah Chambers (eos)" Date: Thu, 7 Aug 2025 00:35:43 -0700 Subject: [PATCH 2/7] this is better --- src/ffi/device.rs | 499 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 378 insertions(+), 121 deletions(-) diff --git a/src/ffi/device.rs b/src/ffi/device.rs index 3074dee..5672c99 100644 --- a/src/ffi/device.rs +++ b/src/ffi/device.rs @@ -11,68 +11,390 @@ type Result = std::result::Result; #[derive(Clone, Debug)] #[repr(C)] -pub struct CudaDeviceProperties { - pub can_map_host_memory: i32, +pub struct CudaDeviceProp { + // ASCII string identifying the device. + // original name: name[256] + pub name: [i8; 256], + + // 16-byte unique identifier. + // original name: uuid + pub uuid: [u8; 16], + + // Total amount of global memory available on the device in bytes. + // original name: totalGlobalMem + pub total_global_mem: usize, + + // Maximum amount of shared memory available to a thread block in bytes. + // original name: sharedMemPerBlock + pub shared_mem_per_block: usize, + + // Maximum number of 32-bit registers available to a thread block. + // original name: regsPerBlock + pub regs_per_block: i32, + + // Warp size in threads. + // original name: warpSize + pub warp_size: i32, + + // Maximum pitch in bytes allowed by the memory copy functions that involve memory regions allocated through cudaMallocPitch(). + // original name: memPitch + pub mem_pitch: usize, + + // Maximum number of threads per block. + // original name: maxThreadsPerBlock + pub max_threads_per_block: i32, + + // Maximum size of each dimension of a block. + // original name: maxThreadsDim[3] + pub max_threads_dim: [i32; 3], + + // Maximum size of each dimension of a grid. + // original name: maxGridSize[3] + pub max_grid_size: [i32; 3], + + // Clock frequency in kilohertz. + // original name: clockRate pub clock_rate: i32, - pub device_overlap: i32, - pub kernel_exec_timeout_enabled: i32, + + // Total amount of constant memory available on the device in bytes. + // original name: totalConstMem + pub total_const_mem: usize, + + // Major revision number defining the device's compute capability. + // original name: major pub major: i32, - pub max_grid_size: [i32; 3], - pub max_threads_dim: [i32; 3], - pub max_threads_per_block: i32, - pub mem_pitch: usize, - pub memory_bus_width: i32, - pub memory_clock_rate: i32, + + // Minor revision number defining the device's compute capability. + // original name: minor pub minor: i32, - pub multi_processor_count: i32, - pub name: [c_char; 256], - pub regs_per_block: i32, - pub shared_mem_per_block: usize, + + // Alignment requirement; texture base addresses that are aligned to texture_alignment bytes do not need an offset applied to texture fetches. + // original name: textureAlignment pub texture_alignment: usize, - pub total_const_mem: usize, - pub total_global_mem: usize, - pub warp_size: i32, + + // Pitch alignment requirement for 2D texture references that are bound to pitched memory. + // original name: texturePitchAlignment + pub texture_pitch_alignment: usize, + + // 1 if the device can concurrently copy memory between host and device while executing a kernel, or 0 if not. + // original name: deviceOverlap + pub device_overlap: i32, + + // Number of multiprocessors on the device. + // original name: multiProcessorCount + pub multi_processor_count: i32, + + // 1 if there is a run time limit for kernels executed on the device, or 0 if not. + // original name: kernelExecTimeoutEnabled + pub kernel_exec_timeout_enabled: i32, + + // 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component. + // original name: integrated + pub integrated: i32, + + // 1 if the device can map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer(), or 0 if not. + // original name: canMapHostMemory + pub can_map_host_memory: i32, + + // Compute mode that the device is currently in. + // original name: computeMode + pub compute_mode: i32, + + // Maximum 1D texture size. + // original name: maxTexture1D + pub max_texture1d: i32, + + // Maximum 1D mipmapped texture size. + // original name: maxTexture1DMipmap + pub max_texture1d_mipmap: i32, + + // Maximum 1D texture size for textures bound to linear memory. + // original name: maxTexture1DLinear + pub max_texture1d_linear: i32, + + // Maximum 2D texture dimensions. + // original name: maxTexture2D[2] + pub max_texture2d: [i32; 2], + + // Maximum 2D mipmapped texture dimensions. + // original name: maxTexture2DMipmap[2] + pub max_texture2d_mipmap: [i32; 2], + + // Maximum 2D texture dimensions for 2D textures bound to pitch linear memory. + // original name: maxTexture2DLinear[3] + pub max_texture2d_linear: [i32; 3], + + // Maximum 2D texture dimensions if texture gather operations have to be performed. + // original name: maxTexture2DGather[2] + pub max_texture2d_gather: [i32; 2], + + // Maximum 3D texture dimensions. + // original name: maxTexture3D[3] + pub max_texture3d: [i32; 3], + + // Maximum alternate 3D texture dimensions. + // original name: maxTexture3DAlt[3] + pub max_texture3d_alt: [i32; 3], + + // Maximum cubemap texture width or height. + // original name: maxTextureCubemap + pub max_texture_cubemap: i32, + + // Maximum 1D layered texture dimensions. + // original name: maxTexture1DLayered[2] + pub max_texture1d_layered: [i32; 2], + + // Maximum 2D layered texture dimensions. + // original name: maxTexture2DLayered[3] + pub max_texture2d_layered: [i32; 3], + + // Maximum cubemap layered texture dimensions. + // original name: maxTextureCubemapLayered[2] + pub max_texture_cubemap_layered: [i32; 2], + + // Maximum 1D surface size. + // original name: maxSurface1D + pub max_surface1d: i32, + + // Maximum 2D surface dimensions. + // original name: maxSurface2D[2] + pub max_surface2d: [i32; 2], + + // Maximum 3D surface dimensions. + // original name: maxSurface3D[3] + pub max_surface3d: [i32; 3], + + // Maximum 1D layered surface dimensions. + // original name: maxSurface1DLayered[2] + pub max_surface1d_layered: [i32; 2], + + // Maximum 2D layered surface dimensions. + // original name: maxSurface2DLayered[3] + pub max_surface2d_layered: [i32; 3], + + // Maximum cubemap surface width or height. + // original name: maxSurfaceCubemap + pub max_surface_cubemap: i32, + + // Maximum cubemap layered surface dimensions. + // original name: maxSurfaceCubemapLayered[2] + pub max_surface_cubemap_layered: [i32; 2], + + // Alignment requirements for surfaces. + // original name: surfaceAlignment + pub surface_alignment: usize, + + // 1 if the device supports executing multiple kernels within the same context simultaneously, or 0 if not. + // original name: concurrentKernels + pub concurrent_kernels: i32, + + // 1 if the device has ECC support turned on, or 0 if not. + // original name: ECCEnabled + pub ecc_enabled: i32, + + // PCI bus identifier of the device. + // original name: pciBusID pub pci_bus_id: i32, + + // PCI device (sometimes called slot) identifier of the device. + // original name: pciDeviceID pub pci_device_id: i32, + + // PCI domain identifier of the device. + // original name: pciDomainID pub pci_domain_id: i32, + + // 1 if the device is using a TCC driver or 0 if not. + // original name: tccDriver + pub tcc_driver: i32, + + // 1 when the device can concurrently copy memory between host and device while executing a kernel. + // original name: asyncEngineCount + pub async_engine_count: i32, + + // 1 if the device shares a unified address space with the host and 0 otherwise. + // original name: unifiedAddressing + pub unified_addressing: i32, + + // Peak memory clock frequency in kilohertz. + // original name: memoryClockRate + pub memory_clock_rate: i32, + + // Memory bus width in bits. + // original name: memoryBusWidth + pub memory_bus_width: i32, + + // L2 cache size in bytes. + // original name: l2CacheSize + pub l2_cache_size: i32, + + // L2 cache's maximum persisting lines size in bytes. + // original name: persistingL2CacheMaxSize + pub persisting_l2_cache_max_size: i32, + + // Number of maximum resident threads per multiprocessor. + // original name: maxThreadsPerMultiProcessor + pub max_threads_per_multi_processor: i32, + + // 1 if the device supports stream priorities, or 0 if it is not supported. + // original name: streamPrioritiesSupported + pub stream_priorities_supported: i32, + + // 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported. + // original name: globalL1CacheSupported + pub global_l1_cache_supported: i32, + + // 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported. + // original name: localL1CacheSupported + pub local_l1_cache_supported: i32, + + // Maximum amount of shared memory available to a multiprocessor in bytes; this amount is shared by all thread blocks simultaneously resident on a multiprocessor. + // original name: sharedMemPerMultiprocessor + pub shared_mem_per_multiprocessor: usize, + + // Maximum number of 32-bit registers available to a multiprocessor; this number is shared by all thread blocks simultaneously resident on a multiprocessor. + // original name: regsPerMultiprocessor + pub regs_per_multiprocessor: i32, + + // 1 if the device supports allocating managed memory on this system, or 0 if it is not supported. + // original name: managedMemory + pub managed_memory: i32, + + // 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not. + // original name: isMultiGpuBoard + pub is_multi_gpu_board: i32, + + // Unique identifier for a group of devices associated with the same board. Devices on the same multi-GPU board will share the same identifier. + // original name: multiGpuBoardGroupID + pub multi_gpu_board_group_id: i32, + + // Ratio of single precision performance (in floating-point operations per second) to double precision performance. + // original name: singleToDoublePrecisionPerfRatio + pub single_to_double_precision_perf_ratio: i32, + + // 1 if the device supports coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise. + // original name: pageableMemoryAccess + pub pageable_memory_access: i32, + + // 1 if the device can coherently access managed memory concurrently with the CPU, and 0 otherwise. + // original name: concurrentManagedAccess + pub concurrent_managed_access: i32, + + // 1 if the device supports Compute Preemption, and 0 otherwise. + // original name: computePreemptionSupported + pub compute_preemption_supported: i32, + + // 1 if the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise. + // original name: canUseHostPointerForRegisteredMem + pub can_use_host_pointer_for_registered_mem: i32, + + // 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernel, and 0 otherwise. + // original name: cooperativeLaunch + pub cooperative_launch: i32, + + // 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise. + // original name: cooperativeMultiDeviceLaunch + pub cooperative_multi_device_launch: i32, + + // 1 if the device accesses pageable memory via the host's page tables, and 0 otherwise. + // original name: pageableMemoryAccessUsesHostPageTables + pub pageable_memory_access_uses_host_page_tables: i32, + + // 1 if the host can directly access managed memory on the device without migration, and 0 otherwise. + // original name: directManagedMemAccessFromHost + pub direct_managed_mem_access_from_host: i32, + + // Maximum number of thread blocks that can reside on a multiprocessor. + // original name: maxBlocksPerMultiProcessor + pub max_blocks_per_multi_processor: i32, + + // Maximum value of cudaAccessPolicyWindow::num_bytes. + // original name: accessPolicyMaxWindowSize + pub access_policy_max_window_size: i32, } -impl Default for CudaDeviceProperties { +impl Default for CudaDeviceProp { fn default() -> Self { - Self { - clock_rate: Default::default(), - device_overlap: Default::default(), - kernel_exec_timeout_enabled: Default::default(), - major: Default::default(), - max_grid_size: Default::default(), - can_map_host_memory: Default::default(), - max_threads_dim: Default::default(), - max_threads_per_block: Default::default(), - mem_pitch: Default::default(), - memory_bus_width: Default::default(), - memory_clock_rate: Default::default(), - minor: Default::default(), - multi_processor_count: Default::default(), + CudaDeviceProp { name: [0; 256], - regs_per_block: Default::default(), - shared_mem_per_block: Default::default(), - texture_alignment: Default::default(), - total_const_mem: Default::default(), - total_global_mem: Default::default(), - warp_size: Default::default(), + uuid: [0; 16], + total_global_mem: 0, + shared_mem_per_block: 0, + regs_per_block: 0, + warp_size: 0, + mem_pitch: 0, + max_threads_per_block: 0, + max_threads_dim: [0; 3], + max_grid_size: [0; 3], + clock_rate: 0, + total_const_mem: 0, + major: 0, + minor: 0, + texture_alignment: 0, + texture_pitch_alignment: 0, + device_overlap: 0, + multi_processor_count: 0, + kernel_exec_timeout_enabled: 0, + integrated: 0, + can_map_host_memory: 0, + compute_mode: 0, + max_texture1d: 0, + max_texture1d_mipmap: 0, + max_texture1d_linear: 0, + max_texture2d: [0; 2], + max_texture2d_mipmap: [0; 2], + max_texture2d_linear: [0; 3], + max_texture2d_gather: [0; 2], + max_texture3d: [0; 3], + max_texture3d_alt: [0; 3], + max_texture_cubemap: 0, + max_texture1d_layered: [0; 2], + max_texture2d_layered: [0; 3], + max_texture_cubemap_layered: [0; 2], + max_surface1d: 0, + max_surface2d: [0; 2], + max_surface3d: [0; 3], + max_surface1d_layered: [0; 2], + max_surface2d_layered: [0; 3], + max_surface_cubemap: 0, + max_surface_cubemap_layered: [0; 2], + surface_alignment: 0, + concurrent_kernels: 0, + ecc_enabled: 0, pci_bus_id: 0, pci_device_id: 0, pci_domain_id: 0, + tcc_driver: 0, + async_engine_count: 0, + unified_addressing: 0, + memory_clock_rate: 0, + memory_bus_width: 0, + l2_cache_size: 0, + persisting_l2_cache_max_size: 0, + max_threads_per_multi_processor: 0, + stream_priorities_supported: 0, + global_l1_cache_supported: 0, + local_l1_cache_supported: 0, + shared_mem_per_multiprocessor: 0, + regs_per_multiprocessor: 0, + managed_memory: 0, + is_multi_gpu_board: 0, + multi_gpu_board_group_id: 0, + single_to_double_precision_perf_ratio: 0, + pageable_memory_access: 0, + concurrent_managed_access: 0, + compute_preemption_supported: 0, + can_use_host_pointer_for_registered_mem: 0, + cooperative_launch: 0, + cooperative_multi_device_launch: 0, + pageable_memory_access_uses_host_page_tables: 0, + direct_managed_mem_access_from_host: 0, + max_blocks_per_multi_processor: 0, + access_policy_max_window_size: 0, } } } -#[derive(Clone, Debug, Default)] -#[repr(C)] -pub struct CudaUUID { - pub bytes: [u8; 16], -} - /// Synchronous implementation of [`crate::num_devices`]. /// /// Refer to [`crate::num_devices`] for documentation. @@ -122,91 +444,26 @@ impl Device { } #[inline] - pub fn get_properties(id: DeviceId) -> Result { - let mut props = CudaDeviceProperties::default(); + pub fn get_properties(id: DeviceId) -> Result { + let mut props = CudaDeviceProp::default(); + let props_len = std::mem::size_of::(); cpp! {{ #include #include + #include }} - let can_map_host_memory_ptr = std::ptr::addr_of_mut!(props.can_map_host_memory); - let clock_rate_ptr = std::ptr::addr_of_mut!(props.clock_rate); - let device_overlap_ptr = std::ptr::addr_of_mut!(props.device_overlap); - let kernel_exec_timeout_enabled_ptr = - std::ptr::addr_of_mut!(props.kernel_exec_timeout_enabled); - let major_ptr = std::ptr::addr_of_mut!(props.major); - let max_grid_size_ptr = std::ptr::addr_of_mut!(props.max_grid_size); - let max_threads_dim_ptr = std::ptr::addr_of_mut!(props.max_threads_dim); - let max_threads_per_block_ptr = std::ptr::addr_of_mut!(props.max_threads_per_block); - let mem_pitch_ptr = std::ptr::addr_of_mut!(props.mem_pitch); - let memory_bus_width_ptr = std::ptr::addr_of_mut!(props.memory_bus_width); - let memory_clock_rate_ptr = std::ptr::addr_of_mut!(props.memory_clock_rate); - let minor_ptr = std::ptr::addr_of_mut!(props.minor); - let multi_processor_count_ptr = std::ptr::addr_of_mut!(props.multi_processor_count); - let name_ptr = std::ptr::addr_of_mut!(props.name); - let regs_per_block_ptr = std::ptr::addr_of_mut!(props.regs_per_block); - let shared_mem_per_block_ptr = std::ptr::addr_of_mut!(props.shared_mem_per_block); - let texture_alignment_ptr = std::ptr::addr_of_mut!(props.texture_alignment); - let total_const_mem_ptr = std::ptr::addr_of_mut!(props.total_const_mem); - let total_global_mem_ptr = std::ptr::addr_of_mut!(props.total_global_mem); - let warp_size_ptr = std::ptr::addr_of_mut!(props.warp_size); - let pci_bus_id_ptr = std::ptr::addr_of_mut!(props.pci_bus_id); - let pci_device_id_ptr = std::ptr::addr_of_mut!(props.pci_device_id); - let pci_domain_id_ptr = std::ptr::addr_of_mut!(props.pci_domain_id); + let props_ptr = std::ptr::addr_of_mut!(props); let ret = cpp!(unsafe [ id as "int", - can_map_host_memory_ptr as "int*", - clock_rate_ptr as "int*", - device_overlap_ptr as "int*", - kernel_exec_timeout_enabled_ptr as "int*", - major_ptr as "int*", - max_grid_size_ptr as "int*", - max_threads_dim_ptr as "int*", - max_threads_per_block_ptr as "int*", - mem_pitch_ptr as "unsigned long long*", - memory_bus_width_ptr as "int*", - memory_clock_rate_ptr as "int*", - minor_ptr as "int*", - multi_processor_count_ptr as "int*", - name_ptr as "char*", - regs_per_block_ptr as "int*", - shared_mem_per_block_ptr as "unsigned long long*", - texture_alignment_ptr as "unsigned long long*", - total_const_mem_ptr as "unsigned long long*", - total_global_mem_ptr as "unsigned long long*", - warp_size_ptr as "int*", - pci_bus_id_ptr as "int*", - pci_domain_id_ptr as "int*", - pci_device_id_ptr as "int*" + props_len as "uint64_t", + props_ptr as "cudaDeviceProp*" ] -> i32 as "int" { - cudaDeviceProp props; - const auto ret = cudaGetDeviceProperties(&props, id); - *can_map_host_memory_ptr = props.canMapHostMemory; - *clock_rate_ptr = props.clockRate; - *device_overlap_ptr = props.deviceOverlap; - *kernel_exec_timeout_enabled_ptr = props.kernelExecTimeoutEnabled; - *major_ptr = props.major; - - for(int i = 0; i < 3; ++i) { - max_grid_size_ptr[i] = props.maxGridSize[i]; - max_threads_dim_ptr[i] = props.maxThreadsDim[i]; - } - *max_threads_per_block_ptr = props.maxThreadsPerBlock; - *mem_pitch_ptr = props.memPitch; - *memory_bus_width_ptr = props.memoryBusWidth; - *memory_clock_rate_ptr = props.memoryClockRate; - *minor_ptr = props.minor; - *multi_processor_count_ptr = props.multiProcessorCount; - *regs_per_block_ptr = props.regsPerBlock; - *shared_mem_per_block_ptr = props.sharedMemPerBlock; - *texture_alignment_ptr = props.textureAlignment; - *total_const_mem_ptr = props.totalConstMem; - *total_global_mem_ptr = props.totalGlobalMem; - *warp_size_ptr = props.warpSize; - *pci_bus_id_ptr = props.pciBusID; - *pci_device_id_ptr = props.pciDeviceID; - *pci_domain_id_ptr = props.pciDomainID; - memcpy(name_ptr, props.name, 256); + cudaDeviceProp prop; + const auto ret = cudaGetDeviceProperties(&prop, id); + + // Don't copy past the end, just in case cudaDeviceProp grows + memcpy(props_ptr, &prop, std::min(size_t(props_len), sizeof(prop))); return ret; }); From 9478726aa7268c58bea80a80cdc88a624a19f418 Mon Sep 17 00:00:00 2001 From: "Micah Chambers (minerva)" Date: Tue, 26 Aug 2025 09:25:53 -0700 Subject: [PATCH 3/7] make name u8 --- src/ffi/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ffi/device.rs b/src/ffi/device.rs index 5672c99..0a3650d 100644 --- a/src/ffi/device.rs +++ b/src/ffi/device.rs @@ -14,7 +14,7 @@ type Result = std::result::Result; pub struct CudaDeviceProp { // ASCII string identifying the device. // original name: name[256] - pub name: [i8; 256], + pub name: [u8; 256], // 16-byte unique identifier. // original name: uuid From e245e6f9175707078e1264490bc72df6c02523df Mon Sep 17 00:00:00 2001 From: "Micah Chambers (minerva)" Date: Tue, 26 Aug 2025 09:33:32 -0700 Subject: [PATCH 4/7] Revert "make name u8" This reverts commit 9478726aa7268c58bea80a80cdc88a624a19f418. --- src/ffi/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ffi/device.rs b/src/ffi/device.rs index 0a3650d..5672c99 100644 --- a/src/ffi/device.rs +++ b/src/ffi/device.rs @@ -14,7 +14,7 @@ type Result = std::result::Result; pub struct CudaDeviceProp { // ASCII string identifying the device. // original name: name[256] - pub name: [u8; 256], + pub name: [i8; 256], // 16-byte unique identifier. // original name: uuid From 30c4932079595857b44a50eeeca67e3d6706d70f Mon Sep 17 00:00:00 2001 From: "Micah Chambers (minerva)" Date: Tue, 26 Aug 2025 09:34:28 -0700 Subject: [PATCH 5/7] Reapply "make name u8" This reverts commit e245e6f9175707078e1264490bc72df6c02523df. --- src/ffi/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ffi/device.rs b/src/ffi/device.rs index 5672c99..0a3650d 100644 --- a/src/ffi/device.rs +++ b/src/ffi/device.rs @@ -14,7 +14,7 @@ type Result = std::result::Result; pub struct CudaDeviceProp { // ASCII string identifying the device. // original name: name[256] - pub name: [i8; 256], + pub name: [u8; 256], // 16-byte unique identifier. // original name: uuid From 402e79bf47e6c5c49935312d1b783ee50c2028f4 Mon Sep 17 00:00:00 2001 From: "Micah Chambers (minerva)" Date: Tue, 26 Aug 2025 09:48:38 -0700 Subject: [PATCH 6/7] add missing --- src/ffi/device.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/ffi/device.rs b/src/ffi/device.rs index 0a3650d..7632fbc 100644 --- a/src/ffi/device.rs +++ b/src/ffi/device.rs @@ -13,13 +13,18 @@ type Result = std::result::Result; #[repr(C)] pub struct CudaDeviceProp { // ASCII string identifying the device. - // original name: name[256] pub name: [u8; 256], // 16-byte unique identifier. - // original name: uuid pub uuid: [u8; 16], + // 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms + pub luid: [u8; 8], + + // LUID device node mask. Value is undefined on TCC and non-Windows platforms + // original name: luidDeviceNodeMask + pub luid_device_node_mask: u32, + // Total amount of global memory available on the device in bytes. // original name: totalGlobalMem pub total_global_mem: usize, @@ -268,6 +273,10 @@ pub struct CudaDeviceProp { // original name: multiGpuBoardGroupID pub multi_gpu_board_group_id: i32, + // Link between the device and the host supports native atomic operations + // original name: hostNativeAtomicSupported + host_native_atomic_supported: i32, + // Ratio of single precision performance (in floating-point operations per second) to double precision performance. // original name: singleToDoublePrecisionPerfRatio pub single_to_double_precision_perf_ratio: i32, @@ -296,6 +305,10 @@ pub struct CudaDeviceProp { // original name: cooperativeMultiDeviceLaunch pub cooperative_multi_device_launch: i32, + // Per device maximum shared memory per block usable by special opt in + // original name: sharedMemPerBlockOptin + pub shared_mem_per_block_optin: usize, + // 1 if the device accesses pageable memory via the host's page tables, and 0 otherwise. // original name: pageableMemoryAccessUsesHostPageTables pub pageable_memory_access_uses_host_page_tables: i32, @@ -311,6 +324,10 @@ pub struct CudaDeviceProp { // Maximum value of cudaAccessPolicyWindow::num_bytes. // original name: accessPolicyMaxWindowSize pub access_policy_max_window_size: i32, + + // Shared memory reserved by CUDA driver per block in bytes + // original name: reservedSharedMemPerBlock + pub reserved_shared_mem_per_block: usize, } impl Default for CudaDeviceProp { From df7073d4d78e84eb53b19a09b941b4ccc5622b79 Mon Sep 17 00:00:00 2001 From: "Micah Chambers (minerva)" Date: Tue, 26 Aug 2025 10:48:21 -0700 Subject: [PATCH 7/7] add missing --- src/ffi/device.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ffi/device.rs b/src/ffi/device.rs index 7632fbc..6f6334f 100644 --- a/src/ffi/device.rs +++ b/src/ffi/device.rs @@ -1,6 +1,3 @@ -use std::ffi::c_char; -use std::ffi::c_int; - use cpp::cpp; use crate::device::DeviceId; @@ -9,6 +6,7 @@ use crate::ffi::result; type Result = std::result::Result; +// Post CUDA 11, NVIDIA has only appended to this struct #[derive(Clone, Debug)] #[repr(C)] pub struct CudaDeviceProp { @@ -335,6 +333,8 @@ impl Default for CudaDeviceProp { CudaDeviceProp { name: [0; 256], uuid: [0; 16], + luid: [0; 8], + luid_device_node_mask: 0, total_global_mem: 0, shared_mem_per_block: 0, regs_per_block: 0, @@ -408,6 +408,9 @@ impl Default for CudaDeviceProp { direct_managed_mem_access_from_host: 0, max_blocks_per_multi_processor: 0, access_policy_max_window_size: 0, + host_native_atomic_supported: 0, + reserved_shared_mem_per_block: 0, + shared_mem_per_block_optin: 0, } } }