Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl
index 5981ad3d0..61000465c 100644
--- a/lib/cudadrv/memory.jl
+++ b/lib/cudadrv/memory.jl
@@ -411,7 +411,7 @@ for (fn, srcPtrTy, dstPtrTy) in (("cuMemcpyDtoHAsync_v2", :CuPtr, :Ptr),
@eval function Base.unsafe_copyto!(dst::$dstPtrTy{T}, src::$srcPtrTy{T}, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- $(getproperty(CUDA, Symbol(fn)))(dst, src, N*aligned_sizeof(T), stream)
+ $(getproperty(CUDA, Symbol(fn)))(dst, src, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
@@ -423,11 +423,12 @@ function Base.unsafe_copyto!(dst::CuPtr{T}, src::CuPtr{T}, N::Integer;
dst_dev = device(dst)
src_dev = device(src)
if dst_dev == src_dev
- cuMemcpyDtoDAsync_v2(dst, src, N*aligned_sizeof(T), stream)
+ cuMemcpyDtoDAsync_v2(dst, src, N * aligned_sizeof(T), stream)
else
cuMemcpyPeerAsync(dst, context(dst_dev),
src, context(src_dev),
- N*aligned_sizeof(T), stream)
+ N * aligned_sizeof(T), stream
+ )
end
async || synchronize(stream)
return dst
@@ -436,7 +437,7 @@ end
function Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::Ptr{T}, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- cuMemcpyHtoAAsync_v2(dst, doffs, src, N*aligned_sizeof(T), stream)
+ cuMemcpyHtoAAsync_v2(dst, doffs, src, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
@@ -444,16 +445,16 @@ end
function Base.unsafe_copyto!(dst::Ptr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- cuMemcpyAtoHAsync_v2(dst, src, soffs, N*aligned_sizeof(T), stream)
+ cuMemcpyAtoHAsync_v2(dst, src, soffs, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::CuPtr{T}, N::Integer) where {T} =
- cuMemcpyDtoA_v2(dst, doffs, src, N*aligned_sizeof(T))
+ cuMemcpyDtoA_v2(dst, doffs, src, N * aligned_sizeof(T))
Base.unsafe_copyto!(dst::CuPtr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer) where {T} =
- cuMemcpyAtoD_v2(dst, src, soffs, N*aligned_sizeof(T))
+ cuMemcpyAtoD_v2(dst, src, soffs, N * aligned_sizeof(T))
Base.unsafe_copyto!(dst::CuArrayPtr, src, N::Integer; kwargs...) =
Base.unsafe_copyto!(dst, 0, src, N; kwargs...)
@@ -529,15 +530,15 @@ function unsafe_copy2d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
params_ref = Ref(CUDA_MEMCPY2D(
# source
- (srcPos.x-1)*aligned_sizeof(T), srcPos.y-1,
+ (srcPos.x - 1) * aligned_sizeof(T), srcPos.y - 1,
srcMemoryType, srcHost, srcDevice, srcArray,
srcPitch,
# destination
- (dstPos.x-1)*aligned_sizeof(T), dstPos.y-1,
+ (dstPos.x - 1) * aligned_sizeof(T), dstPos.y - 1,
dstMemoryType, dstHost, dstDevice, dstArray,
dstPitch,
# extent
- width*aligned_sizeof(T), height
+ width * aligned_sizeof(T), height
))
cuMemcpy2DAsync_v2(params_ref, stream)
async || synchronize(stream)
@@ -569,8 +570,8 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
# when using the stream-ordered memory allocator
# NOTE: we apply the workaround unconditionally, since we want to keep this call cheap.
if v"11.2" <= driver_version() <= v"11.3" #&& pools[device()].stream_ordered
- srcOffset = (srcPos.x-1)*aligned_sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
- dstOffset = (dstPos.x-1)*aligned_sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
+ srcOffset = (srcPos.x - 1) * aligned_sizeof(T) + srcPitch * ((srcPos.y - 1) + srcHeight * (srcPos.z - 1))
+ dstOffset = (dstPos.x - 1) * aligned_sizeof(T) + dstPitch * ((dstPos.y - 1) + dstHeight * (dstPos.z - 1))
else
srcOffset = 0
dstOffset = 0
@@ -622,7 +623,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
params_ref = Ref(CUDA_MEMCPY3D(
# source
- srcOffset==0 ? (srcPos.x-1)*aligned_sizeof(T) : 0,
+ srcOffset == 0 ? (srcPos.x - 1) * aligned_sizeof(T) : 0,
srcOffset==0 ? srcPos.y-1 : 0,
srcOffset==0 ? srcPos.z-1 : 0,
0, # LOD
@@ -630,7 +631,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
C_NULL, # reserved
srcPitch, srcHeight,
# destination
- dstOffset==0 ? (dstPos.x-1)*aligned_sizeof(T) : 0,
+ dstOffset == 0 ? (dstPos.x - 1) * aligned_sizeof(T) : 0,
dstOffset==0 ? dstPos.y-1 : 0,
dstOffset==0 ? dstPos.z-1 : 0,
0, # LOD
@@ -638,7 +639,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
C_NULL, # reserved
dstPitch, dstHeight,
# extent
- width*aligned_sizeof(T), height, depth
+ width * aligned_sizeof(T), height, depth
))
cuMemcpy3DAsync_v2(params_ref, stream)
async || synchronize(stream)
diff --git a/src/CUDA.jl b/src/CUDA.jl
index a524f4eac..a430cf439 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -53,9 +53,9 @@ using Printf
# Both of them are equivalent for immutable objects, but differ for mutable singtons and Symbol
# We use `aligned_sizeof` since we care about the size of a type in an array
@static if VERSION < v"1.11.0"
- @generated function aligned_sizeof(::Type{T}) where T
+ @generated function aligned_sizeof(::Type{T}) where {T}
return :($(Base.aligned_sizeof(T)))
- end
+ end
else
import Base: aligned_sizeof
end
diff --git a/src/array.jl b/src/array.jl
index dbf3949b6..e69a398d0 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -67,7 +67,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
function CuArray{T,N,M}(::UndefInitializer, dims::Dims{N}) where {T,N,M}
check_eltype("CuArray", T)
- maxsize = prod(dims) * aligned_sizeof(T)
+ maxsize = prod(dims) * aligned_sizeof(T)
bufsize = if Base.isbitsunion(T)
# type tag array past the data
maxsize + prod(dims)
@@ -84,7 +84,8 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
end
function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N};
- maxsize::Int=prod(dims) * aligned_sizeof(T), offset::Int=0) where {T,N,M}
+ maxsize::Int = prod(dims) * aligned_sizeof(T), offset::Int = 0
+ ) where {T, N, M}
check_eltype("CuArray", T)
obj = new{T,N,M}(data, maxsize, offset, dims)
finalizer(unsafe_free!, obj)
@@ -235,7 +236,7 @@ function Base.unsafe_wrap(::Type{CuArray{T,N,M}},
ptr::CuPtr{T}, dims::NTuple{N,Int};
own::Bool=false, ctx::CuContext=context()) where {T,N,M}
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
- sz = prod(dims) * aligned_sizeof(T)
+ sz = prod(dims) * aligned_sizeof(T)
# create a memory object
mem = if M == UnifiedMemory
@@ -290,7 +291,7 @@ supports_hmm(dev) = driver_version() >= v"12.2" &&
function Base.unsafe_wrap(::Type{CuArray{T,N,M}}, p::Ptr{T}, dims::NTuple{N,Int};
ctx::CuContext=context()) where {T,N,M<:AbstractMemory}
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
- sz = prod(dims) * aligned_sizeof(T)
+ sz = prod(dims) * aligned_sizeof(T)
data = if M == UnifiedMemory
# HMM extends unified memory to include system memory
@@ -837,7 +838,7 @@ end
## derived arrays
function GPUArrays.derive(::Type{T}, a::CuArray, dims::Dims{N}, offset::Int) where {T,N}
- offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
+ offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
CuArray{T,N}(copy(a.data), dims; a.maxsize, offset)
end
@@ -851,7 +852,7 @@ function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{
end
function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P}
return Base.unsafe_convert(CuPtr{T}, parent(V)) +
- (Base.first_index(V)-1)*aligned_sizeof(T)
+ (Base.first_index(V) - 1) * aligned_sizeof(T)
end
@@ -874,7 +875,7 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T
n == length(A) && return A
# TODO: add additional space to allow for quicker resizing
- maxsize = n * aligned_sizeof(T)
+ maxsize = n * aligned_sizeof(T)
bufsize = if isbitstype(T)
maxsize
else
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 93a232d04..b48d7073b 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -305,7 +305,7 @@ function compile(@nospecialize(job::CompilerJob))
continue
end
name = source_argnames[i]
- details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
+ details *= "\n [$(i - 1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
end
details *= "\n"
diff --git a/src/device/array.jl b/src/device/array.jl
index 59322349c..0adb40c85 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -29,7 +29,8 @@ struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
- maxsize::Int=prod(dims)*aligned_sizeof(T)) where {T,A,N} =
+ maxsize::Int = prod(dims) * aligned_sizeof(T)
+ ) where {T, A, N} =
new(ptr, maxsize, dims, prod(dims))
end
@@ -239,12 +240,12 @@ function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
err = GPUArrays._reinterpret_exception(T, a)
err === nothing || throw(err)
- if aligned_sizeof(T) == aligned_sizeof(S) # fast case
+ if aligned_sizeof(T) == aligned_sizeof(S) # fast case
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
end
isize = size(a)
- size1 = div(isize[1]*aligned_sizeof(S), aligned_sizeof(T))
+ size1 = div(isize[1] * aligned_sizeof(S), aligned_sizeof(T))
osize = tuple(size1, Base.tail(isize)...)
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
end |
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 71da935 | Previous: c113666 | Ratio |
|---|---|---|---|
latency/precompile |
49238463615.5 ns |
48835237613 ns |
1.01 |
latency/ttfp |
7233454087 ns |
7263249839 ns |
1.00 |
latency/import |
3466573189 ns |
3465493701 ns |
1.00 |
integration/volumerhs |
9622632.5 ns |
9621812 ns |
1.00 |
integration/byval/slices=1 |
146963 ns |
147442 ns |
1.00 |
integration/byval/slices=3 |
425235 ns |
425693 ns |
1.00 |
integration/byval/reference |
145230 ns |
145206 ns |
1.00 |
integration/byval/slices=2 |
286099 ns |
286758 ns |
1.00 |
integration/cudadevrt |
103434 ns |
103612 ns |
1.00 |
kernel/indexing |
14000.5 ns |
14399 ns |
0.97 |
kernel/indexing_checked |
14794 ns |
15068 ns |
0.98 |
kernel/occupancy |
714.9477611940298 ns |
702.0419580419581 ns |
1.02 |
kernel/launch |
2264 ns |
2258.4444444444443 ns |
1.00 |
kernel/rand |
14502 ns |
16717 ns |
0.87 |
array/reverse/1d |
19484.5 ns |
20195 ns |
0.96 |
array/reverse/2d |
24764 ns |
24757 ns |
1.00 |
array/reverse/1d_inplace |
10195 ns |
11401 ns |
0.89 |
array/reverse/2d_inplace |
10844 ns |
13509 ns |
0.80 |
array/copy |
20748 ns |
21451.5 ns |
0.97 |
array/iteration/findall/int |
157574 ns |
159568 ns |
0.99 |
array/iteration/findall/bool |
138648 ns |
140978 ns |
0.98 |
array/iteration/findfirst/int |
153631.5 ns |
155188 ns |
0.99 |
array/iteration/findfirst/bool |
154431.5 ns |
155942 ns |
0.99 |
array/iteration/scalar |
71660 ns |
73335 ns |
0.98 |
array/iteration/logical |
214107.5 ns |
214260.5 ns |
1.00 |
array/iteration/findmin/1d |
41621 ns |
43021 ns |
0.97 |
array/iteration/findmin/2d |
94009 ns |
94859 ns |
0.99 |
array/reductions/reduce/1d |
44188 ns |
37155 ns |
1.19 |
array/reductions/reduce/2d |
42692.5 ns |
41718 ns |
1.02 |
array/reductions/mapreduce/1d |
39533.5 ns |
35382 ns |
1.12 |
array/reductions/mapreduce/2d |
41897.5 ns |
52285.5 ns |
0.80 |
array/broadcast |
20666 ns |
21333 ns |
0.97 |
array/copyto!/gpu_to_gpu |
13740 ns |
11882 ns |
1.16 |
array/copyto!/cpu_to_gpu |
208011 ns |
211668 ns |
0.98 |
array/copyto!/gpu_to_cpu |
244959 ns |
244976.5 ns |
1.00 |
array/accumulate/1d |
109544 ns |
110064 ns |
1.00 |
array/accumulate/2d |
80451 ns |
81465 ns |
0.99 |
array/construct |
1326.1 ns |
1277.4 ns |
1.04 |
array/random/randn/Float32 |
49877 ns |
45327.5 ns |
1.10 |
array/random/randn!/Float32 |
26410 ns |
27071 ns |
0.98 |
array/random/rand!/Int64 |
27000 ns |
27414 ns |
0.98 |
array/random/rand!/Float32 |
8538.333333333334 ns |
8820 ns |
0.97 |
array/random/rand/Int64 |
38105 ns |
38724 ns |
0.98 |
array/random/rand/Float32 |
13098 ns |
13425 ns |
0.98 |
array/permutedims/4d |
61290 ns |
61677 ns |
0.99 |
array/permutedims/2d |
55674 ns |
55974 ns |
0.99 |
array/permutedims/3d |
56742 ns |
56818 ns |
1.00 |
array/sorting/1d |
2775548 ns |
2778484 ns |
1.00 |
array/sorting/by |
3366359 ns |
3369313 ns |
1.00 |
array/sorting/2d |
1084810 ns |
1086275.5 ns |
1.00 |
cuda/synchronization/stream/auto |
1035.090909090909 ns |
1003.9230769230769 ns |
1.03 |
cuda/synchronization/stream/nonblocking |
6540.2 ns |
6655.2 ns |
0.98 |
cuda/synchronization/stream/blocking |
797.0194174757281 ns |
783.8139534883721 ns |
1.02 |
cuda/synchronization/context/auto |
1153.4 ns |
1147.9 ns |
1.00 |
cuda/synchronization/context/nonblocking |
6778.2 ns |
6850.8 ns |
0.99 |
cuda/synchronization/context/blocking |
898.94 ns |
947.1111111111111 ns |
0.95 |
This comment was automatically generated by workflow using github-action-benchmark.
|
CI failure related — it looks like this isn't optimized by codegen / const-propped away. Probably should be fixed upstream? In the meantime, we can use a shadowed copy that has the necessary const-prop annotations, or an equivalent annotation on the import. Could you also document at the import site when we should use `aligned_sizeof` (as opposed to plain `sizeof`)?
d812291 to
2364bc0
Compare
Codecov Report — Attention: Patch coverage is
Additional details and impacted files: @@ Coverage Diff @@
## master #2757 +/- ##
=======================================
Coverage 89.43% 89.43%
=======================================
Files 153 153
Lines 13185 13186 +1
=======================================
+ Hits 11792 11793 +1
Misses 1393 1393 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
2364bc0 to
71da935
Compare
Base itself uses
`aligned_sizeof` for things like `elsize(::Array)`. `aligned_sizeof` calculates the inline sizeof, in contrast to the memory sizeof. This matters for `Symbol`s and
`mutable` singleton objects; x-ref #2753