Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.

diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl
index 5981ad3d0..61000465c 100644
--- a/lib/cudadrv/memory.jl
+++ b/lib/cudadrv/memory.jl
@@ -411,7 +411,7 @@ for (fn, srcPtrTy, dstPtrTy) in (("cuMemcpyDtoHAsync_v2", :CuPtr, :Ptr),
@eval function Base.unsafe_copyto!(dst::$dstPtrTy{T}, src::$srcPtrTy{T}, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- $(getproperty(CUDA, Symbol(fn)))(dst, src, N*aligned_sizeof(T), stream)
+ $(getproperty(CUDA, Symbol(fn)))(dst, src, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
@@ -423,11 +423,12 @@ function Base.unsafe_copyto!(dst::CuPtr{T}, src::CuPtr{T}, N::Integer;
dst_dev = device(dst)
src_dev = device(src)
if dst_dev == src_dev
- cuMemcpyDtoDAsync_v2(dst, src, N*aligned_sizeof(T), stream)
+ cuMemcpyDtoDAsync_v2(dst, src, N * aligned_sizeof(T), stream)
else
cuMemcpyPeerAsync(dst, context(dst_dev),
src, context(src_dev),
- N*aligned_sizeof(T), stream)
+ N * aligned_sizeof(T), stream
+ )
end
async || synchronize(stream)
return dst
@@ -436,7 +437,7 @@ end
function Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::Ptr{T}, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- cuMemcpyHtoAAsync_v2(dst, doffs, src, N*aligned_sizeof(T), stream)
+ cuMemcpyHtoAAsync_v2(dst, doffs, src, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
@@ -444,16 +445,16 @@ end
function Base.unsafe_copyto!(dst::Ptr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer;
stream::CuStream=stream(),
async::Bool=false) where T
- cuMemcpyAtoHAsync_v2(dst, src, soffs, N*aligned_sizeof(T), stream)
+ cuMemcpyAtoHAsync_v2(dst, src, soffs, N * aligned_sizeof(T), stream)
async || synchronize(stream)
return dst
end
Base.unsafe_copyto!(dst::CuArrayPtr{T}, doffs::Integer, src::CuPtr{T}, N::Integer) where {T} =
- cuMemcpyDtoA_v2(dst, doffs, src, N*aligned_sizeof(T))
+ cuMemcpyDtoA_v2(dst, doffs, src, N * aligned_sizeof(T))
Base.unsafe_copyto!(dst::CuPtr{T}, src::CuArrayPtr{T}, soffs::Integer, N::Integer) where {T} =
- cuMemcpyAtoD_v2(dst, src, soffs, N*aligned_sizeof(T))
+ cuMemcpyAtoD_v2(dst, src, soffs, N * aligned_sizeof(T))
Base.unsafe_copyto!(dst::CuArrayPtr, src, N::Integer; kwargs...) =
Base.unsafe_copyto!(dst, 0, src, N; kwargs...)
@@ -529,15 +530,15 @@ function unsafe_copy2d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
params_ref = Ref(CUDA_MEMCPY2D(
# source
- (srcPos.x-1)*aligned_sizeof(T), srcPos.y-1,
+ (srcPos.x - 1) * aligned_sizeof(T), srcPos.y - 1,
srcMemoryType, srcHost, srcDevice, srcArray,
srcPitch,
# destination
- (dstPos.x-1)*aligned_sizeof(T), dstPos.y-1,
+ (dstPos.x - 1) * aligned_sizeof(T), dstPos.y - 1,
dstMemoryType, dstHost, dstDevice, dstArray,
dstPitch,
# extent
- width*aligned_sizeof(T), height
+ width * aligned_sizeof(T), height
))
cuMemcpy2DAsync_v2(params_ref, stream)
async || synchronize(stream)
@@ -569,8 +570,8 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
# when using the stream-ordered memory allocator
# NOTE: we apply the workaround unconditionally, since we want to keep this call cheap.
if v"11.2" <= driver_version() <= v"11.3" #&& pools[device()].stream_ordered
- srcOffset = (srcPos.x-1)*aligned_sizeof(T) + srcPitch*((srcPos.y-1) + srcHeight*(srcPos.z-1))
- dstOffset = (dstPos.x-1)*aligned_sizeof(T) + dstPitch*((dstPos.y-1) + dstHeight*(dstPos.z-1))
+ srcOffset = (srcPos.x - 1) * aligned_sizeof(T) + srcPitch * ((srcPos.y - 1) + srcHeight * (srcPos.z - 1))
+ dstOffset = (dstPos.x - 1) * aligned_sizeof(T) + dstPitch * ((dstPos.y - 1) + dstHeight * (dstPos.z - 1))
else
srcOffset = 0
dstOffset = 0
@@ -622,7 +623,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
params_ref = Ref(CUDA_MEMCPY3D(
# source
- srcOffset==0 ? (srcPos.x-1)*aligned_sizeof(T) : 0,
+ srcOffset == 0 ? (srcPos.x - 1) * aligned_sizeof(T) : 0,
srcOffset==0 ? srcPos.y-1 : 0,
srcOffset==0 ? srcPos.z-1 : 0,
0, # LOD
@@ -630,7 +631,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
C_NULL, # reserved
srcPitch, srcHeight,
# destination
- dstOffset==0 ? (dstPos.x-1)*aligned_sizeof(T) : 0,
+ dstOffset == 0 ? (dstPos.x - 1) * aligned_sizeof(T) : 0,
dstOffset==0 ? dstPos.y-1 : 0,
dstOffset==0 ? dstPos.z-1 : 0,
0, # LOD
@@ -638,7 +639,7 @@ function unsafe_copy3d!(dst::Union{Ptr{T},CuPtr{T},CuArrayPtr{T}}, dstTyp::Type{
C_NULL, # reserved
dstPitch, dstHeight,
# extent
- width*aligned_sizeof(T), height, depth
+ width * aligned_sizeof(T), height, depth
))
cuMemcpy3DAsync_v2(params_ref, stream)
async || synchronize(stream)
diff --git a/src/CUDA.jl b/src/CUDA.jl
index a524f4eac..a430cf439 100644
--- a/src/CUDA.jl
+++ b/src/CUDA.jl
@@ -53,9 +53,9 @@ using Printf
# Both of them are equivalent for immutable objects, but differ for mutable singtons and Symbol
# We use `aligned_sizeof` since we care about the size of a type in an array
@static if VERSION < v"1.11.0"
- @generated function aligned_sizeof(::Type{T}) where T
+ @generated function aligned_sizeof(::Type{T}) where {T}
return :($(Base.aligned_sizeof(T)))
- end
+ end
else
import Base: aligned_sizeof
end
diff --git a/src/array.jl b/src/array.jl
index dbf3949b6..e69a398d0 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -67,7 +67,7 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
function CuArray{T,N,M}(::UndefInitializer, dims::Dims{N}) where {T,N,M}
check_eltype("CuArray", T)
- maxsize = prod(dims) * aligned_sizeof(T)
+ maxsize = prod(dims) * aligned_sizeof(T)
bufsize = if Base.isbitsunion(T)
# type tag array past the data
maxsize + prod(dims)
@@ -84,7 +84,8 @@ mutable struct CuArray{T,N,M} <: AbstractGPUArray{T,N}
end
function CuArray{T,N}(data::DataRef{Managed{M}}, dims::Dims{N};
- maxsize::Int=prod(dims) * aligned_sizeof(T), offset::Int=0) where {T,N,M}
+ maxsize::Int = prod(dims) * aligned_sizeof(T), offset::Int = 0
+ ) where {T, N, M}
check_eltype("CuArray", T)
obj = new{T,N,M}(data, maxsize, offset, dims)
finalizer(unsafe_free!, obj)
@@ -235,7 +236,7 @@ function Base.unsafe_wrap(::Type{CuArray{T,N,M}},
ptr::CuPtr{T}, dims::NTuple{N,Int};
own::Bool=false, ctx::CuContext=context()) where {T,N,M}
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
- sz = prod(dims) * aligned_sizeof(T)
+ sz = prod(dims) * aligned_sizeof(T)
# create a memory object
mem = if M == UnifiedMemory
@@ -290,7 +291,7 @@ supports_hmm(dev) = driver_version() >= v"12.2" &&
function Base.unsafe_wrap(::Type{CuArray{T,N,M}}, p::Ptr{T}, dims::NTuple{N,Int};
ctx::CuContext=context()) where {T,N,M<:AbstractMemory}
isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type"))
- sz = prod(dims) * aligned_sizeof(T)
+ sz = prod(dims) * aligned_sizeof(T)
data = if M == UnifiedMemory
# HMM extends unified memory to include system memory
@@ -837,7 +838,7 @@ end
## derived arrays
function GPUArrays.derive(::Type{T}, a::CuArray, dims::Dims{N}, offset::Int) where {T,N}
- offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
+ offset = (a.offset * Base.elsize(a)) ÷ aligned_sizeof(T) + offset
CuArray{T,N}(copy(a.data), dims; a.maxsize, offset)
end
@@ -851,7 +852,7 @@ function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{
end
function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P}
return Base.unsafe_convert(CuPtr{T}, parent(V)) +
- (Base.first_index(V)-1)*aligned_sizeof(T)
+ (Base.first_index(V) - 1) * aligned_sizeof(T)
end
@@ -874,7 +875,7 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T
n == length(A) && return A
# TODO: add additional space to allow for quicker resizing
- maxsize = n * aligned_sizeof(T)
+ maxsize = n * aligned_sizeof(T)
bufsize = if isbitstype(T)
maxsize
else
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 93a232d04..b48d7073b 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -305,7 +305,7 @@ function compile(@nospecialize(job::CompilerJob))
continue
end
name = source_argnames[i]
- details *= "\n [$(i-1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
+ details *= "\n [$(i - 1)] $name::$typ uses $(Base.format_bytes(aligned_sizeof(typ)))"
end
details *= "\n"
diff --git a/src/device/array.jl b/src/device/array.jl
index 59322349c..0adb40c85 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -29,7 +29,8 @@ struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
- maxsize::Int=prod(dims)*aligned_sizeof(T)) where {T,A,N} =
+ maxsize::Int = prod(dims) * aligned_sizeof(T)
+ ) where {T, A, N} =
new(ptr, maxsize, dims, prod(dims))
end
@@ -239,12 +240,12 @@ function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
err = GPUArrays._reinterpret_exception(T, a)
err === nothing || throw(err)
- if aligned_sizeof(T) == aligned_sizeof(S) # fast case
+ if aligned_sizeof(T) == aligned_sizeof(S) # fast case
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
end
isize = size(a)
- size1 = div(isize[1]*aligned_sizeof(S), aligned_sizeof(T))
+ size1 = div(isize[1] * aligned_sizeof(S), aligned_sizeof(T))
osize = tuple(size1, Base.tail(isize)...)
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
end |
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 71da935 | Previous: c113666 | Ratio |
|---|---|---|---|
latency/precompile |
49238463615.5 ns |
48835237613 ns |
1.01 |
latency/ttfp |
7233454087 ns |
7263249839 ns |
1.00 |
latency/import |
3466573189 ns |
3465493701 ns |
1.00 |
integration/volumerhs |
9622632.5 ns |
9621812 ns |
1.00 |
integration/byval/slices=1 |
146963 ns |
147442 ns |
1.00 |
integration/byval/slices=3 |
425235 ns |
425693 ns |
1.00 |
integration/byval/reference |
145230 ns |
145206 ns |
1.00 |
integration/byval/slices=2 |
286099 ns |
286758 ns |
1.00 |
integration/cudadevrt |
103434 ns |
103612 ns |
1.00 |
kernel/indexing |
14000.5 ns |
14399 ns |
0.97 |
kernel/indexing_checked |
14794 ns |
15068 ns |
0.98 |
kernel/occupancy |
714.9477611940298 ns |
702.0419580419581 ns |
1.02 |
kernel/launch |
2264 ns |
2258.4444444444443 ns |
1.00 |
kernel/rand |
14502 ns |
16717 ns |
0.87 |
array/reverse/1d |
19484.5 ns |
20195 ns |
0.96 |
array/reverse/2d |
24764 ns |
24757 ns |
1.00 |
array/reverse/1d_inplace |
10195 ns |
11401 ns |
0.89 |
array/reverse/2d_inplace |
10844 ns |
13509 ns |
0.80 |
array/copy |
20748 ns |
21451.5 ns |
0.97 |
array/iteration/findall/int |
157574 ns |
159568 ns |
0.99 |
array/iteration/findall/bool |
138648 ns |
140978 ns |
0.98 |
array/iteration/findfirst/int |
153631.5 ns |
155188 ns |
0.99 |
array/iteration/findfirst/bool |
154431.5 ns |
155942 ns |
0.99 |
array/iteration/scalar |
71660 ns |
73335 ns |
0.98 |
array/iteration/logical |
214107.5 ns |
214260.5 ns |
1.00 |
array/iteration/findmin/1d |
41621 ns |
43021 ns |
0.97 |
array/iteration/findmin/2d |
94009 ns |
94859 ns |
0.99 |
array/reductions/reduce/1d |
44188 ns |
37155 ns |
1.19 |
array/reductions/reduce/2d |
42692.5 ns |
41718 ns |
1.02 |
array/reductions/mapreduce/1d |
39533.5 ns |
35382 ns |
1.12 |
array/reductions/mapreduce/2d |
41897.5 ns |
52285.5 ns |
0.80 |
array/broadcast |
20666 ns |
21333 ns |
0.97 |
array/copyto!/gpu_to_gpu |
13740 ns |
11882 ns |
1.16 |
array/copyto!/cpu_to_gpu |
208011 ns |
211668 ns |
0.98 |
array/copyto!/gpu_to_cpu |
244959 ns |
244976.5 ns |
1.00 |
array/accumulate/1d |
109544 ns |
110064 ns |
1.00 |
array/accumulate/2d |
80451 ns |
81465 ns |
0.99 |
array/construct |
1326.1 ns |
1277.4 ns |
1.04 |
array/random/randn/Float32 |
49877 ns |
45327.5 ns |
1.10 |
array/random/randn!/Float32 |
26410 ns |
27071 ns |
0.98 |
array/random/rand!/Int64 |
27000 ns |
27414 ns |
0.98 |
array/random/rand!/Float32 |
8538.333333333334 ns |
8820 ns |
0.97 |
array/random/rand/Int64 |
38105 ns |
38724 ns |
0.98 |
array/random/rand/Float32 |
13098 ns |
13425 ns |
0.98 |
array/permutedims/4d |
61290 ns |
61677 ns |
0.99 |
array/permutedims/2d |
55674 ns |
55974 ns |
0.99 |
array/permutedims/3d |
56742 ns |
56818 ns |
1.00 |
array/sorting/1d |
2775548 ns |
2778484 ns |
1.00 |
array/sorting/by |
3366359 ns |
3369313 ns |
1.00 |
array/sorting/2d |
1084810 ns |
1086275.5 ns |
1.00 |
cuda/synchronization/stream/auto |
1035.090909090909 ns |
1003.9230769230769 ns |
1.03 |
cuda/synchronization/stream/nonblocking |
6540.2 ns |
6655.2 ns |
0.98 |
cuda/synchronization/stream/blocking |
797.0194174757281 ns |
783.8139534883721 ns |
1.02 |
cuda/synchronization/context/auto |
1153.4 ns |
1147.9 ns |
1.00 |
cuda/synchronization/context/nonblocking |
6778.2 ns |
6850.8 ns |
0.99 |
cuda/synchronization/context/blocking |
898.94 ns |
947.1111111111111 ns |
0.95 |
This comment was automatically generated by workflow using github-action-benchmark.
|
CI failure related — it looks like this isn't optimized by codegen / const-propped away. Probably should be fixed upstream? In the meantime, we can use a shadowed copy that has the necessary const-prop annotations, or an equivalent annotation on the import. Could you also document at the import site when we should use `aligned_sizeof` (as opposed to plain `sizeof`)?
d812291 to
2364bc0
Compare
Codecov Report — Attention: Patch coverage is
Additional details and impacted files: @@ Coverage Diff @@
## master #2757 +/- ##
=======================================
Coverage 89.43% 89.43%
=======================================
Files 153 153
Lines 13185 13186 +1
=======================================
+ Hits 11792 11793 +1
Misses 1393 1393 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
2364bc0 to
71da935
Compare
Base itself uses
`aligned_sizeof` for things like `elsize(::Array)`. `aligned_sizeof` calculates the inline sizeof, in contrast to the memory sizeof. This matters for `Symbol`s and
`mutable` singleton objects; x-ref #2753