fix incorrect function dispatch on GPU

TensorBFS · Jun 30, 2023 · 9983f50 · 9983f50
1 parent 83f48c9
commit 9983f50
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 23 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TropicalGEMM"
 uuid = "a4ad3063-64a7-4bad-8738-34ed09bc0236"
 authors = ["GiggleLiu <[email protected]> and contributors"]
-version = "0.1.9"
+version = "0.1.10"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/README.md b/README.md
@@ -25,6 +25,8 @@ julia> a = Tropical.(randn(1000, 1000))
 julia> @benchmark Octavian.matmul_serial($a, $a)
 ```
 
+**Warning:** `using TropicalGEMM` overloads `LinearAlgebra.mul!` (and therefore the behavior of `*`) for matrices of Tropical numbers.
+
 ## Benchmarks
 
 Matrix size `n x n`, CPU Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz.

diff --git a/src/TropicalGEMM.jl b/src/TropicalGEMM.jl
@@ -1,6 +1,11 @@
 module TropicalGEMM
 
 using LinearAlgebra, TropicalNumbers, VectorizationBase, LoopVectorization
+using VectorizationBase: OffsetPrecalc, StaticBool, Bit, static, NativeTypes, Index, gep_quote, VectorIndex,
+    AbstractMask, NativeTypesExceptBit, AbstractSIMDVector, IndexNoUnroll, AbstractStridedPointer, AbstractSIMD
+using VectorizationBase: contiguous_batch_size, contiguous_axis, val_stride_rank, bytestrides, offsets, memory_reference,
+    vmaximum, fmap, FloatingTypes, IntegerIndex, LazyMulAdd
+using LinearAlgebra: StridedMaybeAdjOrTransMat
 
 export Tropical, TropicalF64, TropicalF32, TropicalF16
 

diff --git a/src/fallbacks.jl b/src/fallbacks.jl
@@ -22,15 +22,12 @@ function naive_mul!(o::AbstractMatrix{T0}, a::AbstractMatrix{T1}, b::AbstractMat
     return o
 end
 
+# For types not natively supported, go to the fallback.
 # Overwrite the `mul!` in LinearAlgebra (also changes the behavior of `*` in Base)!
-for TA in [:(AbstractMatrix{T} where T<:TropicalTypes), :(Transpose{T,S} where {T<:TropicalTypes,S<:AbstractVecOrMat{T}})]
-    for TB in [:(AbstractMatrix{T} where T<:TropicalTypes), :(Transpose{T,S} where {T<:TropicalTypes,S<:AbstractVecOrMat{T}})]
-        @eval @inline function LinearAlgebra.mul!(o::AbstractMatrix{TO}, a::$TA, b::$TB, α::Number, β::Number) where TO
-            α = _convert_to_static(TO, α)
-            β = _convert_to_static(TO, β)
-            naive_mul!(o, a, b, α, β)
-        end
-    end
+# Fallback `mul!` routed to `naive_mul!`.
+# `a` and `b` are restricted to `TropicalTypes` element types so that this method does
+# not intercept `mul!` for ordinary numeric strided matrices (that would be type piracy
+# on `LinearAlgebra.mul!`). The output element type `TO` is left unconstrained, as it
+# was in the method this one replaces.
+# `α` and `β` are passed through `_convert_to_static(TO, ·)` before the call.
+# Returns whatever `naive_mul!` returns.
+function LinearAlgebra.mul!(
+    o::StridedMaybeAdjOrTransMat{TO},
+    a::StridedMaybeAdjOrTransMat{<:TropicalTypes},
+    b::StridedMaybeAdjOrTransMat{<:TropicalTypes},
+    α::Number,
+    β::Number,
+) where TO
+    α = _convert_to_static(TO, α)
+    β = _convert_to_static(TO, β)
+    return naive_mul!(o, a, b, α, β)
+end
 
 Base.:*(a::T, b::StaticInt{0}) where T<:TropicalTypes = zero(T)

diff --git a/src/gemm.jl b/src/gemm.jl
@@ -1,8 +1,3 @@
-using VectorizationBase: OffsetPrecalc, StaticBool, Bit, static, NativeTypes, Index, gep_quote, VectorIndex,
-    AbstractMask, NativeTypesExceptBit, AbstractSIMDVector, IndexNoUnroll, AbstractStridedPointer, AbstractSIMD
-using VectorizationBase: contiguous_batch_size, contiguous_axis, val_stride_rank, bytestrides, offsets, memory_reference,
-    vmaximum, fmap, FloatingTypes, IntegerIndex, LazyMulAdd
-
 LoopVectorization.check_args(::Type{T}, ::Type{T}) where T<:Tropical = true
 LoopVectorization.check_type(::Type{Tropical{T}}) where {T} = LoopVectorization.check_type(T)
 
@@ -148,15 +143,10 @@ end
 
 # Overwrite the `mul!` in LinearAlgebra (also changes the behavior of `*` in Base)!
 using Octavian
-const XTranspose{T} = Transpose{T, <:AbstractVecOrMat{T}}
-for TA in [:AbstractMatrix, :XTranspose]
-    for TB in [:AbstractMatrix, :XTranspose]
-        @eval function LinearAlgebra.mul!(o::AbstractMatrix{T}, a::$TA{T}, b::$TB{T}, α::Number, β::Number) where {T<:Tropical{<:NativeTypes}}
-            α = _convert_to_static(T, α)
-            β = _convert_to_static(T, β)
-            Octavian.matmul!(o, a, b, α, β)
-        end
-    end
+# Fast path: `o`, `a` and `b` all share one element type `T <: Tropical{<:NativeTypes}`,
+# and the product is delegated to `Octavian.matmul!`.
+# `α` and `β` are passed through `_convert_to_static(T, ·)` before the call.
+function LinearAlgebra.mul!(
+    o::StridedMaybeAdjOrTransMat{T},
+    a::StridedMaybeAdjOrTransMat{T},
+    b::StridedMaybeAdjOrTransMat{T},
+    α::Number,
+    β::Number,
+) where {T<:Tropical{<:NativeTypes}}
+    α_static = _convert_to_static(T, α)
+    β_static = _convert_to_static(T, β)
+    return Octavian.matmul!(o, a, b, α_static, β_static)
+end
 # NOTE: benchmarks show that the type instability here is optimized away by the compiler,
 # so you do not need to worry about the overhead.