diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl index 2807968..8c3f172 100644 --- a/src/LLVM_intrinsics.jl +++ b/src/LLVM_intrinsics.jl @@ -45,9 +45,10 @@ suffix(N::Integer, ::Type{Ptr{T}}) where {T} = "v$(N)p0$(T<:IntegerTypes ? "i" : suffix(N::Integer, ::Type{T}) where {T} = "v$(N)$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))" suffix(::Type{T}) where {T} = "$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))" -llvm_name(llvmf, N, T) = string("llvm", ".", llvmf, ".", suffix(N, T)) -llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", llvmf, ".", suffix(N, T)) -llvm_name(llvmf, ::Type{T}) where {T} = string("llvm", ".", llvmf, ".", suffix(T)) +dotit(f) = replace(string(f), "_" => ".") +llvm_name(llvmf, N, T) = string("llvm", ".", dotit(llvmf), ".", suffix(N, T)) +llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", dotit(llvmf), ".", suffix(N, T)) +llvm_name(llvmf, ::Type{T}) where {T} = string("llvm", ".", dotit(llvmf), ".", suffix(T)) llvm_type(::Type{T}) where {T} = d[T] llvm_type(::Type{LVec{N, T}}) where {N,T} = "< $N x $(d[T])>" @@ -171,13 +172,23 @@ const BINARY_INTRINSICS_FLOAT = [ :round ] -for f in BINARY_INTRINSICS_FLOAT - @eval @generated function $(f)(x::T, y::T) where T<:LT{<:FloatingTypes} - ff = llvm_name($(QuoteNode(f)), T,) - return :( - $(Expr(:meta, :inline)); - ccall($ff, llvmcall, T, (T, T), x, y) - ) +const BINARY_INTRINSICS_INT = [ + :sadd_sat + :uadd_sat + :ssub_sat + :usub_sat +] + +for (fs, c) in zip([BINARY_INTRINSICS_FLOAT, BINARY_INTRINSICS_INT], + [FloatingTypes, IntegerTypes]) + for f in fs + @eval @generated function $(f)(x::T, y::T) where T<:LT{<:$c} + ff = llvm_name($(QuoteNode(f)), T,) + return :( + $(Expr(:meta, :inline)); + ccall($ff, llvmcall, T, (T, T), x, y) + ) + end end end diff --git a/src/simdvec.jl b/src/simdvec.jl index a09ca7f..6932b59 100644 --- a/src/simdvec.jl +++ b/src/simdvec.jl @@ -177,50 +177,54 @@ end #################### const BINARY_OPS = [ - (:+ , IntegerTypes , Intrinsics.add) - (:- , IntegerTypes , Intrinsics.sub) - (:* , IntegerTypes , Intrinsics.mul) - (:div , UIntTypes , Intrinsics.udiv) - (:div , IntTypes , Intrinsics.sdiv) - (:rem , UIntTypes , Intrinsics.urem) - (:rem , IntTypes , Intrinsics.srem) - - (:+ , FloatingTypes , Intrinsics.fadd) - (:- , FloatingTypes , Intrinsics.fsub) - (:* , FloatingTypes , Intrinsics.fmul) - (:^ , FloatingTypes , Intrinsics.pow) - (:/ , FloatingTypes , Intrinsics.fdiv) - (:rem , FloatingTypes , Intrinsics.frem) - (:min , FloatingTypes , Intrinsics.minnum) - (:max , FloatingTypes , Intrinsics.maxnum) - (:copysign , FloatingTypes , Intrinsics.copysign) - - (:~ , BIntegerTypes , Intrinsics.xor) - (:& , BIntegerTypes , Intrinsics.and) - (:| , BIntegerTypes , Intrinsics.or) - (:⊻ , BIntegerTypes , Intrinsics.xor) - - (:(==) , BIntegerTypes , Intrinsics.icmp_eq) - (:(!=) , BIntegerTypes , Intrinsics.icmp_ne) - (:(>) , BIntTypes , Intrinsics.icmp_sgt) - (:(>=) , BIntTypes , Intrinsics.icmp_sge) - (:(<) , BIntTypes , Intrinsics.icmp_slt) - (:(<=) , BIntTypes , Intrinsics.icmp_sle) - (:(>) , UIntTypes , Intrinsics.icmp_ugt) - (:(>=) , UIntTypes , Intrinsics.icmp_uge) - (:(<) , UIntTypes , Intrinsics.icmp_ult) - (:(<=) , UIntTypes , Intrinsics.icmp_ule) - - (:(==) , FloatingTypes , Intrinsics.fcmp_oeq) - (:(!=) , FloatingTypes , Intrinsics.fcmp_une) - (:(>) , FloatingTypes , Intrinsics.fcmp_ogt) - (:(>=) , FloatingTypes , Intrinsics.fcmp_oge) - (:(<) , FloatingTypes , Intrinsics.fcmp_olt) - (:(<=) , FloatingTypes , Intrinsics.fcmp_ole) + (:(Base.:+) , IntegerTypes , Intrinsics.add) + (:(Base.:-) , IntegerTypes , Intrinsics.sub) + (:(Base.:*) , IntegerTypes , Intrinsics.mul) + (:(Base.div) , UIntTypes , Intrinsics.udiv) + (:(Base.div) , IntTypes , Intrinsics.sdiv) + (:(Base.rem) , UIntTypes , Intrinsics.urem) + (:(Base.rem) , IntTypes , Intrinsics.srem) + + (:(add_saturate) , IntTypes , Intrinsics.sadd_sat) + (:(add_saturate) , UIntTypes , Intrinsics.uadd_sat) + (:(sub_saturate) , IntTypes , Intrinsics.ssub_sat) + (:(sub_saturate) , UIntTypes , Intrinsics.usub_sat) + + (:(Base.:+) , FloatingTypes , Intrinsics.fadd) + (:(Base.:-) , FloatingTypes , Intrinsics.fsub) + (:(Base.:*) , FloatingTypes , Intrinsics.fmul) + (:(Base.:^) , FloatingTypes , Intrinsics.pow) + (:(Base.:/) , FloatingTypes , Intrinsics.fdiv) + (:(Base.rem) , FloatingTypes , Intrinsics.frem) + (:(Base.min) , FloatingTypes , Intrinsics.minnum) + (:(Base.max) , FloatingTypes , Intrinsics.maxnum) + (:(Base.copysign) , FloatingTypes , Intrinsics.copysign) + (:(Base.:~) , BIntegerTypes , Intrinsics.xor) + (:(Base.:&) , BIntegerTypes , Intrinsics.and) + (:(Base.:|) , BIntegerTypes , Intrinsics.or) + (:(Base.:⊻) , BIntegerTypes , Intrinsics.xor) + + (:(Base.:(==)) , BIntegerTypes , Intrinsics.icmp_eq) + (:(Base.:!=) , BIntegerTypes , Intrinsics.icmp_ne) + (:(Base.:>) , BIntTypes , Intrinsics.icmp_sgt) + (:(Base.:>=) , BIntTypes , Intrinsics.icmp_sge) + (:(Base.:<) , BIntTypes , Intrinsics.icmp_slt) + (:(Base.:<=) , BIntTypes , Intrinsics.icmp_sle) + (:(Base.:>) , UIntTypes , Intrinsics.icmp_ugt) + (:(Base.:>=) , UIntTypes , Intrinsics.icmp_uge) + (:(Base.:<) , UIntTypes , Intrinsics.icmp_ult) + (:(Base.:<=) , UIntTypes , Intrinsics.icmp_ule) + + (:(Base.:(==)) , FloatingTypes , Intrinsics.fcmp_oeq) + (:(Base.:!=) , FloatingTypes , Intrinsics.fcmp_une) + (:(Base.:>) , FloatingTypes , Intrinsics.fcmp_ogt) + (:(Base.:>=) , FloatingTypes , Intrinsics.fcmp_oge) + (:(Base.:<) , FloatingTypes , Intrinsics.fcmp_olt) + (:(Base.:<=) , FloatingTypes , Intrinsics.fcmp_ole) ] for (op, constraint, llvmop) in BINARY_OPS - @eval @inline function (Base.$op)(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + @eval @inline function $op(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} Vec($(llvmop)(x.data, y.data)) end end @@ -317,22 +321,23 @@ for v in (:<<, :>>, :>>>) end end + # Vectorize binary functions for (op, constraint) in [BINARY_OPS; - (:flipsign , ScalarTypes) - (:copysign , ScalarTypes) - (:signbit , ScalarTypes) - (:min , IntegerTypes) - (:max , IntegerTypes) - (:<< , IntegerTypes) - (:>> , IntegerTypes) - (:>>> , IntegerTypes) + (:(Base.flipsign) , ScalarTypes) + (:(Base.copysign) , ScalarTypes) + (:(Base.signbit) , ScalarTypes) + (:(Base.min) , IntegerTypes) + (:(Base.max) , IntegerTypes) + (:(Base.:<<) , IntegerTypes) + (:(Base.:>>) , IntegerTypes) + (:(Base.:>>>) , IntegerTypes) ] - @eval @inline function (Base.$op)(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} - Base.$op(Vec{N, T}(x), y) + @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} + $op(Vec{N, T}(x), y) end - @eval @inline function (Base.$op)(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint} - Base.$op(x, Vec{N, T}(y)) + @eval @inline function $op(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint} + $op(x, Vec{N, T}(y)) end end diff --git a/test/runtests.jl b/test/runtests.jl index 74bb8f3..7d3c53c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -130,6 +130,16 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) @test Tuple(V8I32(v8i32)^3) === v8i32.^3 end + @testset "saturation" begin + v = Vec{4, UInt8}(UInt8.((150, 250, 125, 0))) + @test SIMD.add_saturate(v, UInt8(50)) === Vec{4, UInt8}(UInt8.((200, 255, 175, 50))) + @test SIMD.sub_saturate(v, UInt8(100)) === Vec{4, UInt8}(UInt8.((50, 150, 25, 0))) + v = Vec{4, Int8}(Int8.((100, -100, 20, -20))) + @test SIMD.add_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((127, -50, 70, 30))) + @test SIMD.sub_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((50, -128, -30, -70))) + + end + @testset "Floating point arithmetic functions" begin global const v4f64b = map(x->Float64(x+1), v4f64) @@ -632,6 +642,29 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) @test occursin(" fadd <4 x double>", ir) # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) end + + function isascii_simd(s::String) + len = sizeof(s) + nwords = len >> 7 + _0x80 = Vec{32, UInt8}(0x80) + p = pointer(s) + i = 0 + GC.@preserve s for _ in 1:nwords + comp = Vec{32, UInt8}(0x00) + for _ in 1:4 + v = SIMD.vload(LVec{32, UInt8}, p + i) + comp_i = v & _0x80 + comp += comp_i + i += 32 + end + reduce(|, comp) == 0x00 || return false + end + #' Finish up the chunks + for i = nwords*32*4+1:len + @inbounds(codeunit(s, i)) >= 0x80 && return false + end + return true + end end @testset "Vector shuffles" begin