From badbe6e0b9162e7613e44288fda6922b5999ae2e Mon Sep 17 00:00:00 2001
From: Kristoffer Carlsson <kcarlsson89@gmail.com>
Date: Thu, 13 Feb 2020 10:02:20 +0100
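Subject: [PATCH] rename LLVM module to Intrinsics

In addition to the rename, remove the `@generated` unary `xor` method
from the intrinsics module and drop the `~` entry from `UNARY_OPS`.
`Base.:~` on `Vec` is now defined directly in src/simdvec.jl as a
binary `Intrinsics.xor` against a constant `-1` vector (or a constant
`true` vector for `Vec{N, Bool}`).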
---
 src/LLVM_intrinsics.jl |  17 +---
 src/arrayops.jl        |  12 +--
 src/simdvec.jl         | 199 +++++++++++++++++++++--------------------
 3 files changed, 107 insertions(+), 121 deletions(-)

diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl
index 202da78..fada669 100644
--- a/src/LLVM_intrinsics.jl
+++ b/src/LLVM_intrinsics.jl
@@ -1,5 +1,5 @@
 # LLVM operations and intrinsics
-module LLVM
+module Intrinsics
 
 # TODO: fastmath flags
 
@@ -100,21 +100,6 @@ end
     )
 end
 
-@generated function xor(x::LVec{N, T}) where {N, T <: IntegerTypes}
-    ff = llvm_name(:xor, N, T)
-    shfl = join((string(d[T], " ", T == Bool ? 1 :  -1) for i in 1:N), ", ")
-    s = """
-    %res = xor <$N x $(d[T])> %0, <$shfl>
-    ret <$N x $(d[T])> %res
-    """
-    return :(
-        $(Expr(:meta, :inline));
-        Base.llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}}, x)
-    )
-end
-
-
-
 #####################
 # Binary operators  #
 #####################
diff --git a/src/arrayops.jl b/src/arrayops.jl
index b9155ec..c23594f 100644
--- a/src/arrayops.jl
+++ b/src/arrayops.jl
@@ -42,9 +42,9 @@ FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T,
 @inline function vload(::Type{Vec{N, T}}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing,
                        ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
     if mask === nothing
-        Vec(LLVM.load(LLVM.LVec{N, T}, ptr, Val(Aligned), Val(Nontemporal)))
+        Vec(Intrinsics.load(Intrinsics.LVec{N, T}, ptr, Val(Aligned), Val(Nontemporal)))
     else
-        Vec(LLVM.maskedload(ptr, mask.data, Val(Aligned), Val(Nontemporal)))
+        Vec(Intrinsics.maskedload(ptr, mask.data, Val(Aligned), Val(Nontemporal)))
     end
 end
 
@@ -63,9 +63,9 @@ end
 @inline function vstore(x::Vec{N, T}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing,
                        ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
     if mask === nothing
-        LLVM.store(x.data, ptr, Val(Aligned), Val(Nontemporal))
+        Intrinsics.store(x.data, ptr, Val(Aligned), Val(Nontemporal))
     else
-        LLVM.maskedstore(x.data, ptr, mask.data, Val(Aligned), Val(Nontemporal))
+        Intrinsics.maskedstore(x.data, ptr, mask.data, Val(Aligned), Val(Nontemporal))
     end
 end
 @inline function vstore(x::Vec{N, T}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing,
@@ -117,7 +117,7 @@ end
 @inline vgather(ptrs::Vec{N,Ptr{T}},
                  mask::Vec{N,Bool}=one(Vec{N,Bool}),
                  ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} =
-    return Vec(LLVM.maskedgather(ptrs.data, mask.data))
+    return Vec(Intrinsics.maskedgather(ptrs.data, mask.data))
 @propagate_inbounds function vgather(a::FastContiguousArray{T,1}, idx::Vec{N, Int},
                                      mask::Vec{N,Bool}=one(Vec{N,Bool}),
                                      ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned}
@@ -138,7 +138,7 @@ end
 
 @propagate_inbounds vscatter(x::Vec{N,T}, ptrs::Vec{N,Ptr{T}},
                              mask::Vec{N,Bool}, ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} =
-    LLVM.maskedscatter(x.data, ptrs.data, mask.data)
+    Intrinsics.maskedscatter(x.data, ptrs.data, mask.data)
 @propagate_inbounds function vscatter(x::Vec{N,T}, a::FastContiguousArray{T,1}, idx::Vec{N, Int},
                                       mask::Vec{N,Bool}=one(Vec{N, Bool}),
                                       ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned}
diff --git a/src/simdvec.jl b/src/simdvec.jl
index 0085627..617c55d 100644
--- a/src/simdvec.jl
+++ b/src/simdvec.jl
@@ -13,7 +13,7 @@ Base.copy(v::Vec) = v
 @inline _unsafe_convert(::Type{T}, v) where {T <: IntegerTypes} = v % T
 @inline _unsafe_convert(::Type{T}, v) where {T <: VecTypes} = convert(T, v)
 @inline constantvector(v::T1, ::Type{Vec{N, T2}}) where {N, T1, T2} =
-    Vec(LLVM.constantvector(_unsafe_convert(T2, v), LLVM.LVec{N, T2}))
+    Vec(Intrinsics.constantvector(_unsafe_convert(T2, v), Intrinsics.LVec{N, T2}))
 
 @inline Vec{N, T}(v::Vec{N, T}) where {N, T<:VecTypes} = v
 @inline Vec{N, T}(v::Vec{N, T}) where {N, T<:FloatingTypes} = v
@@ -26,30 +26,30 @@ Base.copy(v::Vec) = v
     if T1 <: Union{IntegerTypes, Ptr}
         if T2 <: Union{IntegerTypes, Ptr}
             if sizeof(T1) < sizeof(T2)
-                return Vec(LLVM.trunc(LLVM.LVec{N, T1}, v.data))
+                return Vec(Intrinsics.trunc(Intrinsics.LVec{N, T1}, v.data))
             elseif sizeof(T1) == sizeof(T2)
-                return Vec(LLVM.bitcast(LLVM.LVec{N, T1}, v.data))
+                return Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T1}, v.data))
             else
-                return Vec(LLVM.sext(LLVM.LVec{N, T1}, v.data))
+                return Vec(Intrinsics.sext(Intrinsics.LVec{N, T1}, v.data))
             end
         elseif T2 <: FloatingTypes
             if T1 <: UIntTypes
-                return Vec(LLVM.fptoui(LLVM.LVec{N, T1}, v.data))
+                return Vec(Intrinsics.fptoui(Intrinsics.LVec{N, T1}, v.data))
             elseif T1 <: IntTypes
-                return Vec(LLVM.fptosi(LLVM.LVec{N, T1}, v.data))
+                return Vec(Intrinsics.fptosi(Intrinsics.LVec{N, T1}, v.data))
             end
         end
     end
     if T1 <: FloatingTypes
         if T2 <: UIntTypes
-            return Vec(LLVM.uitofp(LLVM.LVec{N, T1}, v.data))
+            return Vec(Intrinsics.uitofp(Intrinsics.LVec{N, T1}, v.data))
         elseif T2 <: IntTypes
-            return Vec(LLVM.sitofp(LLVM.LVec{N, T1}, v.data))
+            return Vec(Intrinsics.sitofp(Intrinsics.LVec{N, T1}, v.data))
         elseif T2 <: FloatingTypes
             if sizeof(T1) < sizeof(T2)
-                return Vec(LLVM.fptrunc(LLVM.LVec{N, T1}, v.data))
+                return Vec(Intrinsics.fptrunc(Intrinsics.LVec{N, T1}, v.data))
             else
-                return Vec(LLVM.fpext(LLVM.LVec{N, T1}, v.data))
+                return Vec(Intrinsics.fpext(Intrinsics.LVec{N, T1}, v.data))
             end
         end
     end
@@ -83,12 +83,12 @@ end
 
 function Base.getindex(v::Vec, i::IntegerTypes)
     @boundscheck checkbounds(v, i)
-    return LLVM.extractelement(v.data, i-1)
+    return Intrinsics.extractelement(v.data, i-1)
 end
 
 @inline function Base.setindex(v::Vec{N,T}, x, i::IntegerTypes) where {N,T}
     @boundscheck checkbounds(v, i)
-    Vec(LLVM.insertelement(v.data, _unsafe_convert(T, x), i-1))
+    Vec(Intrinsics.insertelement(v.data, _unsafe_convert(T, x), i-1))
 end
 
 Base.zero(::Type{Vec{N,T}}) where {N, T} = Vec{N,T}(zero(T))
@@ -96,9 +96,9 @@ Base.zero(::Vec{N,T}) where {N, T} = zero(Vec{N, T})
 Base.one(::Type{Vec{N,T}}) where {N, T} = Vec{N, T}(one(T))
 Base.one(::Vec{N,T}) where {N, T} = one(Vec{N, T})
 
-Base.reinterpret(::Type{Vec{N, T}}, v::Vec) where {T, N} = Vec(LLVM.bitcast(LLVM.LVec{N, T}, v.data))
-Base.reinterpret(::Type{Vec{N, T}}, v::ScalarTypes) where {T, N} = Vec(LLVM.bitcast(LLVM.LVec{N, T}, v))
-Base.reinterpret(::Type{T}, v::Vec) where {T} = LLVM.bitcast(T, v.data)
+Base.reinterpret(::Type{Vec{N, T}}, v::Vec) where {T, N} = Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T}, v.data))
+Base.reinterpret(::Type{Vec{N, T}}, v::ScalarTypes) where {T, N} = Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T}, v))
+Base.reinterpret(::Type{T}, v::Vec) where {T} = Intrinsics.bitcast(T, v.data)
 
 
 ###################
@@ -106,28 +106,27 @@ Base.reinterpret(::Type{T}, v::Vec) where {T} = LLVM.bitcast(T, v.data)
 ###################
 
 const UNARY_OPS = [
-    (:sqrt           , FloatingTypes , LLVM.sqrt)       ,
-    (:sin            , FloatingTypes , LLVM.sin)        ,
-    (:trunc          , FloatingTypes , LLVM.trunc)      ,
-    (:cos            , FloatingTypes , LLVM.cos)        ,
-    (:exp            , FloatingTypes , LLVM.exp)        ,
-    (:exp2           , FloatingTypes , LLVM.exp2)       ,
-    (:log            , FloatingTypes , LLVM.log)        ,
-    (:log10          , FloatingTypes , LLVM.log10)      ,
-    (:log2           , FloatingTypes , LLVM.log2)       ,
-    (:abs            , FloatingTypes , LLVM.fabs)       ,
-    (:floor          , FloatingTypes , LLVM.floor)      ,
-    (:ceil           , FloatingTypes , LLVM.ceil)       ,
-    # (:rint         , FloatingTypes , LLVM)            ,
-    # (:nearbyint    , FloatingTypes , LLVM)            ,
-    (:round          , FloatingTypes , LLVM.round)      ,
-
-    # (:bitreverse   , IntegerTypes  , LLVM.bitreverse) ,
-    (:bswap          , IntegerTypes  , LLVM.bswap)      ,
-    (:count_ones     , IntegerTypes  , LLVM.ctpop)      ,
-    (:leading_zeros  , IntegerTypes  , LLVM.ctlz)       ,
-    (:trailing_zeros , IntegerTypes  , LLVM.cttz)       ,
-    (:~              , IntegerTypes  , LLVM.xor)
+    (:sqrt           , FloatingTypes , Intrinsics.sqrt)       ,
+    (:sin            , FloatingTypes , Intrinsics.sin)        ,
+    (:trunc          , FloatingTypes , Intrinsics.trunc)      ,
+    (:cos            , FloatingTypes , Intrinsics.cos)        ,
+    (:exp            , FloatingTypes , Intrinsics.exp)        ,
+    (:exp2           , FloatingTypes , Intrinsics.exp2)       ,
+    (:log            , FloatingTypes , Intrinsics.log)        ,
+    (:log10          , FloatingTypes , Intrinsics.log10)      ,
+    (:log2           , FloatingTypes , Intrinsics.log2)       ,
+    (:abs            , FloatingTypes , Intrinsics.fabs)       ,
+    (:floor          , FloatingTypes , Intrinsics.floor)      ,
+    (:ceil           , FloatingTypes , Intrinsics.ceil)       ,
+    # (:rint         , FloatingTypes , Intrinsics)            ,
+    # (:nearbyint    , FloatingTypes , Intrinsics)            ,
+    (:round          , FloatingTypes , Intrinsics.round)      ,
+
+    # (:bitreverse   , IntegerTypes  , Intrinsics.bitreverse) ,
+    (:bswap          , IntegerTypes  , Intrinsics.bswap)      ,
+    (:count_ones     , IntegerTypes  , Intrinsics.ctpop)      ,
+    (:leading_zeros  , IntegerTypes  , Intrinsics.ctlz)       ,
+    (:trailing_zeros , IntegerTypes  , Intrinsics.cttz)       ,
 ]
 
 for (op, constraint, llvmop) in UNARY_OPS
@@ -137,7 +136,9 @@ end
 
 Base.:+(v::Vec) = v
 Base.:-(v::Vec{<:Any, <:IntegerTypes}) = zero(v) - v
-Base.:-(v::Vec{<:Any, <:FloatingTypes}) = Vec(LLVM.fneg(v.data))
+Base.:-(v::Vec{<:Any, <:FloatingTypes}) = Vec(Intrinsics.fneg(v.data))
+Base.:~(v::Vec{N, T}) where {N, T<:IntegerTypes} = Vec(Intrinsics.xor(v.data, Vec{N, T}(-1).data))
+Base.:~(v::Vec{N, Bool}) where {N} = Vec(Intrinsics.xor(v.data, Vec{N, Bool}(true).data))
 Base.abs(v::Vec{N, T}) where {N, T} = Vec(vifelse(v < zero(T), -v, v))
 Base.:!(v1::Vec{N,Bool}) where {N} = ~v1
 Base.inv(v::Vec{N, T}) where {N, T<:FloatingTypes} = one(T) / v
@@ -172,46 +173,46 @@ end
 ####################
 
 const BINARY_OPS = [
-    (:+        , IntegerTypes  , LLVM.add)
-    (:-        , IntegerTypes  , LLVM.sub)
-    (:*        , IntegerTypes  , LLVM.mul)
-    (:div      , UIntTypes     , LLVM.udiv)
-    (:div      , IntTypes      , LLVM.sdiv)
-    (:rem      , UIntTypes     , LLVM.urem)
-    (:rem      , IntTypes      , LLVM.srem)
-
-    (:+        , FloatingTypes , LLVM.fadd)
-    (:-        , FloatingTypes , LLVM.fsub)
-    (:*        , FloatingTypes , LLVM.fmul)
-    (:^        , FloatingTypes , LLVM.pow)
-    (:/        , FloatingTypes , LLVM.fdiv)
-    (:rem      , FloatingTypes , LLVM.frem)
-    (:min      , FloatingTypes , LLVM.minnum)
-    (:max      , FloatingTypes , LLVM.maxnum)
-    (:copysign , FloatingTypes , LLVM.copysign)
-
-    (:~        , IntegerTypes  , LLVM.xor)
-    (:&        , IntegerTypes  , LLVM.and)
-    (:|        , IntegerTypes  , LLVM.or)
-    (:⊻        , IntegerTypes  , LLVM.xor)
-
-    (:(==)     , IntegerTypes  , LLVM.icmp_eq)
-    (:(!=)     , IntegerTypes  , LLVM.icmp_ne)
-    (:(>)      , IntTypes      , LLVM.icmp_sgt)
-    (:(>=)     , IntTypes      , LLVM.icmp_sge)
-    (:(<)      , IntTypes      , LLVM.icmp_slt)
-    (:(<=)     , IntTypes      , LLVM.icmp_sle)
-    (:(>)      , UIntTypes     , LLVM.icmp_ugt)
-    (:(>=)     , UIntTypes     , LLVM.icmp_uge)
-    (:(<)      , UIntTypes     , LLVM.icmp_ult)
-    (:(<=)     , UIntTypes     , LLVM.icmp_ule)
-
-    (:(==)     , FloatingTypes , LLVM.fcmp_oeq)
-    (:(!=)     , FloatingTypes , LLVM.fcmp_une)
-    (:(>)      , FloatingTypes , LLVM.fcmp_ogt)
-    (:(>=)     , FloatingTypes , LLVM.fcmp_oge)
-    (:(<)      , FloatingTypes , LLVM.fcmp_olt)
-    (:(<=)     , FloatingTypes , LLVM.fcmp_ole)
+    (:+        , IntegerTypes  , Intrinsics.add)
+    (:-        , IntegerTypes  , Intrinsics.sub)
+    (:*        , IntegerTypes  , Intrinsics.mul)
+    (:div      , UIntTypes     , Intrinsics.udiv)
+    (:div      , IntTypes      , Intrinsics.sdiv)
+    (:rem      , UIntTypes     , Intrinsics.urem)
+    (:rem      , IntTypes      , Intrinsics.srem)
+
+    (:+        , FloatingTypes , Intrinsics.fadd)
+    (:-        , FloatingTypes , Intrinsics.fsub)
+    (:*        , FloatingTypes , Intrinsics.fmul)
+    (:^        , FloatingTypes , Intrinsics.pow)
+    (:/        , FloatingTypes , Intrinsics.fdiv)
+    (:rem      , FloatingTypes , Intrinsics.frem)
+    (:min      , FloatingTypes , Intrinsics.minnum)
+    (:max      , FloatingTypes , Intrinsics.maxnum)
+    (:copysign , FloatingTypes , Intrinsics.copysign)
+
+    (:~        , IntegerTypes  , Intrinsics.xor)
+    (:&        , IntegerTypes  , Intrinsics.and)
+    (:|        , IntegerTypes  , Intrinsics.or)
+    (:⊻        , IntegerTypes  , Intrinsics.xor)
+
+    (:(==)     , IntegerTypes  , Intrinsics.icmp_eq)
+    (:(!=)     , IntegerTypes  , Intrinsics.icmp_ne)
+    (:(>)      , IntTypes      , Intrinsics.icmp_sgt)
+    (:(>=)     , IntTypes      , Intrinsics.icmp_sge)
+    (:(<)      , IntTypes      , Intrinsics.icmp_slt)
+    (:(<=)     , IntTypes      , Intrinsics.icmp_sle)
+    (:(>)      , UIntTypes     , Intrinsics.icmp_ugt)
+    (:(>=)     , UIntTypes     , Intrinsics.icmp_uge)
+    (:(<)      , UIntTypes     , Intrinsics.icmp_ult)
+    (:(<=)     , UIntTypes     , Intrinsics.icmp_ule)
+
+    (:(==)     , FloatingTypes , Intrinsics.fcmp_oeq)
+    (:(!=)     , FloatingTypes , Intrinsics.fcmp_une)
+    (:(>)      , FloatingTypes , Intrinsics.fcmp_ogt)
+    (:(>=)     , FloatingTypes , Intrinsics.fcmp_oge)
+    (:(<)      , FloatingTypes , Intrinsics.fcmp_olt)
+    (:(<=)     , FloatingTypes , Intrinsics.fcmp_ole)
 ]
 
 for (op, constraint, llvmop) in BINARY_OPS
@@ -228,7 +229,7 @@ end
 
 # Pow
 @inline Base.:^(x::Vec{N,T}, y::IntegerTypes) where {N,T<:FloatingTypes} =
-    Vec(LLVM.powi(x.data, y))
+    Vec(Intrinsics.powi(x.data, y))
 # Do what Base does for HWNumber:
 @inline Base.literal_pow(::typeof(^), x::Vec, ::Val{0}) = one(typeof(x))
 @inline Base.literal_pow(::typeof(^), x::Vec, ::Val{1}) = x
@@ -266,19 +267,19 @@ end
 @inline function shl_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes}
     vifelse(y > sizeof(T1) * 8,
         zero(Vec{N, T1}),
-        Vec(LLVM.shl(x.data, convert(Vec{N,T1}, y).data)))
+        Vec(Intrinsics.shl(x.data, convert(Vec{N,T1}, y).data)))
 end
 
 @inline function lshr_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes}
     vifelse(y > sizeof(T1) * 8,
         zero(Vec{N, T1}),
-        Vec(LLVM.lshr(x.data, convert(Vec{N,T1}, y).data)))
+        Vec(Intrinsics.lshr(x.data, convert(Vec{N,T1}, y).data)))
 end
 
 @inline function ashr_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes}
     vifelse(y > sizeof(T1) * 8,
-            Vec(LLVM.ashr(x.data, Vec{N,T1}(sizeof(T1)*8-1).data)),
-            Vec(LLVM.ashr(x.data, Vec{N,T1}(y).data)))
+            Vec(Intrinsics.ashr(x.data, Vec{N,T1}(sizeof(T1)*8-1).data)),
+            Vec(Intrinsics.ashr(x.data, Vec{N,T1}(y).data)))
 end
 
 # See https://github.com/JuliaLang/julia/blob/a211abcdfacc05cb93c15774a59ce8961c16dac4/base/int.jl#L422-L435
@@ -341,12 +342,12 @@ end
 
 @inline vifelse(v::Bool, v1::T, v2::T) where {T} = ifelse(v, v1, v2)
 @inline vifelse(v::Vec{N, Bool}, v1::Vec{N, T}, v2::Vec{N, T}) where {N, T} =
-    Vec(LLVM.select(v.data, v1.data, v2.data))
+    Vec(Intrinsics.select(v.data, v1.data, v2.data))
 @inline vifelse(v::Vec{N, Bool}, v1::T2, v2::Vec{N, T}) where {N, T, T2 <:ScalarTypes} = vifelse(v, Vec{N, T}(v1), v2)
 @inline vifelse(v::Vec{N, Bool}, v1::Vec{N, T}, v2::T2) where {N, T, T2 <:ScalarTypes} = vifelse(v, v1, Vec{N, T}(v2))
 
 # fma, muladd and vectorization of these
-for (op, llvmop) in [(:fma, LLVM.fma), (:muladd, LLVM.fmuladd)]
+for (op, llvmop) in [(:fma, Intrinsics.fma), (:muladd, Intrinsics.fmuladd)]
     @eval begin
         @inline Base.$op(a::Vec{N, T}, b::Vec{N, T}, c::Vec{N, T}) where {N,T<:FloatingTypes} =
             Vec($llvmop(a.data, b.data, c.data))
@@ -370,18 +371,18 @@ end
 # Reductions #
 ##############
 const HORZ_REDUCTION_OPS = [
-    (&   , IntegerTypes  , LLVM.reduce_and)
-    (|   , IntegerTypes  , LLVM.reduce_or)
-    (max , IntTypes      , LLVM.reduce_smax)
-    (max , UIntTypes     , LLVM.reduce_umax)
-    (max , FloatingTypes , LLVM.reduce_fmax)
-    (min , IntTypes      , LLVM.reduce_smin)
-    (min , UIntTypes     , LLVM.reduce_umin)
-    (min , FloatingTypes , LLVM.reduce_fmin)
-    (+   , IntegerTypes  , LLVM.reduce_add)
-    (*   , IntegerTypes  , LLVM.reduce_mul)
-    (+   , FloatingTypes , LLVM.reduce_fadd)
-    (*   , FloatingTypes , LLVM.reduce_fmul)
+    (&   , IntegerTypes  , Intrinsics.reduce_and)
+    (|   , IntegerTypes  , Intrinsics.reduce_or)
+    (max , IntTypes      , Intrinsics.reduce_smax)
+    (max , UIntTypes     , Intrinsics.reduce_umax)
+    (max , FloatingTypes , Intrinsics.reduce_fmax)
+    (min , IntTypes      , Intrinsics.reduce_smin)
+    (min , UIntTypes     , Intrinsics.reduce_umin)
+    (min , FloatingTypes , Intrinsics.reduce_fmin)
+    (+   , IntegerTypes  , Intrinsics.reduce_add)
+    (*   , IntegerTypes  , Intrinsics.reduce_mul)
+    (+   , FloatingTypes , Intrinsics.reduce_fadd)
+    (*   , FloatingTypes , Intrinsics.reduce_fmul)
 ]
 
 for (op, constraint, llvmop) in HORZ_REDUCTION_OPS
@@ -402,8 +403,8 @@ Base.reduce(F::Any, v::Vec) = error("reduction not defined for SIMD.Vec on $F")
 ############
 
 @inline function shufflevector(x::Vec{N, T}, ::Val{I}) where {N, T, I}
-    Vec(LLVM.shufflevector(x.data, Val(I)))
+    Vec(Intrinsics.shufflevector(x.data, Val(I)))
 end
 @inline function shufflevector(x::Vec{N, T}, y::Vec{N, T}, ::Val{I}) where {N, T, I}
-    Vec(LLVM.shufflevector(x.data, y.data, Val(I)))
+    Vec(Intrinsics.shufflevector(x.data, y.data, Val(I)))
 end