Add LLVM patch D14260 to improve SIMD code
Arch Robison proposed the patch <http://reviews.llvm.org/D14260>, "Optimize store of "bitcast" from vector to aggregate", for LLVM. The patch applies cleanly to LLVM 3.7.1 and seems to be the last missing puzzle piece on the LLVM side to allow generating efficient SIMD instructions via `llvmcall` in Julia. For an example package, see <https://github.com/eschnett/SIMD.jl>.

Some discussion relevant to this PR is in JuliaLang#5355. @ArchRobison, please comment.

Julia stores tuples as LLVM arrays, whereas LLVM SIMD instructions require LLVM vectors. The respective conversions are unfortunately not always optimized out unless the patch above is applied, leading to a cumbersome sequence of instructions to disassemble and reassemble a SIMD vector. An example is given here <eschnett/SIMD.jl#1 (comment)>.
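
Concretely, when a tuple that was just assembled from a SIMD vector is stored, the IR contains an element-wise extractelement/insertvalue sequence of roughly the following shape (a sketch of the pattern the patch recognizes; `%U` and `%ptr` are placeholder names):

```
; %U is the SIMD vector, %ptr points to the [2 x double] tuple (placeholders)
%E0 = extractelement <2 x double> %U, i32 0
%V0 = insertvalue [2 x double] undef, double %E0, 0
%E1 = extractelement <2 x double> %U, i32 1
%V1 = insertvalue [2 x double] %V0, double %E1, 1
store [2 x double] %V1, [2 x double]* %ptr
```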

Without this patch, the loop kernel looks like (x86-64, AVX2 instructions):

```
    vunpcklpd   %xmm4, %xmm3, %xmm3 # xmm3 = xmm3[0],xmm4[0]
    vunpcklpd   %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0]
    vinsertf128 $1, %xmm3, %ymm1, %ymm1
    vmovupd 8(%rcx), %xmm2
    vinsertf128 $1, 24(%rcx), %ymm2, %ymm2
    vaddpd  %ymm2, %ymm1, %ymm1
    vpermilpd   $1, %xmm1, %xmm2 # xmm2 = xmm1[1,0]
    vextractf128    $1, %ymm1, %xmm3
    vpermilpd   $1, %xmm3, %xmm4 # xmm4 = xmm3[1,0]
Source line: 62
    vaddsd  (%rcx), %xmm0, %xmm0
```

Note that the SIMD vector is kept in register `%ymm1`, but is unnecessarily scalarized into registers `%xmm{0,1,2,3}` at the end of the kernel and re-assembled at the beginning.

With this patch, the loop kernel looks like:

```
L192:
	vaddpd	(%rdx), %ymm1, %ymm1
Source line: 62
	addq	%rsi, %rdx
	addq	%rcx, %rdi
	jne	L192
```

which is perfect.
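
At the IR level, this corresponds to the fold exercised by the new InstCombine test below: the scalarized aggregate store from the sketch above is expected to collapse into a single vector store, roughly as follows (the test only checks for the `store <2 x double>`; the pointer bitcast is an assumption about the exact output):

```
; %U and %ptr are the same placeholders as in the sketch above
%vptr = bitcast [2 x double]* %ptr to <2 x double>*
store <2 x double> %U, <2 x double>* %vptr
```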
eschnett committed Feb 7, 2016
1 parent 350e4da commit 80fe031
Showing 2 changed files with 145 additions and 0 deletions.
deps/Makefile (9 additions & 0 deletions)

```
@@ -658,6 +658,12 @@ $$(LLVM_SRC_DIR)/$1.patch-applied: $(LLVM_SRC_DIR)/configure | $$(SRCDIR)/$1.pat
echo 1 > $$@
LLVM_PATCH_LIST += $$(LLVM_SRC_DIR)/$1.patch-applied
endef
define LLVM_PATCH0
$$(LLVM_SRC_DIR)/$1.patch-applied: $(LLVM_SRC_DIR)/configure | $$(SRCDIR)/$1.patch
cd $$(LLVM_SRC_DIR) && patch -p0 < $$(SRCDIR)/$1.patch
echo 1 > $$@
LLVM_PATCH_LIST += $$(LLVM_SRC_DIR)/$1.patch-applied
endef
ifeq ($(LLVM_VER),3.3)
$(eval $(call LLVM_PATCH,llvm-3.3))
$(eval $(call LLVM_PATCH,instcombine-llvm-3.3))
@@ -671,7 +677,10 @@ else ifeq ($(LLVM_VER),3.7.1)
$(eval $(call LLVM_PATCH,llvm-3.7.1))
$(eval $(call LLVM_PATCH,llvm-3.7.1_2))
$(eval $(call LLVM_PATCH,llvm-3.7.1_3))
$(eval $(call LLVM_PATCH0,llvm-D14260))
$(LLVM_SRC_DIR)/llvm-3.7.1_2.patch-applied: $(LLVM_SRC_DIR)/llvm-3.7.1.patch-applied
$(LLVM_SRC_DIR)/llvm-3.7.1_3.patch-applied: $(LLVM_SRC_DIR)/llvm-3.7.1_2.patch-applied
$(LLVM_SRC_DIR)/llvm-D14260.patch-applied: $(LLVM_SRC_DIR)/llvm-3.7.1_3.patch-applied
endif # LLVM_VER

ifeq ($(LLVM_VER),3.7.1)
```

deps/llvm-D14260.patch (136 additions & 0 deletions)

```
@@ -0,0 +1,136 @@
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -879,6 +879,61 @@
return nullptr;
}

+/// \brief Look for extractelement/insertvalue sequence that acts like a bitcast.
+///
+/// \returns underlying value that was "cast", or nullptr otherwise.
+///
+/// For example, if we have:
+///
+/// %E0 = extractelement <2 x double> %U, i32 0
+/// %V0 = insertvalue [2 x double] undef, double %E0, 0
+/// %E1 = extractelement <2 x double> %U, i32 1
+/// %V1 = insertvalue [2 x double] %V0, double %E1, 1
+///
+/// and the layout of a <2 x double> is isomorphic to a [2 x double],
+/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
+/// Note that %U may contain non-undef values where %V1 has undef.
+static Value* likeBitCastFromVector(InstCombiner &IC, Value* V) {
+ Value *U = nullptr;
+ while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+ auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+ if (!E)
+ return nullptr;
+ auto *W = E->getVectorOperand();
+ if (!U)
+ U = W;
+ else if (U != W)
+ return nullptr;
+ auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
+ if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+ return nullptr;
+ V = IV->getAggregateOperand();
+ }
+ if (!isa<UndefValue>(V) ||!U)
+ return nullptr;
+
+ VectorType *UT = cast<VectorType>(U->getType());
+ Type *VT = V->getType();
+ // Check that types UT and VT are bitwise isomorphic.
+ const DataLayout &DL = IC.getDataLayout();
+ if (DL.getTypeSizeInBits(UT) != DL.getTypeSizeInBits(VT)) {
+ return nullptr;
+ }
+ if (ArrayType *AT = dyn_cast<ArrayType>(VT)) {
+ if (AT->getNumElements() != UT->getNumElements())
+ return nullptr;
+ } else {
+ StructType *ST = cast<StructType>(VT);
+ if (ST->getNumElements() != UT->getNumElements())
+ return nullptr;
+ for (const Type *EltT : ST->elements()) {
+ if (EltT != UT->getElementType())
+ return nullptr;
+ }
+ }
+ return U;
+}
+
/// \brief Combine stores to match the type of value being stored.
///
/// The core idea here is that the memory does not have any intrinsic type and
@@ -914,6 +969,11 @@
return true;
}

+ if (Value *U = likeBitCastFromVector(IC, V)) {
+ combineStoreToNewValue(IC, SI, U);
+ return true;
+ }
+
// FIXME: We should also canonicalize loads of vectors when their elements are
// cast to other types.
return false;
Index: test/Transforms/InstCombine/insert-val-extract-elem.ll
===================================================================
--- test/Transforms/InstCombine/insert-val-extract-elem.ll
+++ test/Transforms/InstCombine/insert-val-extract-elem.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <2 x double>
+define void @julia_2xdouble([2 x double]* sret, <2 x double>*) {
+top:
+ %x = load <2 x double>, <2 x double>* %1
+ %x0 = extractelement <2 x double> %x, i32 0
+ %i0 = insertvalue [2 x double] undef, double %x0, 0
+ %x1 = extractelement <2 x double> %x, i32 1
+ %i1 = insertvalue [2 x double] %i0, double %x1, 1
+ store [2 x double] %i1, [2 x double]* %0, align 4
+ ret void
+}
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <4 x float>
+define void @julia_4xfloat([4 x float]* sret, <4 x float>*) {
+top:
+ %x = load <4 x float>, <4 x float>* %1
+ %x0 = extractelement <4 x float> %x, i32 0
+ %i0 = insertvalue [4 x float] undef, float %x0, 0
+ %x1 = extractelement <4 x float> %x, i32 1
+ %i1 = insertvalue [4 x float] %i0, float %x1, 1
+ %x2 = extractelement <4 x float> %x, i32 2
+ %i2 = insertvalue [4 x float] %i1, float %x2, 2
+ %x3 = extractelement <4 x float> %x, i32 3
+ %i3 = insertvalue [4 x float] %i2, float %x3, 3
+ store [4 x float] %i3, [4 x float]* %0, align 4
+ ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+; CHECK-NOT: insertvalue
+; CHECK-NOT: extractelement
+; CHECK: store <4 x float>
+define void @julia_pseudovec(%pseudovec* sret, <4 x float>*) {
+top:
+ %x = load <4 x float>, <4 x float>* %1
+ %x0 = extractelement <4 x float> %x, i32 0
+ %i0 = insertvalue %pseudovec undef, float %x0, 0
+ %x1 = extractelement <4 x float> %x, i32 1
+ %i1 = insertvalue %pseudovec %i0, float %x1, 1
+ %x2 = extractelement <4 x float> %x, i32 2
+ %i2 = insertvalue %pseudovec %i1, float %x2, 2
+ %x3 = extractelement <4 x float> %x, i32 3
+ %i3 = insertvalue %pseudovec %i2, float %x3, 3
+ store %pseudovec %i3, %pseudovec* %0, align 4
+ ret void
+}
```
