diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 0a55f2d2fc6c..e9e855cd18ee 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -179,6 +179,8 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; + Value *interleave_vectors(const std::vector<Value *> &) override; + Value *shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) override; void codegen_vector_reduce(const VectorReduce *, const Expr &) override; bool codegen_dot_product_vector_reduce(const VectorReduce *, const Expr &); bool codegen_pairwise_vector_reduce(const VectorReduce *, const Expr &); @@ -1942,12 +1944,100 @@ void CodeGen_ARM::visit(const Shuffle *op) { load->type.lanes() == stride * op->type.lanes()) { value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false); - value = shuffle_vectors(value, op->indices); + value = CodeGen_Posix::shuffle_vectors(value, op->indices); } else { CodeGen_Posix::visit(op); } } +Value *CodeGen_ARM::interleave_vectors(const std::vector<Value *> &vecs) { + if (simd_intrinsics_disabled() || target_vscale() == 0 || + vecs.size() < 2 || + !is_scalable_vector(vecs[0])) { + return CodeGen_Posix::interleave_vectors(vecs); + } + + // Lower into llvm.vector.interleave intrinsic + const std::set<int> supported_strides{2, 3, 4, 8}; + const int stride = vecs.size(); + const int src_lanes = get_vector_num_elements(vecs[0]->getType()); + + if (supported_strides.find(stride) != supported_strides.end() && + is_power_of_two(src_lanes) && + src_lanes % target_vscale() == 0 && + src_lanes / target_vscale() > 1) { + + llvm::Type *elt = get_vector_element_type(vecs[0]->getType()); + llvm::Type *ret_type = get_vector_type(elt, src_lanes * stride); + + std::string instr = concat_strings("llvm.vector.interleave", stride, mangle_llvm_type(ret_type)); + + std::vector<llvm::Type *> arg_types(stride, vecs[0]->getType()); + llvm::FunctionType *fn_type = FunctionType::get(ret_type, arg_types, 
false); + FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); + + CallInst *interleave = builder->CreateCall(fn, vecs); + return interleave; + } + + return CodeGen_Posix::interleave_vectors(vecs); +}; + +Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) { + if (simd_intrinsics_disabled() || target_vscale() == 0 || + !is_scalable_vector(a)) { + return CodeGen_Posix::shuffle_vectors(a, b, indices); + } + + internal_assert(a->getType() == b->getType()); + + llvm::Type *elt = get_vector_element_type(a->getType()); + const int src_lanes = get_vector_num_elements(a->getType()); + const int dst_lanes = indices.size(); + + // Check if deinterleaved slice + { + // Get the stride of slice + int slice_stride = 0; + const int start_index = indices[0]; + if (dst_lanes > 1) { + const int stride = indices[1] - start_index; + bool stride_equal = true; + for (int i = 2; i < dst_lanes; ++i) { + stride_equal &= (indices[i] == start_index + i * stride); + } + slice_stride = stride_equal ? 
stride : 0; + } + + // Lower slice with stride into llvm.vector.deinterleave intrinsic + const std::set<int> supported_strides{2, 3, 4, 8}; + if (supported_strides.find(slice_stride) != supported_strides.end() && + dst_lanes * slice_stride == src_lanes && + indices.front() < slice_stride && // Start position cannot be larger than stride + is_power_of_two(dst_lanes) && + dst_lanes % target_vscale() == 0 && + dst_lanes / target_vscale() > 1) { + + std::string instr = concat_strings("llvm.vector.deinterleave", slice_stride, mangle_llvm_type(a->getType())); + + // We cannot mix FixedVector and ScalableVector, so dst_type must be scalable + llvm::Type *dst_type = get_vector_type(elt, dst_lanes / target_vscale(), VectorTypeConstraint::VScale); + StructType *sret_type = StructType::get(*context, std::vector<llvm::Type *>(slice_stride, dst_type)); + std::vector<llvm::Type *> arg_types{a->getType()}; + llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); + + CallInst *deinterleave = builder->CreateCall(fn, {a}); + // extract one element out of the returned struct + Value *extracted = builder->CreateExtractValue(deinterleave, indices.front()); + + return extracted; + } + } + + return CodeGen_Posix::shuffle_vectors(a, b, indices); +} + void CodeGen_ARM::visit(const Ramp *op) { if (target_vscale() != 0 && op->type.is_int_or_uint()) { if (is_const_zero(op->base) && is_const_one(op->stride)) { diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 7764c3d61264..25c231bb6720 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1362,14 +1362,11 @@ void CodeGen_LLVM::codegen(const Stmt &s) { value = nullptr; s.accept(this); } -namespace { -bool is_power_of_two(int x) { +bool CodeGen_LLVM::is_power_of_two(int x) const { return (x & (x - 1)) == 0; } -} // namespace - Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const { if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) { return 
Float(32, t.lanes()); @@ -4139,10 +4136,22 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } else if (op->is_concat()) { value = concat_vectors(vecs); } else { + auto can_divide_lanes = [this](int lanes, int factor) -> bool { + if (lanes % factor) { + return false; + } + if (effective_vscale == 0) { + return true; + } else { + int divided = lanes / factor; + return (divided % effective_vscale) == 0 && (divided / effective_vscale) > 1; + } + }; + // If the even-numbered indices equal the odd-numbered // indices, only generate one and then do a self-interleave. for (int f : {4, 3, 2}) { - bool self_interleave = (op->indices.size() % f) == 0; + bool self_interleave = can_divide_lanes(op->indices.size(), f); for (size_t i = 0; i < op->indices.size(); i++) { self_interleave &= (op->indices[i] == op->indices[i - (i % f)]); } @@ -4158,8 +4167,8 @@ void CodeGen_LLVM::visit(const Shuffle *op) { } // Check for an interleave of slices (i.e. an in-vector transpose) - bool interleave_of_slices = op->vectors.size() == 1 && (op->indices.size() % f) == 0; int step = op->type.lanes() / f; + bool interleave_of_slices = can_divide_lanes(op->indices.size(), f) && step != 1 && op->vectors.size() == 1; for (int i = 0; i < step; i++) { for (int j = 0; j < f; j++) { interleave_of_slices &= (op->indices[i * f + j] == j * step + i); @@ -4173,13 +4182,14 @@ void CodeGen_LLVM::visit(const Shuffle *op) { slices.push_back(slice_vector(value, i * step, step)); } value = interleave_vectors(slices); + return; } } // If the indices form contiguous aligned runs, do the shuffle // on entire sub-vectors by reinterpreting them as a wider // type. 
for (int f : {8, 4, 2}) { - if (op->type.lanes() % f != 0) { + if (!can_divide_lanes(op->type.lanes(), f)) { continue; } @@ -4188,7 +4198,7 @@ } bool contiguous = true; for (const Expr &vec : op->vectors) { - contiguous &= ((vec.type().lanes() % f) == 0); + contiguous &= can_divide_lanes(vec.type().lanes(), f); } for (size_t i = 0; i < op->indices.size(); i += f) { contiguous &= (op->indices[i] % f) == 0; @@ -5414,6 +5424,10 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, return ConstantVector::getSplat(ec, value); } +bool CodeGen_LLVM::is_scalable_vector(llvm::Value *v) const { + return isa<ScalableVectorType>(v->getType()); +} + std::string CodeGen_LLVM::mangle_llvm_type(llvm::Type *type) { std::string type_string = "."; if (isa<ScalableVectorType>(type)) { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 414859a54b2f..8698a3dfdad5 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -520,6 +520,10 @@ class CodeGen_LLVM : public IRVisitor { /** Shorthand for shuffling a single vector. */ llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices); + bool is_power_of_two(int x) const; + + bool is_scalable_vector(llvm::Value *v) const; + /** Go looking for a vector version of a runtime function. Will * return the best match. Matches in the following order: *