8 changes: 5 additions & 3 deletions include/tvm/runtime/ndarray.h
@@ -110,9 +110,10 @@ class NDArray : public ObjectRef {
/*!
* \brief Copy the data to another device.
* \param dev The target device.
* \param mem_scope The memory scope of the target array.
* \return The array under another device.
*/
inline NDArray CopyTo(const Device& dev) const;
inline NDArray CopyTo(const Device& dev, Optional<String> mem_scope = NullOpt) const;
/*!
* \brief Load NDArray from stream
* \param stream The input data stream
@@ -398,10 +399,11 @@ inline void NDArray::CopyTo(const NDArray& other) const {
CopyFromTo(&(get_mutable()->dl_tensor), &(other.get_mutable()->dl_tensor));
}

inline NDArray NDArray::CopyTo(const Device& dev) const {
inline NDArray NDArray::CopyTo(const Device& dev, Optional<String> mem_scope) const {
ICHECK(data_ != nullptr);
const DLTensor* dptr = operator->();
NDArray ret = Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev);
NDArray ret =
Empty(ShapeTuple(dptr->shape, dptr->shape + dptr->ndim), dptr->dtype, dev, mem_scope);
this->CopyTo(ret);
return ret;
}
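The widened `CopyTo` overload defaults to `NullOpt`, so existing callers are unaffected, while new callers can ask for a scoped allocation on the target device. A minimal sketch of both uses, assuming an OpenCL device and the `global.texture` scope used elsewhere in this change (the device id and scope string are illustrative):

```cpp
#include <tvm/runtime/ndarray.h>

using namespace tvm::runtime;

// Sketch: copy an NDArray to an OpenCL device, once with the default scope and
// once with an explicit texture scope.
NDArray CopyToTextureExample(const NDArray& src) {
  Device cl_dev{kDLOpenCL, 0};
  // Old behaviour: no scope given, ends up as a flat "global" allocation.
  NDArray flat = src.CopyTo(cl_dev);
  // New overload: the scope is forwarded through Empty() to AllocDataSpace().
  NDArray texture = src.CopyTo(cl_dev, String("global.texture"));
  (void)flat;
  return texture;
}
```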
19 changes: 15 additions & 4 deletions include/tvm/runtime/vm/bytecode.h
@@ -157,6 +157,8 @@ struct Instruction {
struct /* LoadConst Operands */ {
/* \brief The index into the constant pool. */
Index const_index;
/*! \brief The index of the device on which the load will be made. */
Index device_index;
};
struct /* LoadConsti Operands */ {
/* \brief The index into the constant pool. */
@@ -195,12 +197,18 @@ struct Instruction {
RegName* free_vars;
};
struct /* AllocStorage Operands */ {
/*! \brief The size of the allocation. */
RegName allocation_size;
/*! \brief The alignment of the allocation. */
Index alignment;
/*! \brief The hint of the dtype. */
DLDataType dtype_hint;
/*! \brief The number of dimensions. */
uint32_t ndim;
union {
/*! \brief The shape of tensor. */
int64_t* shape;
/*! \brief The size of the allocation. */
RegName allocation_size;
};
/*! \brief The index of the device on which the allocation will be made. */
Index device_index;
} alloc_storage;
@@ -332,10 +340,11 @@ struct Instruction {
/*!
* \brief Construct a load constant instruction.
* \param const_index The index of the constant.
* \param device_index The index of the device to load on.
* \param dst The destination register.
* \return The load constant instruction.
*/
static Instruction LoadConst(Index const_index, RegName dst);
static Instruction LoadConst(Index const_index, Index device_index, RegName dst);
/*!
* \brief Construct a load_constanti instruction.
* \param val The integer constant value.
@@ -356,11 +365,13 @@
* \param alignment The allocation's alignment.
* \param dtype_hint The data type hint for the allocator.
* \param device_index The index of the device to allocate on.
* \param shape The shape of the allocation.
* \param dst The destination to place the storage.
* \return The alloc storage instruction.
*/
static Instruction AllocStorage(RegName size, Index alignment, DLDataType dtype_hint,
Index device_index, RegName dst);
Index device_index, const std::vector<int64_t>& shape,
RegName dst);
/*!
* \brief Get the shape of an input tensor.
* \param tensor The input tensor.
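For reference, a hedged sketch of how the two revised constructors might be invoked from VM codegen; every register number, device index, and shape value below is purely illustrative:

```cpp
#include <tvm/runtime/vm/bytecode.h>

#include <vector>

using namespace tvm::runtime::vm;

void EmitSketch() {
  // Load constant-pool entry 3 onto device index 1, into register 5.
  Instruction load_const = Instruction::LoadConst(/*const_index=*/3, /*device_index=*/1, /*dst=*/5);

  // Allocate storage whose byte size lives in register 5; the static shape is
  // now carried on the instruction so scoped (e.g. texture) allocators can use it.
  std::vector<int64_t> shape = {1, 16, 16, 4};
  DLDataType f32{kDLFloat, 32, 1};
  Instruction alloc = Instruction::AllocStorage(/*size=*/5, /*alignment=*/64, f32,
                                                /*device_index=*/1, shape, /*dst=*/6);
  (void)load_const;
  (void)alloc;
}
```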
5 changes: 3 additions & 2 deletions include/tvm/runtime/vm/executable.h
@@ -34,6 +34,7 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace tvm {
@@ -262,9 +263,9 @@ class TVM_DLL Executable : public ModuleNode {

/*!
* \brief The (compile-time, virtual) devices corresponding to each device index.
* Currently we only support at most one device per device type.
* This vector contains, for each device index, a pair of the Device and its memory_scope.
*/
std::vector<Device> virtual_devices;
std::vector<std::pair<Device, std::string>> virtual_devices;
/*!
* \brief The device index corresponding to the 'host' device. That will hold and evaluate
* shape-related data and code.
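Consumers of `virtual_devices` now receive a `(Device, memory_scope)` pair instead of a bare `Device`; an empty scope string plays the role of the default `global` scope (see the runtime change further down). A small sketch of the adjusted access pattern:

```cpp
#include <tvm/runtime/vm/executable.h>

#include <iostream>
#include <string>

// Sketch: print the compile-time device table of an executable, including scopes.
void DumpVirtualDevices(const tvm::runtime::vm::Executable& exec) {
  for (const auto& entry : exec.virtual_devices) {
    const tvm::runtime::Device& device = entry.first;
    const std::string& scope = entry.second;
    std::cout << "device_type=" << device.device_type << " device_id=" << device.device_id
              << " scope=" << (scope.empty() ? std::string("global") : scope) << "\n";
  }
}
```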
41 changes: 19 additions & 22 deletions src/relay/backend/vm/compiler.cc
@@ -352,19 +352,6 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
return 0;
}

// However, otherwise we allow at most one VirtualDevice per device type.
// TODO(mbs): This will eventually need to account for memory scopes somehow so device_copy
// instructions can do the right thing.
itr = std::find_if(context_->virtual_devices_.begin() + 1, context_->virtual_devices_.end(),
[&virtual_device](const VirtualDevice& existing_virtual_device) {
return existing_virtual_device->device_type() ==
virtual_device->device_type();
});
CHECK(itr == context_->virtual_devices_.end())
<< "The VM does not currently support using more than one device with the same device type "
"for primitives, however the program is using the distinct scopes "
<< virtual_device << " and " << *itr << " of device type " << virtual_device->device_type();

ICHECK(virtual_device != host_virtual_device_);
Index index = context_->virtual_devices_.size();
VLOG(2) << "virtual_device[" << index << "] = " << virtual_device;
@@ -384,7 +371,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
VLOG(2) << "constant[" << const_index << "] on device[" << device_index << "]";
context_->const_device_indexes.push_back(device_index);
context_->constants.push_back(const_node->data);
Emit(Instruction::LoadConst(const_index, NewRegister()));
Emit(Instruction::LoadConst(const_index, device_index, NewRegister()));
}

void VisitExpr_(const VarNode* var_node) final {
@@ -602,13 +589,21 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {
})
.Match("memory.alloc_storage",
[this](const Array<Expr>& args, const Attrs& attrs, const Array<Type>& type_arg) {
ICHECK_EQ(args.size(), 2);
ICHECK_EQ(args.size(), 3);
// Compute the size of the allocation.
this->VisitExpr(args[0]);
auto size_register = last_register_;

ICHECK(args[1].as<ConstantNode>()); // Always a literal.
NDArray alignment_arr = args[1].as<ConstantNode>()->data;
auto const_shape = AsIgnoringOnDevice<ConstantNode>(args[1]);
std::vector<int64_t> raw_shape;
if (const_shape) {
NDArray shape = const_shape->data;
// TODO(@jroesch): we need to get an RFC done to standardize shape dtype
raw_shape = ToAllocTensorShape(shape);
}

ICHECK(args[2].as<ConstantNode>()); // Always a literal.
NDArray alignment_arr = args[2].as<ConstantNode>()->data;
ICHECK_EQ(alignment_arr->dtype.code, 0U)
<< "The dtype of constant shape must be int32 or int64, but got "
<< DLDataType2String(alignment_arr->dtype);
Expand All @@ -622,7 +617,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {

Emit(Instruction::AllocStorage(size_register, alignment, dtype,
GetDeviceIndex(alloc_attrs->virtual_device),
NewRegister()));
raw_shape, NewRegister()));
})
.Match("vm.shape_of",
[this](const Array<Expr>& args, const Attrs& attrs, const Array<Type>& type_arg) {
@@ -739,7 +734,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor<void(const Expr& n)> {

/*!
* \brief Compile a match value
* Generate byte code that compute the value specificed in val
* Generate byte code that compute the value specified in val
*
* \return The register number assigned for the final value
*/
@@ -946,9 +941,10 @@ void VMCompiler::LowerImpl(IRModule mod) {
for (const auto& virtual_device : context_.virtual_devices_) {
ICHECK(!virtual_device->IsFullyUnconstrained());
ICHECK_GT(virtual_device->device_type(), 0);
// TODO(mbs): We forget the memory scope.
exec_->virtual_devices.push_back(Device{/*device_type=*/virtual_device->device_type(),
/*device_id=*/virtual_device->virtual_device_id});
exec_->virtual_devices.push_back(
std::make_pair(Device{/*device_type=*/virtual_device->device_type(),
/*device_id=*/virtual_device->virtual_device_id},
virtual_device->memory_scope));
}
exec_->host_device_index = kHostDeviceIndex;

@@ -1068,6 +1064,7 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
}

pass_seqs.push_back(transform::FuseOps());
pass_seqs.push_back(transform::AnnotateMemoryScope());

// Do layout rewrite for auto-scheduler.
transform::PassContext pass_ctx = PassContext::Current();
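The `memory.alloc_storage` lowering above pulls the (literal) shape argument out of a constant `NDArray` into a `std::vector<int64_t>` before emitting the instruction. A simplified, hypothetical version of that conversion step, assuming a rank-1 int64 constant on the host (the real `ToAllocTensorShape` helper also accepts other integer dtypes):

```cpp
#include <tvm/runtime/logging.h>
#include <tvm/runtime/ndarray.h>

#include <vector>

// Hypothetical simplified equivalent of the shape-extraction done in the compiler.
std::vector<int64_t> ShapeConstantToVector(const tvm::runtime::NDArray& shape) {
  const DLTensor* t = shape.operator->();
  ICHECK_EQ(t->ndim, 1) << "shape constant is expected to be rank-1";
  ICHECK_EQ(t->dtype.bits, 64) << "this sketch only handles int64 shapes";
  const int64_t* data = static_cast<const int64_t*>(t->data);
  return std::vector<int64_t>(data, data + t->shape[0]);
}
```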
4 changes: 3 additions & 1 deletion src/relay/backend/vm/manifest_lifetimes.cc
@@ -167,7 +167,9 @@ class AliasEliminator : public MixedModeMutator {
if (copy_props.src_virtual_device->device_type() ==
copy_props.dst_virtual_device->device_type() &&
copy_props.src_virtual_device->virtual_device_id ==
copy_props.dst_virtual_device->virtual_device_id) {
copy_props.dst_virtual_device->virtual_device_id &&
copy_props.src_virtual_device->memory_scope ==
copy_props.dst_virtual_device->memory_scope) {
Expr to_copy = Downcast<Call>(unwrapped)->args[0];
if (const VarNode* alias_of_n = to_copy.as<VarNode>()) {
alias_[var] = Downcast<Var>(VisitExpr_(alias_of_n));
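The alias-elimination guard above now also requires the memory scopes of the two virtual devices to match, so a copy between the same device type/id but different scopes (e.g. buffer vs. texture) is no longer treated as a no-op. A standalone restatement of that predicate, a sketch using the same `VirtualDevice` accessors:

```cpp
#include <tvm/target/virtual_device.h>

// Sketch: a device_copy is an alias only when type, id, and memory scope all agree.
bool IsNoOpDeviceCopy(const tvm::VirtualDevice& src, const tvm::VirtualDevice& dst) {
  return src->device_type() == dst->device_type() &&
         src->virtual_device_id == dst->virtual_device_id &&
         src->memory_scope == dst->memory_scope;
}
```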
20 changes: 14 additions & 6 deletions src/relay/op/memory/memory.cc
@@ -50,25 +50,32 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs);
// The passing value in attrs and args doesn't seem super great.
// We should consider a better solution, i.e the type relation
// being able to see the arguments as well?
Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint) {
Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device,
DataType dtype_hint) {
auto attrs = make_object<AllocStorageAttrs>();
attrs->dtype = dtype_hint;
attrs->virtual_device = std::move(virtual_device);
static const Op& op = Op::Get("memory.alloc_storage");
return Call(op, {std::move(size), std::move(alignment)}, Attrs(std::move(attrs)), {});
return Call(op, {std::move(size), std::move(shape), std::move(alignment)},
Attrs(std::move(attrs)), {});
}

TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage").set_body_typed(AllocStorage);

bool AllocStorageRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
const TypeReporter& reporter) {
ICHECK_EQ(types.size(), 3u);
ICHECK_EQ(types.size(), 4u);
auto size_type = types[0];
auto tensor_type = size_type.as<TensorTypeNode>();
ICHECK(tensor_type != nullptr);
ICHECK_EQ(tensor_type->dtype, DataType::Int(64));
ICHECK_EQ(tensor_type->shape.size(), 0);
auto align_type = types[1];

// Tensor shape
auto tt = types[1].as<TensorTypeNode>();
ICHECK(tt != nullptr) << "must be tensor type";

auto align_type = types[2];
auto align_ttype = align_type.as<TensorTypeNode>();
ICHECK(align_ttype != nullptr);
ICHECK_EQ(align_ttype->dtype, DataType::Int(64));
@@ -77,14 +84,15 @@ bool AllocStorageRel(const Array<Type>& types, int num_inputs, const Attrs& attr
ICHECK(mod.defined());
auto storage_name = mod->GetGlobalTypeVar("Storage");
auto storage = TypeCall(storage_name, {});
reporter->Assign(types[2], storage);
reporter->Assign(types[3], storage);
return true;
}

RELAY_REGISTER_OP("memory.alloc_storage")
.describe(R"code(Explicitly allocate storage to be used by tensors.)code" TVM_ADD_FILELINE)
.set_num_inputs(2)
.set_num_inputs(3)
.add_argument("size", "Tensor", "The size of the storage to allocate.")
.add_argument("shape", "Tensor", "The shape of the storage to allocate.")
.add_argument("alignment", "Tensor", "The alignment of the storage.")
.add_type_rel("AllocStorage", AllocStorageRel)
.set_attrs_type_key("relay.attrs.AllocStorageAttrs")
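A hedged sketch of constructing the extended `memory.alloc_storage` call directly in C++. The `AllocStorage` helper lives in the internal header `src/relay/op/memory/memory.h`, so the declaration is mirrored locally here; the constant shapes and dtypes follow what `AllocStorageRel` expects, but the concrete values are illustrative and the tensors are left uninitialized:

```cpp
#include <tvm/relay/expr.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/target/virtual_device.h>

namespace tvm {
namespace relay {
// Declaration mirrors the internal header src/relay/op/memory/memory.h.
Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device,
                  DataType dtype_hint);
}  // namespace relay
}  // namespace tvm

tvm::relay::Expr MakeAllocStorage(tvm::VirtualDevice vdev) {
  using tvm::runtime::NDArray;
  using namespace tvm::relay;
  DLDevice cpu{kDLCPU, 0};
  // 0-d int64 size, 1-d int64 shape, 0-d int64 alignment, as AllocStorageRel expects;
  // only the call shape matters for this sketch.
  Expr size = Constant(NDArray::Empty({}, tvm::DataType::Int(64), cpu));
  Expr shape = Constant(NDArray::Empty({4}, tvm::DataType::Int(64), cpu));
  Expr alignment = Constant(NDArray::Empty({}, tvm::DataType::Int(64), cpu));
  return AllocStorage(size, shape, alignment, vdev, tvm::DataType::Float(32));
}
```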
5 changes: 3 additions & 2 deletions src/relay/op/memory/memory.h
@@ -34,10 +34,11 @@
namespace tvm {
namespace relay {

Expr AllocStorage(Expr size, Expr alignment, VirtualDevice virtual_device, DataType dtype_hint);
Expr AllocStorage(Expr size, Expr shape, Expr alignment, VirtualDevice virtual_device,
DataType dtype_hint);
/*! \brief Returns the "memory.alloc_tensor" operator. */
const Op& MemoryAllocTensorOp();
Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype,
Expr AllocTensor(Expr storage, Expr offset, Expr shape, DataType dtype,
Array<IndexExpr> assert_shape);
Expr ToTupleType(const Type& ty, const std::vector<Expr>& exprs);
std::vector<Expr> FromTupleType(const Type& type, const Expr& expr);
9 changes: 9 additions & 0 deletions src/relay/transforms/annotate_texture_storage.cc
@@ -407,6 +407,15 @@ class StorageInfo : private transform::DeviceAwareExprVisitor {
if (pattern <= kCommReduce) {
if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
if (ttype->shape.size() == 5) {
auto node0 = ttype->shape[0].as<IntImmNode>();
auto node1 = ttype->shape[1].as<IntImmNode>();
auto node2 = ttype->shape[2].as<IntImmNode>();
auto node3 = ttype->shape[3].as<IntImmNode>();
auto node4 = ttype->shape[4].as<IntImmNode>();
// if any dimension is not a compile-time constant then textures are not supported
if (!node0 || !node1 || !node2 || !node3 || !node4) {
return false;
}
supports_texture_storage = true;
}
}
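The new guard only accepts a tensor for texture storage when it is rank-5 and every dimension is a compile-time constant. A compact, equivalent sketch of that check, assuming the same `TensorTypeNode` input:

```cpp
#include <tvm/ir/expr.h>
#include <tvm/ir/tensor_type.h>

// Sketch: true only for rank-5 tensor types whose every dimension is a static IntImm.
bool SupportsTextureLayout(const tvm::TensorTypeNode* ttype) {
  if (ttype == nullptr || ttype->shape.size() != 5) return false;
  for (const auto& dim : ttype->shape) {
    if (!dim.as<tvm::IntImmNode>()) return false;
  }
  return true;
}
```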
7 changes: 4 additions & 3 deletions src/relay/transforms/device_domains.cc
@@ -236,12 +236,13 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) {
args_and_result.emplace_back(ForVirtualDevice(device_copy_props.body->checked_type(),
device_copy_props.dst_virtual_device));
} else if (call->op == alloc_storage_op) {
ICHECK_EQ(call->args.size(), 2U);
// alloc_storage(size, alignment, virtual_device=<t>)
// alloc_storage: fn(<cpu>, <cpu>):<t>
ICHECK_EQ(call->args.size(), 3U);
// alloc_storage(size, shape, alignment, virtual_device=<t>)
// alloc_storage: fn(<cpu>, <cpu>, <cpu>):<t>
const auto* attrs = call->attrs.as<AllocStorageAttrs>();
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(host_domain_);
args_and_result.emplace_back(ForVirtualDevice(call->checked_type(), attrs->virtual_device));
} else if (call->op == alloc_tensor_op) {
ICHECK_EQ(call->args.size(), 3U);
4 changes: 2 additions & 2 deletions src/relay/transforms/memory_alloc.cc
@@ -260,7 +260,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
Expr alignment = ComputeAlignment(type->dtype);
// Run type inference later to get the correct type.
Var var("storage_" + name_hint, Type(nullptr));
Expr value = AllocStorage(size, alignment, virtual_device, type->dtype);
Expr value = AllocStorage(size, shape, alignment, virtual_device, type->dtype);
auto sto = scope->Push(var, MaybeOnDeviceFixed(value, virtual_device));

// TODO(@jroesch): There is a bug with typing based on the constant shape.
@@ -366,7 +366,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator {
// Alignment is directly captured in the instruction so don't wrap in "on_device".
auto alignment = ComputeAlignment(out_type->dtype);
Var sto_var("storage_" + std::to_string(i), Type(nullptr));
auto val = AllocStorage(size, alignment, virtual_device, out_type->dtype);
auto val = AllocStorage(size, out_shape, alignment, virtual_device, out_type->dtype);
storages.push_back(scope->Push(sto_var, MaybeOnDeviceFixed(val, virtual_device)));
}

2 changes: 1 addition & 1 deletion src/runtime/c_runtime_api.cc
@@ -152,7 +152,7 @@ static size_t GetDataAlignment(const DLDataType dtype) {

void* DeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope) {
if (!mem_scope.defined() || mem_scope.value() == "global") {
if (!mem_scope.defined() || mem_scope.value() == "" || mem_scope.value() == "global") {
// by default, we can always redirect to the flat memory allocations
DLTensor temp;
temp.data = nullptr;
2 changes: 1 addition & 1 deletion src/runtime/opencl/opencl_device_api.cc
@@ -239,7 +239,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,

void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
Optional<String> mem_scope) {
if (!mem_scope.defined() || mem_scope.value() == "global") {
if (!mem_scope.defined() || mem_scope.value().empty() || mem_scope.value() == "global") {
return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
}
ICHECK(IsTextureStorage(std::string(mem_scope.value())))
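On the runtime side, a defined-but-empty scope now behaves like no scope at all, and the OpenCL backend only takes its texture path for scopes that pass its `IsTextureStorage` check. A small, hedged sketch exercising both paths through the generic `DeviceAPI` interface; the `global.texture` string mirrors the scope used by the texture annotation pass, but is an assumption here:

```cpp
#include <tvm/runtime/device_api.h>

// Sketch: allocate once through the flat path and once through the texture path.
void* AllocDataSpaceSketch(const int64_t* shape, int ndim) {
  using namespace tvm::runtime;
  Device dev{kDLOpenCL, 0};
  DeviceAPI* api = DeviceAPI::Get(dev);
  DLDataType f32{kDLFloat, 32, 1};

  // Defined-but-empty scope: now redirected to the ordinary flat allocation,
  // just like an undefined scope or an explicit "global".
  void* flat = api->AllocDataSpace(dev, ndim, shape, f32, String(""));
  api->FreeDataSpace(dev, flat);

  // Texture scope: handled by the OpenCL image allocator (assumed scope string).
  return api->AllocDataSpace(dev, ndim, shape, f32, String("global.texture"));
}
```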