pytorch · alanwaketan · Dec 13, 2022 · Dec 9, 2022 · Dec 9, 2022 · Dec 9, 2022
diff --git a/torch_xla/csrc/xla_graph_executor.cpp b/torch_xla/csrc/xla_graph_executor.cpp
@@ -68,150 +68,74 @@ bool ShouldSyncIrValue(const torch::lazy::Value& ir_value) {
 
 }  // namespace
 
-// The DeviceContextArena holds per device live information and statistics,
-// among which the XLA tensors which are currently alive in the system. This is
-// used to create XLA computation "barriers" in order to flush pending
-// operations and ensure the same XLA computations are created during the
-// training loops.
-class DeviceContextArena {
-  struct DeviceContext {
-    std::mutex lock;
-    std::map<int64_t, std::weak_ptr<XLATensor::Data>> tensors_data;
-    uint64_t seed = 101;
-    uint64_t running_seed = 101;
-    torch::lazy::Value seed_ir_value;
-  };
-
- public:
-  static DeviceContextArena* Get() {
-    static DeviceContextArena* arena = new DeviceContextArena();
-    return arena;
-  }
-
-  void RegisterTensor(std::shared_ptr<XLATensor::Data> data) {
-    DeviceContext* devctx = GetDeviceContext(data->device);
-    std::lock_guard<std::mutex> lock(devctx->lock);
-    devctx->tensors_data.emplace(data->unique_id, data);
-    TORCH_LAZY_COUNTER("CreateXlaTensor", 1);
-  }
+auto XLAGraphExecutor::DeviceContextArena::Get() -> DeviceContextArena* {
+  static DeviceContextArena* arena = new DeviceContextArena();
+  return arena;
+}
 
-  void UnregisterTensor(XLATensor::Data* data) {
-    DeviceContext* devctx = GetDeviceContext(data->device);
+std::vector<XLATensorPtr> XLAGraphExecutor::DeviceContextArena::GetLiveTensors(
+    const torch::lazy::BackendDevice* device) {
+  std::vector<XLATensorPtr> tensors;
+  auto fn = [&](DeviceContext* devctx) {
     std::lock_guard<std::mutex> lock(devctx->lock);
-    devctx->tensors_data.erase(data->unique_id);
-    TORCH_LAZY_COUNTER("DestroyXlaTensor", 1);
-  }
-
-  std::vector<XLATensorPtr> GetLiveTensors(
-      const torch::lazy::BackendDevice* device) {
-    std::vector<XLATensorPtr> tensors;
-    auto fn = [&](DeviceContext* devctx) {
-      std::lock_guard<std::mutex> lock(devctx->lock);
-      for (auto& uid_wptr : devctx->tensors_data) {
-        std::shared_ptr<XLATensor::Data> data = uid_wptr.second.lock();
-        if (data != nullptr) {
-          tensors.push_back(
-              c10::make_intrusive<XLATensor>(XLATensor(std::move(data))));
-        }
+    for (auto& uid_wptr : devctx->tensors_data) {
+      auto data =
+          std::dynamic_pointer_cast<XLATensor::Data>(uid_wptr.second.lock());
+      if (data != nullptr) {
+        tensors.push_back(
+            c10::make_intrusive<XLATensor>(XLATensor(std::move(data))));
       }
-    };
-    ForAllDeviceContexts(fn, device);
-    return tensors;
-  }
-
-  torch::lazy::Value GetRngSeed(const torch::lazy::BackendDevice& device) {
-    static const at::ScalarType kSeedType = at::ScalarType::Long;
-    static const uint64_t kSeedMul = 214013;
-    static const uint64_t kSeedAdd = 2531011;
-    DeviceContext* devctx = GetDeviceContext(device);
-    std::lock_guard<std::mutex> lock(devctx->lock);
-    if (!devctx->seed_ir_value) {
-      devctx->seed_ir_value =
-          IrValueFromScalar(MakeIntScalar(devctx->seed), kSeedType, device);
     }
-    // Keep the running seed as scalar as well, so we can return it directly
-    // without executing graphs.
-    devctx->running_seed = kSeedAdd + kSeedMul * devctx->running_seed;
-    // Compose new seeds from the root seed, to avoid creating too many XLA
-    // computation parameters which might overflow the TPU capacity.
-    torch::lazy::Value k = ScalarOp(MakeIntScalar(kSeedMul),
-                                    MakeXlaPrimitiveType(kSeedType, &device));
-    torch::lazy::Value b = ScalarOp(MakeIntScalar(kSeedAdd),
-                                    MakeXlaPrimitiveType(kSeedType, &device));
-    devctx->seed_ir_value = b + k * devctx->seed_ir_value;
-    return devctx->seed_ir_value;
-  }
-
-  torch::lazy::BackendDataPtr GetBaseSeedData(
-      const torch::lazy::BackendDevice& device) {
-    static const at::ScalarType kSeedType = at::ScalarType::Long;
-    DeviceContext* devctx = GetDeviceContext(device);
-    std::lock_guard<std::mutex> lock(devctx->lock);
-    at::Tensor tensor = at::scalar_tensor(MakeIntScalar(devctx->seed),
-                                          at::TensorOptions(kSeedType));
-    torch::lazy::BackendDataPtr device_data = TensorToXlaData(tensor, device);
-    devctx->seed_ir_value = torch::lazy::MakeNode<DeviceData>(device_data);
-    devctx->running_seed = devctx->seed;
-    return torch_xla::DeviceData::Cast(devctx->seed_ir_value.node.get())
-        ->data();
-  }
-
-  uint64_t GetRunningSeed(const torch::lazy::BackendDevice& device) {
-    DeviceContext* devctx = GetDeviceContext(device);
-    std::lock_guard<std::mutex> lock(devctx->lock);
-    return devctx->running_seed;
-  }
-
-  void SetRngSeed(const torch::lazy::BackendDevice& device, uint64_t seed) {
-    DeviceContext* devctx = GetDeviceContext(device);
-    std::lock_guard<std::mutex> lock(devctx->lock);
-    devctx->seed = seed;
-    devctx->running_seed = devctx->seed;
-    devctx->seed_ir_value = torch::lazy::Value();
-  }
-
-  void MarkStep(const torch::lazy::BackendDevice& device) {
-    DeviceContext* devctx = GetDeviceContext(device);
-    std::lock_guard<std::mutex> lock(devctx->lock);
-    devctx->seed = 1012031 + devctx->seed * 7012063;
-    devctx->running_seed = devctx->seed;
-    devctx->seed_ir_value = torch::lazy::Value();
-  }
-
- private:
-  std::vector<DeviceContext*> GetAllDeviceContexts() {
-    std::vector<DeviceContext*> all_device_contexts;
-    std::lock_guard<std::mutex> lock(lock_);
-    all_device_contexts.reserve(device_contexts_.size());
-    for (auto& device_contexts : device_contexts_) {
-      all_device_contexts.push_back(device_contexts.second);
-    }
-    return all_device_contexts;
-  }
+  };
+  ForAllDeviceContexts(fn, device);
+  return tensors;
+}
 
-  void ForAllDeviceContexts(const std::function<void(DeviceContext*)>& fn,
-                            const torch::lazy::BackendDevice* device) {
-    if (device == nullptr) {
-      for (auto devctx : GetAllDeviceContexts()) {
-        fn(devctx);
-      }
-    } else {
-      fn(GetDeviceContext(*device));
-    }
+torch::lazy::Value XLAGraphExecutor::DeviceContextArena::GetRngSeed(
+    const torch::lazy::BackendDevice& device) {
+  static const at::ScalarType kSeedType = at::ScalarType::Long;
+  static const uint64_t kSeedMul = 214013;
+  static const uint64_t kSeedAdd = 2531011;
+  DeviceContext* devctx = GetDeviceContext(device);
+  std::lock_guard<std::mutex> lock(devctx->lock);
+  if (!devctx->seed_ir_value) {
+    devctx->seed_ir_value =
+        IrValueFromScalar(MakeIntScalar(devctx->seed), kSeedType, device);
   }
+  // Keep the running seed as scalar as well, so we can return it directly
+  // without executing graphs.
+  devctx->running_seed = kSeedAdd + kSeedMul * devctx->running_seed;
+  // Compose new seeds from the root seed, to avoid creating too many XLA
+  // computation parameters which might overflow the TPU capacity.
+  torch::lazy::Value k = ScalarOp(MakeIntScalar(kSeedMul),
+                                  MakeXlaPrimitiveType(kSeedType, &device));
+  torch::lazy::Value b = ScalarOp(MakeIntScalar(kSeedAdd),
+                                  MakeXlaPrimitiveType(kSeedType, &device));
+  devctx->seed_ir_value = b + k * devctx->seed_ir_value;
+  return devctx->seed_ir_value;
+}
 
-  DeviceContext* GetDeviceContext(const torch::lazy::BackendDevice& device) {
-    std::lock_guard<std::mutex> lock(lock_);
-    auto it = device_contexts_.find(device);
-    if (it == device_contexts_.end()) {
-      it = device_contexts_.emplace(device, new DeviceContext()).first;
-    }
-    return it->second;
-  }
+torch::lazy::BackendDataPtr
+XLAGraphExecutor::DeviceContextArena::GetBaseSeedData(
+    const torch::lazy::BackendDevice& device) {
+  static const at::ScalarType kSeedType = at::ScalarType::Long;
+  DeviceContext* devctx = GetDeviceContext(device);
+  std::lock_guard<std::mutex> lock(devctx->lock);
+  at::Tensor tensor = at::scalar_tensor(MakeIntScalar(devctx->seed),
+                                        at::TensorOptions(kSeedType));
+  torch::lazy::BackendDataPtr device_data = TensorToXlaData(tensor, device);
+  devctx->seed_ir_value = torch::lazy::MakeNode<DeviceData>(device_data);
+  devctx->running_seed = devctx->seed;
+  return torch_xla::DeviceData::Cast(devctx->seed_ir_value.node.get())->data();
+}
 
-  std::mutex lock_;
-  std::map<torch::lazy::BackendDevice, DeviceContext*> device_contexts_;
-};
+torch::lazy::Value XLAGraphExecutor::DeviceContextArena::IrValueFromScalar(
+    const at::Scalar& value, at::ScalarType scalar_type,
+    const torch::lazy::BackendDevice& device) {
+  at::Tensor tensor = at::scalar_tensor(value, at::TensorOptions(scalar_type));
+  torch::lazy::BackendDataPtr device_data = TensorToXlaData(tensor, device);
+  return torch::lazy::MakeNode<DeviceData>(std::move(device_data));
+}
 
 XLAGraphExecutor::Async::Async(
     SyncTensorCollection* coll,
@@ -253,12 +177,15 @@ XLAGraphExecutor* XLAGraphExecutor::Get() {
   return &arena;
 }
 
-void XLAGraphExecutor::RegisterTensor(std::shared_ptr<XLATensor::Data> data) {
+void XLAGraphExecutor::RegisterTensor(
+    std::shared_ptr<torch::lazy::LazyTensor::Data> data) {
   DeviceContextArena::Get()->RegisterTensor(data);
+  TORCH_LAZY_COUNTER("CreateXlaTensor", 1);
 }
 
-void XLAGraphExecutor::UnregisterTensor(XLATensor::Data* data) {
+void XLAGraphExecutor::UnregisterTensor(torch::lazy::LazyTensor::Data* data) {
   DeviceContextArena::Get()->UnregisterTensor(data);
+  TORCH_LAZY_COUNTER("DestroyXlaTensor", 1);
 }
 
 void XLAGraphExecutor::ApplyEagerSync(std::vector<XLATensorPtr>& tensors) {

diff --git a/torch_xla/csrc/xla_graph_executor.h b/torch_xla/csrc/xla_graph_executor.h
@@ -32,8 +32,9 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
  public:
   static XLAGraphExecutor* Get();
 
-  virtual void RegisterTensor(std::shared_ptr<XLATensor::Data> data);
-  virtual void UnregisterTensor(XLATensor::Data* data);
+  void RegisterTensor(
+      std::shared_ptr<torch::lazy::LazyTensor::Data> data) final;
+  void UnregisterTensor(torch::lazy::LazyTensor::Data* data) final;
 
   // This method just syncs the tensors passed as argument. This method is
   // called at two places:
@@ -79,9 +80,10 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
       c10::optional<at::ScalarType> logical_element_type,
       const torch::lazy::BackendDevice& device);
 
-  torch::lazy::Value GetRngSeed(const torch::lazy::BackendDevice& device);
-  void SetRngSeed(const torch::lazy::BackendDevice& device, uint64_t seed);
-  uint64_t GetRunningSeed(const torch::lazy::BackendDevice& device);
+  torch::lazy::Value GetRngSeed(const torch::lazy::BackendDevice& device) final;
+  void SetRngSeed(const torch::lazy::BackendDevice& device,
+                  uint64_t seed) final;
+  uint64_t GetRunningSeed(const torch::lazy::BackendDevice& device) final;
   torch::lazy::BackendDataPtr GetBaseSeedData(
       const torch::lazy::BackendDevice& device);
 
@@ -93,6 +95,7 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
   // for the given device. If device is nullptr, the live tensors for all
   // devices will be returned. Returned tensors are sorted by device as primary
   // key, and by unique ID as secondary key.
+  // Unlike the base class, here we return XLATensorPtrs.
   std::vector<XLATensorPtr> GetLiveTensors(
       const torch::lazy::BackendDevice* device);
 
@@ -113,7 +116,7 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
 
   // Marks an execution step, which allows the tensor framework to understand
   // the computation boundaries.
-  void MarkStep(const torch::lazy::BackendDevice& device);
+  void MarkStep(const torch::lazy::BackendDevice& device) final;
 
   // Waits for all the outstanding operations on all the supplied devices.
   // If devices is empty, the wait will happen for all local devices.
@@ -175,6 +178,29 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
     std::vector<torch::lazy::BackendDataPtr> tensors_data;
   };
 
+  class DeviceContextArena
+      : public torch::lazy::LazyGraphExecutor::DeviceContextArena {
+   public:
+    static DeviceContextArena* Get();
+
+    // This method returns XLATensorPtrs instead of LazyTensorPtrs.
+    std::vector<XLATensorPtr> GetLiveTensors(
+        const torch::lazy::BackendDevice* device);
+
+    // We override this to use our own + and * for torch::lazy::Value.
+    torch::lazy::Value GetRngSeed(
+        const torch::lazy::BackendDevice& device) final;
+
+    torch::lazy::BackendDataPtr GetBaseSeedData(
+        const torch::lazy::BackendDevice& device);
+
+   private:
+    // We override this to use TensorToXlaData().
+    torch::lazy::Value IrValueFromScalar(
+        const at::Scalar& value, at::ScalarType scalar_type,
+        const torch::lazy::BackendDevice& device) final;
+  };
+
   XLAGraphExecutor() = default;
 
   SyncTensorCollection CollectSyncTensors(