
Commit 5fc930b

Merge branch 'dmlc:master' into add-igbh-to-rgcn
2 parents 6bf10cb + db574f5 commit 5fc930b

File tree

6 files changed: +50 −8 lines changed

examples/multigpu/graphbolt/node_classification.py

+4 −1

@@ -135,7 +135,10 @@ def create_dataloader(
     if args.storage_device != "cpu":
         datapipe = datapipe.copy_to(device)
     datapipe = datapipe.sample_neighbor(
-        graph, args.fanout, overlap_fetch=args.storage_device == "pinned"
+        graph,
+        args.fanout,
+        overlap_fetch=args.storage_device == "pinned",
+        asynchronous=args.storage_device != "cpu",
     )
     datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
     if args.storage_device == "cpu":
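For context, the change threads a new asynchronous flag through the sampling step next to the existing overlap_fetch option. A minimal sketch of how such a pipeline is assembled and consumed, with illustrative itemset, fanout, and batch-size values that are not part of this commit:

import dgl.graphbolt as gb

# Illustrative pipeline wiring; only the two keyword arguments to
# sample_neighbor below reflect this commit.
datapipe = gb.ItemSampler(itemset, batch_size=1024)
datapipe = datapipe.copy_to(device)
datapipe = datapipe.sample_neighbor(
    graph,
    [10, 10, 10],  # fanout per layer, illustrative
    overlap_fetch=True,   # e.g. when the graph is in pinned memory
    asynchronous=True,    # new: overlap sampling work across stages
)
datapipe = datapipe.fetch_feature(features, node_feature_keys=["feat"])
for minibatch in gb.DataLoader(datapipe):
    ...  # training step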

graphbolt/src/cuda/extension/gpu_cache.cu

+8 −0

@@ -76,6 +76,14 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> GpuCache::Query(
   return std::make_tuple(values, missing_index, missing_keys);
 }
 
+c10::intrusive_ptr<Future<std::vector<torch::Tensor>>> GpuCache::QueryAsync(
+    torch::Tensor keys) {
+  return async([=] {
+    auto [values, missing_index, missing_keys] = Query(keys);
+    return std::vector{values, missing_index, missing_keys};
+  });
+}
+
 void GpuCache::Replace(torch::Tensor keys, torch::Tensor values) {
   TORCH_CHECK(keys.device().is_cuda(), "Keys should be on a CUDA device.");
   TORCH_CHECK(
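The async helper declared in graphbolt/async.h runs the lambda off the calling thread and returns a Future whose wait() yields the captured result; the three tensors are packed into a std::vector so the binding can expose a single Future<std::vector<torch::Tensor>> type. A conceptual Python analogue (not the actual implementation) of what QueryAsync does:

from concurrent.futures import ThreadPoolExecutor

# Conceptual analogue only: dispatch the blocking query to a worker
# thread and hand back a future of its result, packed as a list of
# three tensors (values, missing_index, missing_keys).
_pool = ThreadPoolExecutor(max_workers=1)

def query_async(cache, keys):
    return _pool.submit(lambda: list(cache.query(keys)))

# future = query_async(cache, keys)
# values, missing_index, missing_keys = future.result()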

graphbolt/src/cuda/extension/gpu_cache.h

+4 −0

@@ -21,6 +21,7 @@
 #ifndef GRAPHBOLT_GPU_CACHE_H_
 #define GRAPHBOLT_GPU_CACHE_H_
 
+#include <graphbolt/async.h>
 #include <torch/custom_class.h>
 #include <torch/torch.h>
 
@@ -53,6 +54,9 @@ class GpuCache : public torch::CustomClassHolder {
   std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> Query(
       torch::Tensor keys);
 
+  c10::intrusive_ptr<Future<std::vector<torch::Tensor>>> QueryAsync(
+      torch::Tensor keys);
+
   void Replace(torch::Tensor keys, torch::Tensor values);
 
   static c10::intrusive_ptr<GpuCache> Create(

graphbolt/src/python_binding.cc

+1 −0

@@ -109,6 +109,7 @@ TORCH_LIBRARY(graphbolt, m) {
 #ifdef GRAPHBOLT_USE_CUDA
   m.class_<cuda::GpuCache>("GpuCache")
       .def("query", &cuda::GpuCache::Query)
+      .def("query_async", &cuda::GpuCache::QueryAsync)
       .def("replace", &cuda::GpuCache::Replace);
   m.def("gpu_cache", &cuda::GpuCache::Create);
   m.class_<cuda::GpuGraphCache>("GpuGraphCache")
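Once registered, query_async is callable on GpuCache instances from Python. A minimal sketch, assuming the gpu_cache factory takes a cache shape and dtype (the shape and key values here are illustrative, not from this commit):

import torch

# Hypothetical direct use of the TorchScript binding; in practice the
# GPUCache wrapper in python/dgl/graphbolt/impl/gpu_cache.py calls these.
cache = torch.ops.graphbolt.gpu_cache([8192, 16], torch.float32)
keys = torch.arange(128, device="cuda")

future = cache.query_async(keys)  # returns a Future without blocking
values, missing_index, missing_keys = future.wait()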

python/dgl/graphbolt/impl/gpu_cache.py

+27 −5

@@ -14,13 +14,16 @@ def __init__(self, cache_shape, dtype):
         self.total_miss = 0
         self.total_queries = 0
 
-    def query(self, keys):
+    def query(self, keys, async_op=False):
         """Queries the GPU cache.
 
         Parameters
         ----------
         keys : Tensor
             The keys to query the GPU cache with.
+        async_op: bool
+            Boolean indicating whether the call is asynchronous. If so, the
+            result can be obtained by calling wait on the returned future.
 
         Returns
         -------
@@ -29,10 +32,29 @@ def query(self, keys):
             values[missing_indices] corresponds to cache misses that should be
             filled by quering another source with missing_keys.
         """
-        self.total_queries += keys.shape[0]
-        values, missing_index, missing_keys = self._cache.query(keys)
-        self.total_miss += missing_keys.shape[0]
-        return values, missing_index, missing_keys
+
+        class _Waiter:
+            def __init__(self, gpu_cache, future):
+                self.gpu_cache = gpu_cache
+                self.future = future
+
+            def wait(self):
+                """Returns the stored value when invoked."""
+                gpu_cache = self.gpu_cache
+                values, missing_index, missing_keys = (
+                    self.future.wait() if async_op else self.future
+                )
+                # Ensure there is no leak.
+                self.gpu_cache = self.future = None
+
+                gpu_cache.total_queries += values.shape[0]
+                gpu_cache.total_miss += missing_keys.shape[0]
+                return values, missing_index, missing_keys
+
+        if async_op:
+            return _Waiter(self, self._cache.query_async(keys))
+        else:
+            return _Waiter(self, self._cache.query(keys)).wait()
 
     def replace(self, keys, values):
         """Inserts key-value pairs into the GPU cache using the Least-Recently

python/dgl/graphbolt/impl/gpu_cached_feature.py

+6 −2

@@ -114,7 +114,11 @@ def read_async(self, ids: torch.Tensor):
         >>> assert stage + 1 == feature.read_async_num_stages(ids.device)
         >>> result = future.wait()  # result contains the read values.
         """
-        values, missing_index, missing_keys = self._feature.query(ids)
+        future = self._feature.query(ids, async_op=True)
+
+        yield
+
+        values, missing_index, missing_keys = future.wait()
 
         fallback_reader = self._fallback_feature.read_async(missing_keys)
         fallback_num_stages = self._fallback_feature.read_async_num_stages(
@@ -175,7 +179,7 @@ def read_async_num_stages(self, ids_device: torch.device):
             The number of stages of the read_async operation.
         """
         assert ids_device.type == "cuda"
-        return self._fallback_feature.read_async_num_stages(ids_device)
+        return 1 + self._fallback_feature.read_async_num_stages(ids_device)
 
     def size(self):
         """Get the size of the feature.
