
Commit e9cfb19

mfbalin authored and lijialin03 committed
[GraphBolt][CUDA] Use better memory allocation algorithm to avoid OOM. (dmlc#7618)
1 parent 97c73d2 commit e9cfb19

2 files changed: +44 -1 lines changed

python/dgl/graphbolt/__init__.py

+36 -1
@@ -2,6 +2,42 @@
 import os
 import sys

+from .internal_utils import *
+
+CUDA_ALLOCATOR_ENV_WARNING_STR = """
+An experimental feature for CUDA allocations is turned on for better allocation
+pattern resulting in better memory usage for minibatch GNN training workloads.
+See https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf,
+and set the environment variable `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False`
+if you want to disable it.
+"""
+cuda_allocator_env = os.getenv("PYTORCH_CUDA_ALLOC_CONF")
+if cuda_allocator_env is None:
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+    gb_warning(CUDA_ALLOCATOR_ENV_WARNING_STR)
+else:
+    configs = {
+        kv_pair.split(":")[0]: kv_pair.split(":")[1]
+        for kv_pair in cuda_allocator_env.split(",")
+    }
+    if "expandable_segments" in configs:
+        if configs["expandable_segments"] != "True":
+            gb_warning(
+                "You should consider `expandable_segments:True` in the"
+                " environment variable `PYTORCH_CUDA_ALLOC_CONF` for lower"
+                " memory usage. See "
+                "https://pytorch.org/docs/stable/notes/cuda.html"
+                "#optimizing-memory-usage-with-pytorch-cuda-alloc-conf"
+            )
+    else:
+        configs["expandable_segments"] = "True"
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ",".join(
+            [k + ":" + v for k, v in configs.items()]
+        )
+        gb_warning(CUDA_ALLOCATOR_ENV_WARNING_STR)
+
+
+# pylint: disable=wrong-import-position, wrong-import-order
 import torch

 ### FROM DGL @todo
@@ -47,7 +83,6 @@ def load_graphbolt():
 from .itemset import *
 from .item_sampler import *
 from .minibatch_transformer import *
-from .internal_utils import *
 from .negative_sampler import *
 from .sampled_subgraph import *
 from .subgraph_sampler import *
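For reference, here is a minimal standalone sketch of the merging behavior introduced above. The `_ensure_expandable_segments` helper is hypothetical (it is not part of the commit); it only illustrates how an existing `PYTORCH_CUDA_ALLOC_CONF` value is preserved while `expandable_segments:True` is appended when absent:

```python
def _ensure_expandable_segments(env_value):
    """Return the PYTORCH_CUDA_ALLOC_CONF value that would be exported.

    Illustrative re-implementation of the import-time logic above.
    """
    if env_value is None:
        # No user configuration: opt into expandable segments outright.
        return "expandable_segments:True"
    configs = {
        kv_pair.split(":")[0]: kv_pair.split(":")[1]
        for kv_pair in env_value.split(",")
    }
    if "expandable_segments" in configs:
        # An explicit user choice is respected and the variable is left
        # untouched (a warning is emitted if the value is not "True").
        return env_value
    # Otherwise append the option while keeping the user's other settings.
    configs["expandable_segments"] = "True"
    return ",".join(k + ":" + v for k, v in configs.items())


# Unset -> turned on; other options are preserved; an explicit False stays.
assert _ensure_expandable_segments(None) == "expandable_segments:True"
assert (
    _ensure_expandable_segments("max_split_size_mb:128")
    == "max_split_size_mb:128,expandable_segments:True"
)
assert (
    _ensure_expandable_segments("expandable_segments:False")
    == "expandable_segments:False"
)
```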

tests/python/pytorch/graphbolt/test_base.py

+8
@@ -1,3 +1,4 @@
+import os
 import re
 import unittest
 from collections.abc import Iterable, Mapping
@@ -12,6 +13,13 @@
 from . import gb_test_utils


+def test_pytorch_cuda_allocator_conf():
+    env = os.getenv("PYTORCH_CUDA_ALLOC_CONF")
+    assert env is not None
+    config_list = env.split(",")
+    assert "expandable_segments:True" in config_list
+
+
 @unittest.skipIf(F._default_context_str != "gpu", "CopyTo needs GPU to test")
 @pytest.mark.parametrize("non_blocking", [False, True])
 def test_CopyTo(non_blocking):
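As the warning string above notes, the feature can be disabled by setting the environment variable yourself. A minimal opt-out sketch, assuming the variable is set before `dgl.graphbolt` is imported and before any CUDA memory is allocated:

```python
import os

# Opt out of the experimental allocator setting; GraphBolt respects an
# explicit value (it only warns when expandable_segments is not "True").
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False"

import dgl.graphbolt as gb  # noqa: E402  pylint: disable=wrong-import-position
```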
