@@ -1,13 +1,10 @@
 """Graph Bolt DataLoaders"""

-from collections import OrderedDict
-
 import torch
 import torch.utils.data as torch_data

-from .base import CopyTo, get_host_to_device_uva_stream
+from .base import CopyTo
 from .feature_fetcher import FeatureFetcher, FeatureFetcherStartMarker
-from .impl.gpu_graph_cache import GPUGraphCache
 from .impl.neighbor_sampler import SamplePerLayer

 from .internal import (
@@ -22,34 +19,9 @@

 __all__ = [
     "DataLoader",
-    "construct_gpu_graph_cache",
 ]


-def construct_gpu_graph_cache(
-    sample_per_layer_obj, num_gpu_cached_edges, gpu_cache_threshold
-):
-    "Construct a GPUGraphCache given a sample_per_layer_obj and cache parameters."
-    graph = sample_per_layer_obj.sampler.__self__
-    num_gpu_cached_edges = min(num_gpu_cached_edges, graph.total_num_edges)
-    dtypes = OrderedDict()
-    dtypes["indices"] = graph.indices.dtype
-    if graph.type_per_edge is not None:
-        dtypes["type_per_edge"] = graph.type_per_edge.dtype
-    if graph.edge_attributes is not None:
-        probs_or_mask = graph.edge_attributes.get(
-            sample_per_layer_obj.prob_name, None
-        )
-        if probs_or_mask is not None:
-            dtypes["probs_or_mask"] = probs_or_mask.dtype
-    return GPUGraphCache(
-        num_gpu_cached_edges,
-        gpu_cache_threshold,
-        graph.csc_indptr.dtype,
-        list(dtypes.values()),
-    )
-
-
 def _find_and_wrap_parent(datapipe_graph, target_datapipe, wrapper, **kwargs):
     """Find parent of target_datapipe and wrap it with ``wrapper``."""
     datapipes = find_dps(
@@ -125,18 +97,6 @@ class DataLoader(torch_data.DataLoader):
         If True, the data loader will not shut down the worker processes after a
         dataset has been consumed once. This allows to maintain the workers
         instances alive.
-    overlap_graph_fetch : bool, optional
-        If True, the data loader will overlap the UVA graph fetching operations
-        with the rest of operations by using an alternative CUDA stream. This
-        option should be enabled if you have moved your graph to the pinned
-        memory for optimal performance. Default is False.
-    num_gpu_cached_edges : int, optional
-        If positive and overlap_graph_fetch is True, then the GPU will cache
-        frequently accessed vertex neighborhoods to reduce the PCI-e bandwidth
-        demand due to pinned graph accesses.
-    gpu_cache_threshold : int, optional
-        Determines how many times a vertex needs to be accessed before its
-        neighborhood ends up being cached on the GPU.
     max_uva_threads : int, optional
         Limits the number of CUDA threads used for UVA copies so that the rest
         of the computations can run simultaneously with it. Setting it to a too
@@ -150,9 +110,6 @@ def __init__(
         datapipe,
         num_workers=0,
         persistent_workers=True,
-        overlap_graph_fetch=False,
-        num_gpu_cached_edges=0,
-        gpu_cache_threshold=1,
         max_uva_threads=10240,
     ):
         # Multiprocessing requires two modifications to the datapipe:
@@ -200,54 +157,14 @@ def __init__(
         if feature_fetcher.max_num_stages > 0:  # Overlap enabled.
             torch.ops.graphbolt.set_max_uva_threads(max_uva_threads)

-        if (
-            overlap_graph_fetch
-            and num_workers == 0
-            and torch.cuda.is_available()
-        ):
-            torch.ops.graphbolt.set_max_uva_threads(max_uva_threads)
+        if num_workers == 0 and torch.cuda.is_available():
             samplers = find_dps(
                 datapipe_graph,
                 SamplePerLayer,
             )
-            gpu_graph_cache = None
             for sampler in samplers:
-                if num_gpu_cached_edges > 0 and gpu_graph_cache is None:
-                    gpu_graph_cache = construct_gpu_graph_cache(
-                        sampler, num_gpu_cached_edges, gpu_cache_threshold
-                    )
-                if (
-                    sampler.sampler.__name__ == "sample_layer_neighbors"
-                    or gpu_graph_cache is not None
-                ):
-                    # This code path is not faster for sample_neighbors.
-                    datapipe_graph = replace_dp(
-                        datapipe_graph,
-                        sampler,
-                        sampler.fetch_and_sample(
-                            gpu_graph_cache,
-                            get_host_to_device_uva_stream(),
-                            1,
-                        ),
-                    )
-                elif sampler.sampler.__name__ == "sample_neighbors":
-                    # This code path is faster for sample_neighbors.
-                    datapipe_graph = replace_dp(
-                        datapipe_graph,
-                        sampler,
-                        sampler.datapipe.sample_per_layer(
-                            sampler=sampler.sampler,
-                            fanout=sampler.fanout,
-                            replace=sampler.replace,
-                            prob_name=sampler.prob_name,
-                            returning_indices_is_optional=True,
-                        ),
-                    )
-                else:
-                    raise AssertionError(
-                        "overlap_graph_fetch is supported only for "
-                        "sample_neighbor and sample_layer_neighbor."
-                    )
+                if sampler.overlap_fetch:
+                    torch.ops.graphbolt.set_max_uva_threads(max_uva_threads)


         # (4) Cut datapipe at CopyTo and wrap with pinning and prefetching