@@ -17,20 +17,57 @@ For state of the art Hyper Parameter Optimization (HPO) we recommend the
17
17
with the associated
18
18
`Dask-Optuna integration <https://optuna-integration.readthedocs.io/en/latest/reference/generated/optuna_integration.DaskStorage.html>`_
19
19
20
- Consider also this video:
21
20
22
- .. raw :: html
21
+ In Optuna you construct an objective function that takes a trial object, which
22
+ generates parameters from distributions that you define in code. Your
23
+ objective function eventually produces a score. Optuna is smart about what
24
+ values from the distribution it suggests based on the scores it has received.
23
25
24
- <iframe width =" 560"
25
- height =" 315"
26
- src =" https://www.youtube.com/embed/euT6_h7iIBA"
27
- frameborder =" 0"
28
- allow =" autoplay; encrypted-media"
29
- style =" margin : 0 auto 20px auto ; display : block ;"
30
- allowfullscreen >
31
- </iframe >
26
+ .. code-block:: python
27
+
28
+ def objective (trial ):
29
+ params = {
30
+ " max_depth" : trial.suggest_int(" max_depth" , 2 , 10 , step = 1 ),
31
+ " learning_rate" : trial.suggest_float(" learning_rate" , 1e-8 , 1.0 , log = True ),
32
+ ...
33
+ }
34
+ model = train_model(train_data, ** params)
35
+ result = score(model, test_data)
36
+ return result
37
+
38
+ Dask and Optuna are often used together by running many objective functions in
39
+ parallel, and synchronizing the scores and parameter selection on the Dask
40
+ scheduler. To do this, we use the ``DaskStorage`` object found in Optuna.
41
+
42
+ .. code-block:: python
43
+
44
+ import optuna
45
+
46
+ storage = optuna.integration.DaskStorage()
47
+
48
+ study = optuna.create_study(
49
+ direction = " maximize" ,
50
+ storage = storage, # This makes the study Dask-enabled
51
+ )
52
+
53
+ Then we just run many optimize methods in parallel:
54
+
55
+ .. code-block:: python
56
+
57
+ from dask.distributed import LocalCluster, wait
58
+
59
+ cluster = LocalCluster(processes = False ) # replace this with some scalable cluster
60
+ client = cluster.get_client()
61
+
62
+ futures = [
63
+ client.submit(study.optimize, objective, n_trials = 1 , pure = False ) for _ in range (500 )
64
+ ]
65
+ wait(futures)
66
+
67
+ print (study.best_params)
68
+
69
+ For a more fully worked example see :bdg-link-primary:`this Optuna+XGBoost example <https://docs.coiled.io/user_guide/usage/dask/hpo.html>`.
32
70
33
- TODO: what's the best optuna example here?
34
71
35
72
Dask Futures
36
73
~~~~~~~~~~~~
@@ -56,12 +93,15 @@ might look like the following:
56
93
return score
57
94
58
95
params_list = [... ]
59
- futures = [client.submit(train_and_score, params) for params in params_list]
96
+ futures = [
97
+ client.submit(train_and_score, params) for params in params_list
98
+ ]
60
99
scores = client.gather(futures)
61
100
best = max (scores)
62
101
63
102
best_params = params_list[scores.index(best)]
64
103
104
+ For a more fully worked example see :bdg-link-primary:`Futures Documentation <futures.html>`.
65
105
66
106
Gradient Boosted Trees
67
107
----------------------
@@ -85,12 +125,21 @@ and the Dask LocalCluster to train on randomly generated data
85
125
df = dask.datasets.timeseries() # randomly generated data
86
126
# df = dd.read_parquet(...) # in practice you would probably read your data instead
87
127
88
- train, test = df.random_split(... ) # TODO
128
+ train, test = df.random_split([0.80 , 0.20 ])
129
+ X_train, y_train, X_test, y_test = ...
89
130
90
131
with LocalCluster() as cluster:
91
132
with cluster.get_client() as client:
92
- # TODO
133
+ d_train = xgboost.dask.DaskDMatrix(client, X_train, y_train, enable_categorical = True )
134
+ model = xgboost.dask.train(
135
+ ...
136
+ d_train,
137
+ )
138
+ predictions = xgboost.dask.predict(client, model, X_test)
139
+
140
+ score = ...
93
141
142
+ For a more fully worked example see :bdg-link-primary:`this XGBoost example <https://docs.coiled.io/user_guide/usage/dask/xgboost.html>`.
94
143
95
144
Batch Inference
96
145
---------------
@@ -132,6 +181,8 @@ different files.
132
181
predictions = client.map(predict, filenames, model = model)
133
182
results = client.gather(predictions)
134
183
184
+ For a more fully worked example see :bdg-link-primary:`Batch Scoring for Computer Vision Workloads (video) <https://developer.download.nvidia.com/video/gputechconf/gtc/2019/video/S9198/s9198-dask-and-v100s-for-fast-distributed-batch-scoring-of-computer-vision-workloads.mp4>`.
185
+
135
186
Batch Prediction with Dask Dataframe
136
187
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
137
188
@@ -155,3 +206,5 @@ see which were likely to become ill
155
206
# Dask code
156
207
predictions = df.map_partitions(model.predict)
157
208
predictions.to_parquet(" /path/to/results.parquet" )
209
+
210
+ For more information see :bdg-link-primary:`Dask Dataframe docs <dataframe.html>`.
0 commit comments