From b03c7f6fd6fe399ead5d61e2040efd57f1696b18 Mon Sep 17 00:00:00 2001 From: james Date: Mon, 14 Dec 2020 19:09:05 +0000 Subject: [PATCH 1/6] use xgboost 1.3.0 instead of dask-xgboost --- .../examples-cpu/nyc-taxi/xgboost-dask.ipynb | 341 ++++++++++++++++-- examples/examples-cpu/nyc-taxi/xgboost.ipynb | 2 +- 2 files changed, 318 insertions(+), 25 deletions(-) diff --git a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb index aa761142..afcb5911 100644 --- a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb +++ b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -55,9 +55,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2020-12-14 18:59:23] WARNING - dask-saturn | Unable to update the specification of a running dask cluster. No changes applied. To update, first stop the cluster with `cluster.close()` and reinitialize, or run 'cluster = SaturnCluster.reset(**kwargs)' to restart automatically.\n", + "[2020-12-14 18:59:23] INFO - dask-saturn | Cluster is ready\n", + "[2020-12-14 18:59:23] INFO - dask-saturn | Registering default plugins\n", + "[2020-12-14 18:59:23] INFO - dask-saturn | {'tcp://10.0.0.15:46173': {'status': 'OK'}, 'tcp://10.0.12.175:38441': {'status': 'OK'}, 'tcp://10.0.17.82:34435': {'status': 'OK'}}\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6f2ec7aa543748a1af69458f199ee169", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

SaturnCluster

'), HBox(children=(HTML(value='\\n
\\n \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pickup_weekdaypickup_weekofyearpickup_hourpickup_week_hourpickup_minutepassenger_countPULocationIDDOLocationIDtip_fraction
6845414.01.011.0107.041.02.0195.074.00.000000
7107724.01.014.0110.029.00.0132.017.00.204000
3699692.01.022.070.053.02.0151.0116.00.000000
936521.01.012.036.03.01.0236.0163.00.217778
1313751.01.016.040.025.06.0138.0137.00.253469
\n", + "
" + ], + "text/plain": [ + " pickup_weekday pickup_weekofyear pickup_hour pickup_week_hour \\\n", + "684541 4.0 1.0 11.0 107.0 \n", + "710772 4.0 1.0 14.0 110.0 \n", + "369969 2.0 1.0 22.0 70.0 \n", + "93652 1.0 1.0 12.0 36.0 \n", + "131375 1.0 1.0 16.0 40.0 \n", + "\n", + " pickup_minute passenger_count PULocationID DOLocationID \\\n", + "684541 41.0 2.0 195.0 74.0 \n", + "710772 29.0 0.0 132.0 17.0 \n", + "369969 53.0 2.0 151.0 116.0 \n", + "93652 3.0 1.0 236.0 163.0 \n", + "131375 25.0 6.0 138.0 137.0 \n", + "\n", + " tip_fraction \n", + "684541 0.000000 \n", + "710772 0.204000 \n", + "369969 0.000000 \n", + "93652 0.217778 \n", + "131375 0.253469 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "taxi_train.head()" ] @@ -170,9 +328,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 29.8 ms, sys: 560 µs, total: 30.4 ms\n", + "Wall time: 8.47 s\n" + ] + } + ], "source": [ "%%time\n", "taxi_train = taxi_train.persist()\n", @@ -188,29 +355,155 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "import dask_xgboost\n", - "\n", - "xgb_reg = dask_xgboost.XGBRegressor(\n", - " objective=\"reg:squarederror\",\n", - " tree_method='approx',\n", - " learning_rate=0.1,\n", - " max_depth=5,\n", - " n_estimators=50,\n", - ")" + "import xgboost as xgb" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on class DaskDMatrix in module xgboost.dask:\n", + "\n", + "class DaskDMatrix(builtins.object)\n", + " | DaskDMatrix(client, data, label=None, missing=None, weight=None, base_margin=None, label_lower_bound=None, label_upper_bound=None, feature_names=None, 
feature_types=None)\n", + " | \n", + " | DMatrix holding on references to Dask DataFrame or Dask Array. Constructing\n", + " | a `DaskDMatrix` forces all lazy computation to be carried out. Wait for\n", + " | the input data explicitly if you want to see actual computation of\n", + " | constructing `DaskDMatrix`.\n", + " | \n", + " | .. note::\n", + " | \n", + " | DaskDMatrix does not repartition or move data between workers. It's\n", + " | the caller's responsibility to balance the data.\n", + " | \n", + " | .. versionadded:: 1.0.0\n", + " | \n", + " | Parameters\n", + " | ----------\n", + " | client: dask.distributed.Client\n", + " | Specify the dask client used for training. Use default client\n", + " | returned from dask if it's set to None.\n", + " | data : dask.array.Array/dask.dataframe.DataFrame\n", + " | data source of DMatrix.\n", + " | label: dask.array.Array/dask.dataframe.DataFrame\n", + " | label used for trainin.\n", + " | missing : float, optional\n", + " | Value in the input data (e.g. `numpy.ndarray`) which needs\n", + " | to be present as a missing value. 
If None, defaults to np.nan.\n", + " | weight : dask.array.Array/dask.dataframe.DataFrame\n", + " | Weight for each instance.\n", + " | base_margin : dask.array.Array/dask.dataframe.DataFrame\n", + " | Global bias for each instance.\n", + " | label_lower_bound : dask.array.Array/dask.dataframe.DataFrame\n", + " | Upper bound for survival training.\n", + " | label_upper_bound : dask.array.Array/dask.dataframe.DataFrame\n", + " | Lower bound for survival training.\n", + " | feature_names : list, optional\n", + " | Set names for features.\n", + " | feature_types : list, optional\n", + " | Set types for features\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __await__(self)\n", + " | \n", + " | __init__(self, client, data, label=None, missing=None, weight=None, base_margin=None, label_lower_bound=None, label_upper_bound=None, feature_names=None, feature_types=None)\n", + " | Initialize self. See help(type(self)) for accurate signature.\n", + " | \n", + " | create_fn_args(self, worker_addr: str)\n", + " | Create a dictionary of objects that can be pickled for function\n", + " | arguments.\n", + " | \n", + " | map_local_data(self, client, data, label=None, weights=None, base_margin=None, label_lower_bound=None, label_upper_bound=None)\n", + " | Obtain references to local data.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + "\n" + ] + } + ], + "source": [ + "help(xgb.dask.DaskDMatrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [], + "source": [ + "dtrain = xgb.dask.DaskDMatrix(\n", + " client=client,\n", + " data=taxi_train[features],\n", + " label=taxi_train[y_col]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + 
"metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "/opt/conda/envs/saturn/lib/libxgboost.so: undefined symbol: XGDMatrixSetDenseInfo", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(client, params, dtrain, evals, early_stopping_rounds, *args, **kwargs)\u001b[0m\n\u001b[1;32m 736\u001b[0m return client.sync(\n\u001b[1;32m 737\u001b[0m \u001b[0m_train_async\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtrain\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevals\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mevals\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 738\u001b[0;31m early_stopping_rounds=early_stopping_rounds, **kwargs)\n\u001b[0m\u001b[1;32m 739\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 740\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/client.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 831\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 832\u001b[0m return sync(\n\u001b[0;32m--> 833\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mcallback_timeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 834\u001b[0m )\n\u001b[1;32m 835\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0mtyp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 340\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 341\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36mf\u001b[0;34m()\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback_timeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m 
\u001b[0mfuture\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masyncio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 324\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 325\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/tornado/gen.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 735\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 736\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mexc_info\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36m_train_async\u001b[0;34m(client, params, dtrain, evals, early_stopping_rounds, *args, **kwargs)\u001b[0m\n\u001b[1;32m 699\u001b[0m \u001b[0mfutures\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 700\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 701\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mawait\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgather\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfutures\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 702\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mret\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mret\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/client.py\u001b[0m in \u001b[0;36m_gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m 1849\u001b[0m \u001b[0mexc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCancelledError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1850\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1851\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraceback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1852\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1853\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"skip\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36mdispatched_train\u001b[0;34m()\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mworker\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdistributed\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_worker\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mRabitContext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrabit_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 649\u001b[0;31m \u001b[0mlocal_dtrain\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_dmatrix_from_list_of_parts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdtrain_ref\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 650\u001b[0m \u001b[0mlocal_evals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 651\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mevals_ref\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36m_dmatrix_from_list_of_parts\u001b[0;34m()\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_quantile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_create_device_quantile_dmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_create_dmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36m_create_dmatrix\u001b[0;34m()\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0mfeature_names\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeature_names\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 589\u001b[0m \u001b[0mfeature_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeature_types\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 590\u001b[0;31m nthread=worker.nthreads)\n\u001b[0m\u001b[1;32m 591\u001b[0m dmatrix.set_info(base_margin=base_margin, weight=weights,\n\u001b[1;32m 592\u001b[0m \u001b[0mlabel_lower_bound\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlabel_lower_bound\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m()\u001b[0m\n\u001b[1;32m 506\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_margin\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbase_margin\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeature_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfeature_names\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m()\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 420\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 421\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 422\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 423\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36mset_info\u001b[0;34m()\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[0;34m'''Set meta info for DMatrix.'''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 529\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_label\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 530\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mweight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_weight\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36mset_label\u001b[0;34m()\u001b[0m\n\u001b[1;32m 656\u001b[0m \"\"\"\n\u001b[1;32m 657\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdispatch_meta_backend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 658\u001b[0;31m \u001b[0mdispatch_meta_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'label'\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m'float'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 659\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 660\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mset_weight\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/data.py\u001b[0m in \u001b[0;36mdispatch_meta_backend\u001b[0;34m()\u001b[0m\n\u001b[1;32m 674\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'float'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 675\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 676\u001b[0;31m \u001b[0m_meta_from_numpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 677\u001b[0m 
\u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 678\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_is_dlpack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/data.py\u001b[0m in \u001b[0;36m_meta_from_numpy\u001b[0;34m()\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0mptr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minterface\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'data'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mptr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mctypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_void_p\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m _check_call(_LIB.XGDMatrixSetDenseInfo(\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0mc_str\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/ctypes/__init__.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m()\u001b[0m\n\u001b[1;32m 375\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'__'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'__'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 376\u001b[0m 
\u001b[0;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 377\u001b[0;31m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 378\u001b[0m \u001b[0msetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/ctypes/__init__.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m()\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 382\u001b[0;31m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_FuncPtr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 383\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: /opt/conda/envs/saturn/lib/libxgboost.so: undefined symbol: XGDMatrixSetDenseInfo" + ] + } + ], "source": [ "%%time\n", - "_ = xgb_reg.fit(taxi_train[features], y=taxi_train[y_col])" + "bst = xgb.dask.train(\n", + " client=client,\n", + " params={\n", + " \"objective\": \"reg:squarederror\",\n", + " \"tree_method\": \"hist\",\n", + " \"learning_rate\": 0.1,\n", + " \"max_depth\": 5,\n", + " },\n", + " dtrain=dtrain,\n", + " num_boost_round=50\n", + ")" ] }, { diff --git a/examples/examples-cpu/nyc-taxi/xgboost.ipynb b/examples/examples-cpu/nyc-taxi/xgboost.ipynb index 604688c3..bf10f411 100644 --- a/examples/examples-cpu/nyc-taxi/xgboost.ipynb +++ b/examples/examples-cpu/nyc-taxi/xgboost.ipynb @@ -124,7 +124,7 @@ "\n", "xgb_reg = xgboost.XGBRegressor(\n", " objective=\"reg:squarederror\",\n", - " tree_method='approx',\n", + " tree_method='hist',\n", " learning_rate=0.1,\n", " max_depth=5,\n", " n_estimators=50,\n", From 4d79122fb989ad8c27e2aac2693b06b809d8b874 Mon Sep 17 00:00:00 2001 From: james Date: Mon, 14 Dec 2020 19:53:06 +0000 Subject: [PATCH 2/6] change tree method and fix snowflake --- .../nyc-taxi-snowflake/xgboost-dask.ipynb | 108 ++++- .../nyc-taxi-snowflake/xgboost.ipynb | 2 +- .../examples-cpu/nyc-taxi/xgboost-dask.ipynb | 377 ++++-------------- 3 files changed, 177 insertions(+), 310 deletions(-) diff --git a/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb b/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb index 82c8d19c..6355b444 100644 --- a/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb +++ b/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb 
@@ -18,6 +18,13 @@ "" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook describes how to use Dask to scale training of XGBoost models. For more detailed information, see [\"Distributed XGBoost with Dask\"](https://xgboost.readthedocs.io/en/latest/tutorials/dask.html) in the XGBoost documentation and [\"XGBoost Training with Dask\"](https://www.saturncloud.io/docs/tutorials/xgboost/) in Saturn Cloud's documentation." + ] + }, { "cell_type": "code", "execution_count": null, @@ -50,7 +57,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Initialize Dask cluster" + "# Initialize Dask cluster\n", + "\n", + "The code below uses [`dask-saturn`](https://github.com/saturncloud/dask-saturn) to create a Dask cluster or connect to one that is already running." ] }, { @@ -249,7 +258,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Train a model" + "# Train a model\n", + "\n", + "This example uses the native Dask integration built into XGBoost. That integration was added in `xgboost` 1.3.0, and should be preferred to [`dask-xgboost`](https://github.com/dask/dask-xgboost)." ] }, { @@ -258,17 +269,36 @@ "metadata": {}, "outputs": [], "source": [ - "import dask_xgboost\n", - "\n", - "xgb_reg = dask_xgboost.XGBRegressor(\n", - " objective=\"reg:squarederror\",\n", - " tree_method='approx',\n", - " learning_rate=0.1,\n", - " max_depth=5,\n", - " n_estimators=50,\n", + "import xgboost as xgb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Training data for `xgboost.dask` needs to be prepared in a special object called `DaskDMatrix`. This is like the XGBoost `DMatrix` that you might be familiar with, but is backed by Dask's distributed collections (Dask DataFrame and Dask Array)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dtrain = xgb.dask.DaskDMatrix(\n", + " client=client,\n", + " data=taxi_train[features],\n", + " label=taxi_train[y_col]\n", ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can pass any [xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) to `xgb.dask.train()`. The training process will then start up on all workers that have some of the data in `dtrain`." + ] + }, { "cell_type": "code", "execution_count": null, @@ -276,7 +306,36 @@ "outputs": [], "source": [ "%%time\n", - "_ = xgb_reg.fit(taxi_train[features], y=taxi_train[y_col])" + "result = xgb.dask.train(\n", + " client=client,\n", + " params={\n", + " \"objective\": \"reg:squarederror\",\n", + " \"tree_method\": \"hist\",\n", + " \"learning_rate\": 0.1,\n", + " \"max_depth\": 5,\n", + " },\n", + " dtrain=dtrain,\n", + " num_boost_round=50\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`xgb.dask.train()` produces a regular `xgb.core.Booster` object, the same model object produced by non-Dask training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "booster = result[\"booster\"]\n", + "type(booster)\n", + "\n", + "# xgboost.core.Booster" ] }, { @@ -295,7 +354,7 @@ "import cloudpickle\n", "\n", "with open(f'{MODEL_PATH}/xgboost_dask.pkl', 'wb') as f:\n", - " cloudpickle.dump(xgb_reg, f)" + " cloudpickle.dump(booster, f)" ] }, { @@ -319,6 +378,28 @@ "_ = wait(taxi_test)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`xgboost.dask.predict()` can be used to create predictions on a Dask collection using an XGBoost model object. 
Note that this model object is just a regular XGBoost booster, not a special Dask-specific model object.\n", + "\n", + "This function returns a Dask Array or Dask Series of predictions, depending on the input type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds = xgb.dask.predict(\n", + " client=client,\n", + " model=booster,\n", + " data=taxi_test[features]\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -334,8 +415,7 @@ "source": [ "from dask_ml.metrics import mean_squared_error\n", "\n", - "preds = xgb_reg.predict(taxi_test[features])\n", - "mean_squared_error(taxi_test[y_col].to_dask_array(), preds, squared=False)" + "mean_squared_error(taxi_test[y_col].to_dask_array(), preds.to_dask_array(), squared=False)" ] } ], diff --git a/examples/examples-cpu/nyc-taxi-snowflake/xgboost.ipynb b/examples/examples-cpu/nyc-taxi-snowflake/xgboost.ipynb index 700acbf8..cbbc8c33 100644 --- a/examples/examples-cpu/nyc-taxi-snowflake/xgboost.ipynb +++ b/examples/examples-cpu/nyc-taxi-snowflake/xgboost.ipynb @@ -146,7 +146,7 @@ "\n", "xgb_reg = xgboost.XGBRegressor(\n", " objective=\"reg:squarederror\",\n", - " tree_method='approx',\n", + " tree_method='hist',\n", " learning_rate=0.1,\n", " max_depth=5,\n", " n_estimators=50,\n", diff --git a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb index afcb5911..ba5bdd80 100644 --- a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb +++ b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb @@ -18,9 +18,16 @@ "" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook describes how to use Dask to scale training of XGBoost models. 
For more detailed information, see [\"Distributed XGBoost with Dask\"](https://xgboost.readthedocs.io/en/latest/tutorials/dask.html) in the XGBoost documentation and [\"XGBoost Training with Dask\"](https://www.saturncloud.io/docs/tutorials/xgboost/) in Saturn Cloud's documentation." + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -50,39 +57,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Initialize Dask cluster" + "# Initialize Dask cluster\n", + "\n", + "The code below uses [`dask-saturn`](https://github.com/saturncloud/dask-saturn) to create a Dask cluster or connect to one that is already running." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-12-14 18:59:23] WARNING - dask-saturn | Unable to update the specification of a running dask cluster. No changes applied. To update, first stop the cluster with `cluster.close()` and reinitialize, or run 'cluster = SaturnCluster.reset(**kwargs)' to restart automatically.\n", - "[2020-12-14 18:59:23] INFO - dask-saturn | Cluster is ready\n", - "[2020-12-14 18:59:23] INFO - dask-saturn | Registering default plugins\n", - "[2020-12-14 18:59:23] INFO - dask-saturn | {'tcp://10.0.0.15:46173': {'status': 'OK'}, 'tcp://10.0.12.175:38441': {'status': 'OK'}, 'tcp://10.0.17.82:34435': {'status': 'OK'}}\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6f2ec7aa543748a1af69458f199ee169", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(HTML(value='

SaturnCluster

'), HBox(children=(HTML(value='\\n
\\n \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pickup_weekdaypickup_weekofyearpickup_hourpickup_week_hourpickup_minutepassenger_countPULocationIDDOLocationIDtip_fraction
6845414.01.011.0107.041.02.0195.074.00.000000
7107724.01.014.0110.029.00.0132.017.00.204000
3699692.01.022.070.053.02.0151.0116.00.000000
936521.01.012.036.03.01.0236.0163.00.217778
1313751.01.016.040.025.06.0138.0137.00.253469
\n", - "
" - ], - "text/plain": [ - " pickup_weekday pickup_weekofyear pickup_hour pickup_week_hour \\\n", - "684541 4.0 1.0 11.0 107.0 \n", - "710772 4.0 1.0 14.0 110.0 \n", - "369969 2.0 1.0 22.0 70.0 \n", - "93652 1.0 1.0 12.0 36.0 \n", - "131375 1.0 1.0 16.0 40.0 \n", - "\n", - " pickup_minute passenger_count PULocationID DOLocationID \\\n", - "684541 41.0 2.0 195.0 74.0 \n", - "710772 29.0 0.0 132.0 17.0 \n", - "369969 53.0 2.0 151.0 116.0 \n", - "93652 3.0 1.0 236.0 163.0 \n", - "131375 25.0 6.0 138.0 137.0 \n", - "\n", - " tip_fraction \n", - "684541 0.000000 \n", - "710772 0.204000 \n", - "369969 0.000000 \n", - "93652 0.217778 \n", - "131375 0.253469 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "taxi_train.head()" ] @@ -328,18 +179,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 29.8 ms, sys: 560 µs, total: 30.4 ms\n", - "Wall time: 8.47 s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "taxi_train = taxi_train.persist()\n", @@ -350,12 +192,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Train a model" + "# Train a model\n", + "\n", + "This example uses the native Dask integration built into XGBoost. That integration was added in `xgboost` 1.3.0, and should be preferred to [`dask-xgboost`](https://github.com/dask/dask-xgboost)." 
] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -363,89 +207,15 @@ ] }, { - "cell_type": "code", - "execution_count": 12, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on class DaskDMatrix in module xgboost.dask:\n", - "\n", - "class DaskDMatrix(builtins.object)\n", - " | DaskDMatrix(client, data, label=None, missing=None, weight=None, base_margin=None, label_lower_bound=None, label_upper_bound=None, feature_names=None, feature_types=None)\n", - " | \n", - " | DMatrix holding on references to Dask DataFrame or Dask Array. Constructing\n", - " | a `DaskDMatrix` forces all lazy computation to be carried out. Wait for\n", - " | the input data explicitly if you want to see actual computation of\n", - " | constructing `DaskDMatrix`.\n", - " | \n", - " | .. note::\n", - " | \n", - " | DaskDMatrix does not repartition or move data between workers. It's\n", - " | the caller's responsibility to balance the data.\n", - " | \n", - " | .. versionadded:: 1.0.0\n", - " | \n", - " | Parameters\n", - " | ----------\n", - " | client: dask.distributed.Client\n", - " | Specify the dask client used for training. Use default client\n", - " | returned from dask if it's set to None.\n", - " | data : dask.array.Array/dask.dataframe.DataFrame\n", - " | data source of DMatrix.\n", - " | label: dask.array.Array/dask.dataframe.DataFrame\n", - " | label used for trainin.\n", - " | missing : float, optional\n", - " | Value in the input data (e.g. `numpy.ndarray`) which needs\n", - " | to be present as a missing value. 
If None, defaults to np.nan.\n", - " | weight : dask.array.Array/dask.dataframe.DataFrame\n", - " | Weight for each instance.\n", - " | base_margin : dask.array.Array/dask.dataframe.DataFrame\n", - " | Global bias for each instance.\n", - " | label_lower_bound : dask.array.Array/dask.dataframe.DataFrame\n", - " | Upper bound for survival training.\n", - " | label_upper_bound : dask.array.Array/dask.dataframe.DataFrame\n", - " | Lower bound for survival training.\n", - " | feature_names : list, optional\n", - " | Set names for features.\n", - " | feature_types : list, optional\n", - " | Set types for features\n", - " | \n", - " | Methods defined here:\n", - " | \n", - " | __await__(self)\n", - " | \n", - " | __init__(self, client, data, label=None, missing=None, weight=None, base_margin=None, label_lower_bound=None, label_upper_bound=None, feature_names=None, feature_types=None)\n", - " | Initialize self. See help(type(self)) for accurate signature.\n", - " | \n", - " | create_fn_args(self, worker_addr: str)\n", - " | Create a dictionary of objects that can be pickled for function\n", - " | arguments.\n", - " | \n", - " | map_local_data(self, client, data, label=None, weights=None, base_margin=None, label_lower_bound=None, label_upper_bound=None)\n", - " | Obtain references to local data.\n", - " | \n", - " | ----------------------------------------------------------------------\n", - " | Data descriptors defined here:\n", - " | \n", - " | __dict__\n", - " | dictionary for instance variables (if defined)\n", - " | \n", - " | __weakref__\n", - " | list of weak references to the object (if defined)\n", - "\n" - ] - } - ], "source": [ - "help(xgb.dask.DaskDMatrix)" + "Training data for `xgboost.dask` needs to be prepared in a special object called `DaskDMatrix`. This is like the XGBoost `DMatrix` that you might be familiar with, but is backed by Dask's distributed collections (Dask DataFrame and Dask Array)." 
] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -456,44 +226,21 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can pass any [xgboost parameters](https://xgboost.readthedocs.io/en/latest/parameter.html) to `xgb.dask.train()`. The training process will then start up on all workers that have some of the data in `dtrain`." + ] + }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "/opt/conda/envs/saturn/lib/libxgboost.so: undefined symbol: XGDMatrixSetDenseInfo", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", - "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(client, params, dtrain, evals, early_stopping_rounds, *args, **kwargs)\u001b[0m\n\u001b[1;32m 736\u001b[0m return client.sync(\n\u001b[1;32m 737\u001b[0m \u001b[0m_train_async\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtrain\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevals\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mevals\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 738\u001b[0;31m early_stopping_rounds=early_stopping_rounds, **kwargs)\n\u001b[0m\u001b[1;32m 739\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 740\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/client.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 831\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 832\u001b[0m return sync(\n\u001b[0;32m--> 833\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 834\u001b[0m )\n\u001b[1;32m 835\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0mtyp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 340\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 341\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36mf\u001b[0;34m()\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback_timeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 323\u001b[0m \u001b[0mfuture\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masyncio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 324\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 325\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/tornado/gen.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m 
\u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 735\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 736\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mexc_info\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36m_train_async\u001b[0;34m(client, params, dtrain, evals, early_stopping_rounds, *args, **kwargs)\u001b[0m\n\u001b[1;32m 699\u001b[0m \u001b[0mfutures\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 700\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 701\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mawait\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgather\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfutures\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 702\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mret\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mret\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/distributed/client.py\u001b[0m in \u001b[0;36m_gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m 1849\u001b[0m \u001b[0mexc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCancelledError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1850\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1851\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexception\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtraceback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1852\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1853\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"skip\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/srv/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36mdispatched_train\u001b[0;34m()\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mworker\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdistributed\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_worker\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0;32mwith\u001b[0m 
\u001b[0mRabitContext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrabit_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 649\u001b[0;31m \u001b[0mlocal_dtrain\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_dmatrix_from_list_of_parts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdtrain_ref\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 650\u001b[0m \u001b[0mlocal_evals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 651\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mevals_ref\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36m_dmatrix_from_list_of_parts\u001b[0;34m()\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_quantile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_create_device_quantile_dmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_create_dmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 601\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/dask.py\u001b[0m in \u001b[0;36m_create_dmatrix\u001b[0;34m()\u001b[0m\n\u001b[1;32m 588\u001b[0m 
\u001b[0mfeature_names\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeature_names\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 589\u001b[0m \u001b[0mfeature_types\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeature_types\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 590\u001b[0;31m nthread=worker.nthreads)\n\u001b[0m\u001b[1;32m 591\u001b[0m dmatrix.set_info(base_margin=base_margin, weight=weights,\n\u001b[1;32m 592\u001b[0m \u001b[0mlabel_lower_bound\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlabel_lower_bound\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m()\u001b[0m\n\u001b[1;32m 506\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase_margin\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbase_margin\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeature_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfeature_names\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m()\u001b[0m\n\u001b[1;32m 419\u001b[0m 
\u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 420\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 421\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 422\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 423\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36mset_info\u001b[0;34m()\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[0;34m'''Set meta info for DMatrix.'''\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlabel\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 529\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_label\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 530\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mweight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_weight\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36mset_label\u001b[0;34m()\u001b[0m\n\u001b[1;32m 656\u001b[0m \"\"\"\n\u001b[1;32m 657\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdispatch_meta_backend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 658\u001b[0;31m \u001b[0mdispatch_meta_backend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'label'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'float'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 659\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 660\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mset_weight\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/data.py\u001b[0m in \u001b[0;36mdispatch_meta_backend\u001b[0;34m()\u001b[0m\n\u001b[1;32m 674\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'float'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 675\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32mor\u001b[0m 
\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 676\u001b[0;31m \u001b[0m_meta_from_numpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 677\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 678\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_is_dlpack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/data.py\u001b[0m in \u001b[0;36m_meta_from_numpy\u001b[0;34m()\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0mptr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minterface\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'data'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 599\u001b[0m \u001b[0mptr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mctypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_void_p\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 600\u001b[0;31m _check_call(_LIB.XGDMatrixSetDenseInfo(\n\u001b[0m\u001b[1;32m 601\u001b[0m 
\u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 602\u001b[0m \u001b[0mc_str\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfield\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/ctypes/__init__.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m()\u001b[0m\n\u001b[1;32m 375\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'__'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'__'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 377\u001b[0;31m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 378\u001b[0m \u001b[0msetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/envs/saturn/lib/python3.7/ctypes/__init__.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m()\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 382\u001b[0;31m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_FuncPtr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 383\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname_or_ordinal\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: /opt/conda/envs/saturn/lib/libxgboost.so: undefined symbol: XGDMatrixSetDenseInfo" - ] - } - ], + "outputs": [], "source": [ "%%time\n", - "bst = xgb.dask.train(\n", + "result = xgb.dask.train(\n", " client=client,\n", " params={\n", " \"objective\": \"reg:squarederror\",\n", @@ -506,6 +253,25 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`xgb.dask.train()` produces a regular `xgb.core.Booster` object, the same model object produced by non-Dask training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "booster = result[\"booster\"]\n", + "type(booster)\n", + "\n", + "# xgboost.core.Booster" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -522,7 +288,7 @@ "import cloudpickle\n", "\n", "with open(f'{MODEL_PATH}/xgboost_dask.pkl', 'wb') as f:\n", - " cloudpickle.dump(xgb_reg, f)" + " cloudpickle.dump(booster, f)" ] }, { @@ -550,6 +316,28 @@ "taxi_test = prep_df(taxi_test)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`xgboost.dask.predict()` can be used to create predictiosn on a Dask collection using an XGBoost model object. Note that this model object is just a regular XGBoost booster, not a special Dask-specific model object.\n", + "\n", + "This function returns a Dask Array or Dask Series of predictions, depending on the input type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preds = xgb.dask.predict(\n", + " client=client,\n", + " model=booster,\n", + " data=taxi_test[features]\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -565,8 +353,7 @@ "source": [ "from dask_ml.metrics import mean_squared_error\n", "\n", - "preds = xgb_reg.predict(taxi_test[features])\n", - "mean_squared_error(taxi_test[y_col].to_dask_array(), preds, squared=False)" + "mean_squared_error(taxi_test[y_col].to_dask_array(), preds.to_dask_array(), squared=False)" ] } ], From 2f5403df83669438ad70ecaadde582c570ca95a4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Dec 2020 13:56:34 -0600 Subject: [PATCH 3/6] add start script --- examples/examples-cpu/.saturn/start | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 examples/examples-cpu/.saturn/start diff --git a/examples/examples-cpu/.saturn/start b/examples/examples-cpu/.saturn/start new file mode 100755 index 00000000..4c06ff48 --- /dev/null +++ b/examples/examples-cpu/.saturn/start @@ -0,0 
+1,6 @@ +pip uninstall -y dask-xgboost xgboost || true + +rm -f /opt/conda/envs/saturn/lib/libxgboost.so +rm -f /opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/lib/libxgboost.so + +pip install --upgrade 'xgboost>=1.3.0' From a16fe14996016abbca9cdf476408d6226fc43eaf Mon Sep 17 00:00:00 2001 From: james Date: Mon, 14 Dec 2020 20:21:27 +0000 Subject: [PATCH 4/6] typos --- examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb | 2 +- examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb b/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb index 6355b444..76d9da84 100644 --- a/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb +++ b/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb @@ -382,7 +382,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`xgboost.dask.predict()` can be used to create predictiosn on a Dask collection using an XGBoost model object. Note that this model object is just a regular XGBoost booster, not a special Dask-specific model object.\n", + "`xgboost.dask.predict()` can be used to create predictions on a Dask collection using an XGBoost model object. Because the model object here is just a regular XGBoost model, using `xgboost.dask` for batch scoring doesn't require that you also perform training on Dask.\n", "\n", "This function returns a Dask Array or Dask Series of predictions, depending on the input type." ] diff --git a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb index ba5bdd80..101cfc02 100644 --- a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb +++ b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb @@ -320,7 +320,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`xgboost.dask.predict()` can be used to create predictiosn on a Dask collection using an XGBoost model object. 
Note that this model object is just a regular XGBoost booster, not a special Dask-specific model object.\n", + "`xgboost.dask.predict()` can be used to create predictions on a Dask collection using an XGBoost model object. Because the model object here is just a regular XGBoost model, using `xgboost.dask` for batch scoring doesn't require that you also perform training on Dask.\n", "\n", "This function returns a Dask Array or Dask Series of predictions, depending on the input type." ] From d6195bef6312e6eeb7dbf9693202f9d2cafe0b1d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Dec 2020 12:29:00 -0600 Subject: [PATCH 5/6] remove start script --- examples/examples-cpu/.saturn/saturn.json | 2 +- examples/examples-cpu/.saturn/start | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) delete mode 100755 examples/examples-cpu/.saturn/start diff --git a/examples/examples-cpu/.saturn/saturn.json b/examples/examples-cpu/.saturn/saturn.json index 299886be..10c3f7b0 100644 --- a/examples/examples-cpu/.saturn/saturn.json +++ b/examples/examples-cpu/.saturn/saturn.json @@ -1,5 +1,5 @@ { - "image": "saturncloud/saturn:2020.11.30", + "image": "saturncloud/saturn:2020.12.16-dev", "jupyter": { "size": "large", "disk_space": "10Gi", diff --git a/examples/examples-cpu/.saturn/start b/examples/examples-cpu/.saturn/start deleted file mode 100755 index 4c06ff48..00000000 --- a/examples/examples-cpu/.saturn/start +++ /dev/null @@ -1,6 +0,0 @@ -pip uninstall -y dask-xgboost xgboost || true - -rm -f /opt/conda/envs/saturn/lib/libxgboost.so -rm -f /opt/conda/envs/saturn/lib/python3.7/site-packages/xgboost/lib/libxgboost.so - -pip install --upgrade 'xgboost>=1.3.0' From 228598f5e4aaad48fcb9ba64b483c8eda6b42789 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 16 Dec 2020 15:43:50 -0600 Subject: [PATCH 6/6] fix typos --- examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb | 2 +- examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb | 2 +- 2 files changed, 2 insertions(+), 
2 deletions(-) diff --git a/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb b/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb index 76d9da84..b776b77a 100644 --- a/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb +++ b/examples/examples-cpu/nyc-taxi-snowflake/xgboost-dask.ipynb @@ -276,7 +276,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Training data for `xgboost.dask` needs to be prepared in a special object called `DaskDMatrix`. This is like the XGBoost `DMatrix` that you might be familiar with, but is backed by Dasks's distributed collections (Dask DataFrame and Dask Array)." + "Training data for `xgboost.dask` needs to be prepared in a special object called `DaskDMatrix`. This is like the XGBoost `DMatrix` that you might be familiar with, but is backed by Dask's distributed collections (Dask DataFrame and Dask Array)." ] }, { diff --git a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb index 101cfc02..8e469213 100644 --- a/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb +++ b/examples/examples-cpu/nyc-taxi/xgboost-dask.ipynb @@ -210,7 +210,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Training data for `xgboost.dask` needs to be prepared in a special object called `DaskDMatrix`. This is like the XGBoost `DMatrix` that you might be familiar with, but is backed by Dasks's distributed collections (Dask DataFrame and Dask Array)." + "Training data for `xgboost.dask` needs to be prepared in a special object called `DaskDMatrix`. This is like the XGBoost `DMatrix` that you might be familiar with, but is backed by Dask's distributed collections (Dask DataFrame and Dask Array)." ] }, {