File tree: 4 files changed, +26 -10 lines changed
Original file line number | Diff line number | Diff line change
16
16
import time
17
17
18
18
import dask
19
- from dask import dataframe as dd
20
19
21
20
from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
22
21
from nemo_curator.datasets import DocumentDataset
@@ -49,6 +48,8 @@ def main(args):
49
48
50
49
t0 = time.time()
51
50
if filetype == "parquet":
51
+ from dask import dataframe as dd
52
+
52
53
input_dataset = DocumentDataset(
53
54
dd.read_parquet(
54
55
dataset_dir ,
Original file line number Diff line number Diff line change @@ -67,6 +67,7 @@ export CUDF_SPILL="1"
67
67
export RMM_SCHEDULER_POOL_SIZE="1GB"
68
68
export RMM_WORKER_POOL_SIZE="72GiB"
69
69
export LIBCUDF_CUFILE_POLICY=OFF
70
+ export DASK_DATAFRAME__QUERY_PLANNING=False
70
71
71
72
72
73
# =================================================================
Original file line number Diff line number Diff line change 12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
14
15
+ import sys
16
+
15
17
import dask
16
18
17
19
# Disable query planning if possible
18
20
# https://github.com/NVIDIA/NeMo-Curator/issues/73
19
- if dask.config.get("dataframe.query-planning") is True:
21
+ if dask.config.get("dataframe.query-planning") is True or "dask_expr" in sys.modules:
20
22
raise NotImplementedError(
21
- "NeMo Curator does not support query planning yet. "
22
- "Please disable query planning before importing "
23
- "`nemo_curator`, `dask.dataframe` or `dask_cudf`."
23
+ """
24
+ NeMo Curator does not support query planning yet.
25
+ Please disable query planning before importing
26
+ `dask.dataframe` or `dask_cudf`. This can be done via:
27
+ `export DASK_DATAFRAME__QUERY_PLANNING=False`, or
28
+ importing `dask.dataframe/dask_cudf` after importing
29
+ `nemo_curator`.
30
+ """
24
31
)
25
32
else:
26
33
dask.config.set({"dataframe.query-planning": False})
Original file line number Diff line number Diff line change 12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
14
15
+ import sys
16
+
15
17
import dask
16
18
17
- # Disable query planning before any tests are loaded
19
+ # Disable query planning if possible
18
20
# https://github.com/NVIDIA/NeMo-Curator/issues/73
19
- if dask.config.get("dataframe.query-planning") is True:
21
+ if dask.config.get("dataframe.query-planning") is True or "dask_expr" in sys.modules:
20
22
raise NotImplementedError(
21
- "NeMo Curator does not support query planning yet. "
22
- "Please disable query planning before importing "
23
- "`nemo_curator`, `dask.dataframe` or `dask_cudf`."
23
+ """
24
+ NeMo Curator does not support query planning yet.
25
+ Please disable query planning before importing
26
+ `dask.dataframe` or `dask_cudf`. This can be done via:
27
+ `export DASK_DATAFRAME__QUERY_PLANNING=False`, or
28
+ importing `dask.dataframe/dask_cudf` after importing
29
+ `nemo_curator`.
30
+ """
24
31
)
25
32
else:
26
33
dask.config.set({"dataframe.query-planning": False})
You can’t perform that action at this time.
0 commit comments