Skip to content

Commit eb8b8b7

Browse files
ayushdg, rjzamora
authored and committed
Stricter check for query planning. (NVIDIA#107)
* Stricter query planning checks with newer versions of dask
  Signed-off-by: Ayush Dattagupta <[email protected]>
* Add checks to tests/__init__
  Signed-off-by: Ayush Dattagupta <[email protected]>
* Check sys.modules to ensure dask-expr is not enabled
  Signed-off-by: Ayush Dattagupta <[email protected]>
* Search for "dask_expr" in sys modules
  Co-authored-by: Richard (Rick) Zamora <[email protected]>
  Signed-off-by: Ayush Dattagupta <[email protected]>
* use dask_expr instead of dask-expr
  Signed-off-by: Ayush Dattagupta <[email protected]>
---------
Signed-off-by: Ayush Dattagupta <[email protected]>
Co-authored-by: Richard (Rick) Zamora <[email protected]>
1 parent 274d3a9 commit eb8b8b7

File tree

4 files changed

+26
-10
lines changed

4 files changed

+26
-10
lines changed

examples/fuzzy_deduplication.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@
1616
import time
1717

1818
import dask
19-
from dask import dataframe as dd
2019

2120
from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
2221
from nemo_curator.datasets import DocumentDataset
@@ -49,6 +48,8 @@ def main(args):
4948

5049
t0 = time.time()
5150
if filetype == "parquet":
51+
from dask import dataframe as dd
52+
5253
input_dataset = DocumentDataset(
5354
dd.read_parquet(
5455
dataset_dir,

examples/slurm/start-slurm.sh

+1
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,7 @@ export CUDF_SPILL="1"
6767
export RMM_SCHEDULER_POOL_SIZE="1GB"
6868
export RMM_WORKER_POOL_SIZE="72GiB"
6969
export LIBCUDF_CUFILE_POLICY=OFF
70+
export DASK_DATAFRAME__QUERY_PLANNING=False
7071

7172

7273
# =================================================================

nemo_curator/__init__.py

+11 -4
Original file line number | Diff line number | Diff line change
@@ -12,15 +12,22 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import sys
16+
1517
import dask
1618

1719
# Disable query planning if possible
1820
# https://github.com/NVIDIA/NeMo-Curator/issues/73
19-
if dask.config.get("dataframe.query-planning") is True:
21+
if dask.config.get("dataframe.query-planning") is True or "dask_expr" in sys.modules:
2022
raise NotImplementedError(
21-
"NeMo Curator does not support query planning yet. "
22-
"Please disable query planning before importing "
23-
"`nemo_curator`, `dask.dataframe` or `dask_cudf`."
23+
"""
24+
NeMo Curator does not support query planning yet.
25+
Please disable query planning before importing
26+
`dask.dataframe` or `dask_cudf`. This can be done via:
27+
`export DASK_DATAFRAME__QUERY_PLANNING=False`, or
28+
importing `dask.dataframe/dask_cudf` after importing
29+
`nemo_curator`.
30+
"""
2431
)
2532
else:
2633
dask.config.set({"dataframe.query-planning": False})

tests/__init__.py

+12 -5
Original file line number | Diff line number | Diff line change
@@ -12,15 +12,22 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import sys
16+
1517
import dask
1618

17-
# Disable query planning before any tests are loaded
19+
# Disable query planning if possible
1820
# https://github.com/NVIDIA/NeMo-Curator/issues/73
19-
if dask.config.get("dataframe.query-planning") is True:
21+
if dask.config.get("dataframe.query-planning") is True or "dask_expr" in sys.modules:
2022
raise NotImplementedError(
21-
"NeMo Curator does not support query planning yet. "
22-
"Please disable query planning before importing "
23-
"`nemo_curator`, `dask.dataframe` or `dask_cudf`."
23+
"""
24+
NeMo Curator does not support query planning yet.
25+
Please disable query planning before importing
26+
`dask.dataframe` or `dask_cudf`. This can be done via:
27+
`export DASK_DATAFRAME__QUERY_PLANNING=False`, or
28+
importing `dask.dataframe/dask_cudf` after importing
29+
`nemo_curator`.
30+
"""
2431
)
2532
else:
2633
dask.config.set({"dataframe.query-planning": False})

0 commit comments

Comments
 (0)