diff --git a/experimental/attention/llama3_attention.py b/experimental/attention/llama3_attention.py index 59851eb9d7..2288767f1d 100644 --- a/experimental/attention/llama3_attention.py +++ b/experimental/attention/llama3_attention.py @@ -1,10 +1,10 @@ +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.utils import dispatch_for_generation -from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs # Select model and load it. model_id = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/experimental/attention/llama3_attention_r3_nvfp4.py b/experimental/attention/llama3_attention_r3_nvfp4.py index 753db49b0c..77d95d9e02 100644 --- a/experimental/attention/llama3_attention_r3_nvfp4.py +++ b/experimental/attention/llama3_attention_r3_nvfp4.py @@ -1,3 +1,5 @@ +from compressed_tensors.quantization import QuantizationScheme +from compressed_tensors.quantization.quant_scheme import NVFP4 from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer @@ -5,8 +7,6 @@ from llmcompressor.modifiers.quantization import QuantizationModifier from llmcompressor.modifiers.transform import SpinQuantModifier from llmcompressor.utils import dispatch_for_generation -from compressed_tensors.quantization import QuantizationScheme -from compressed_tensors.quantization.quant_scheme import NVFP4 # Select model and load it. 
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 2ae36e6199..d0be896cc1 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -31,6 +31,9 @@ from datasets import Dataset, DatasetDict +TOKENIZERS_PARALLELISM_ENV = "TOKENIZERS_PARALLELISM" + + class Oneshot: """ Class responsible for carrying out one-shot calibration on a pretrained model. @@ -121,6 +124,19 @@ def __init__( :param log_dir: Path to save logs during oneshot run. Nothing is logged to file if None. """ + # Disable tokenizer parallelism to prevent a warning when using + # multiprocessing for dataset preprocessing. The warning occurs because + # FastTokenizer's internal threading conflicts with dataset.map's num_proc. + # See: https://github.com/vllm-project/llm-compressor/issues/2007 + if TOKENIZERS_PARALLELISM_ENV not in os.environ: + os.environ[TOKENIZERS_PARALLELISM_ENV] = "false" + logger.warning( + "Disabling tokenizer parallelism due to threading conflict between " + "FastTokenizer and Datasets. Set " + f"{TOKENIZERS_PARALLELISM_ENV}=false to " + "suppress this warning." + ) + # Set up file logging (no default files): # 1) If LLM_COMPRESSOR_LOG_FILE is set, log to that file. # 2) Else, if an explicit log_dir is provided, create a timestamped file there. 
diff --git a/tests/llmcompressor/transformers/oneshot/test_tokenizer_parallelism.py b/tests/llmcompressor/transformers/oneshot/test_tokenizer_parallelism.py new file mode 100644 index 0000000000..e58f9e42bb --- /dev/null +++ b/tests/llmcompressor/transformers/oneshot/test_tokenizer_parallelism.py @@ -0,0 +1,47 @@ +import os + +import pytest + +from llmcompressor.entrypoints.oneshot import ( + TOKENIZERS_PARALLELISM_ENV as _TOKENIZERS_PARALLELISM_ENV, +) + + +class TestTokenizerParallelism: + """Tests for tokenizer parallelism warning suppression (issue #2007).""" + + def test_oneshot_sets_tokenizers_parallelism_when_not_set(self, monkeypatch): + """ + Test that Oneshot sets TOKENIZERS_PARALLELISM=false when not already set. + + This prevents the warning: + "huggingface/tokenizers: The current process just got forked, after + parallelism has already been used. Disabling parallelism to avoid deadlocks..." + + See: https://github.com/vllm-project/llm-compressor/issues/2007 + """ + monkeypatch.delenv(_TOKENIZERS_PARALLELISM_ENV, raising=False) + + from llmcompressor.entrypoints.oneshot import Oneshot + + # Create a minimal Oneshot instance to trigger __init__ + # We expect it to fail due to missing model, but the env var should be set + with pytest.raises(Exception): + Oneshot(model="nonexistent-model") + + assert os.environ[_TOKENIZERS_PARALLELISM_ENV] == "false" + + def test_oneshot_respects_existing_tokenizers_parallelism(self, monkeypatch): + """ + Test that Oneshot respects user's existing TOKENIZERS_PARALLELISM setting. + + If a user has explicitly set TOKENIZERS_PARALLELISM, we should not override it. 
+ """ + monkeypatch.setenv(_TOKENIZERS_PARALLELISM_ENV, "true") + + from llmcompressor.entrypoints.oneshot import Oneshot + + with pytest.raises(Exception): + Oneshot(model="nonexistent-model") + + assert os.environ[_TOKENIZERS_PARALLELISM_ENV] == "true" diff --git a/tools/collect_env.py b/tools/collect_env.py index 62a100ce3d..55dad830a6 100644 --- a/tools/collect_env.py +++ b/tools/collect_env.py @@ -3,9 +3,9 @@ creating bug reports. See `.github/ISSUE_TEMPLATE/bug_report.md` """ +import importlib import platform import sys -import importlib def get_version(pkg_name):