From 21dd34301eaef3378911223f74356cd2ce09c477 Mon Sep 17 00:00:00 2001
From: Jeffrey Martin
Date: Thu, 27 Jun 2024 08:22:15 -0500
Subject: [PATCH] Refactor `huggingface` config support (#742)

* do not override config deprefix_prompt

Signed-off-by: Jeffrey Martin

* improve code reuse

* consolidate `__init__` where possible

* shift generator or model object creation to `_load_client()`

Signed-off-by: Jeffrey Martin

* crude implementation of limitation on parallel generator call

Signed-off-by: Jeffrey Martin

* add torch `mps` support & enable passing pipeline params

* detect cuda vs mps vs cpu in a common way

* guard import of OptimumPipeline

Signed-off-by: Jeffrey Martin

* enable hf model or pipeline config in `hf_args`

* support all generic `pipeline` args at all times

* adds `do_sample` when `model` is a parameter to the `Callable`

* adds `low_cpu_mem_usage` and all `pipeline` args for `Callables` without `model`

* consolidates optimal device selection & set when not provided by config

Signed-off-by: Jeffrey Martin

* amend yaml config example

* support merged dictionary in `Configurable`

Signed-off-by: Jeffrey Martin

* free tokenizer in _clear_client

Signed-off-by: Jeffrey Martin

* explicit device support

* raise error when passed negative device integer

* rename parameter tracking var

* remove unused import

* add tests for `_select_hf_device()`

Signed-off-by: Jeffrey Martin

---------

Signed-off-by: Jeffrey Martin
---
 garak/configurable.py                |   5 +
 garak/generators/base.py             |   1 +
 garak/generators/huggingface.py      | 319 ++++++++++++++++-----------
 garak/probes/base.py                 |   2 +
 tests/generators/test_huggingface.py |  40 +++-
 tests/test_config.py                 |  17 +-
 tests/test_configurable.py           |  19 +-
 7 files changed, 265 insertions(+), 138 deletions(-)

diff --git a/garak/configurable.py b/garak/configurable.py
index 7ad768a7..efb9f566 100644
--- a/garak/configurable.py
+++ b/garak/configurable.py
@@ -88,6 +88,8 @@ def _apply_config(self, config):
                 )
             ):
                 continue
+            if isinstance(v, dict):  # if value is an existing dictionary merge
+                v = getattr(self, k) | v
             setattr(self, k, v)  # This will set attribute to the full dictionary value
 
     def _apply_missing_instance_defaults(self):
@@ -96,6 +98,9 @@ def _apply_missing_instance_defaults(self):
         for k, v in self.DEFAULT_PARAMS.items():
             if not hasattr(self, k):
                 setattr(self, k, v)
+            elif isinstance(v, dict):
+                v = v | getattr(self, k)
+                setattr(self, k, v)
 
     def _validate_env_var(self):
         if hasattr(self, "key_env_var"):
diff --git a/garak/generators/base.py b/garak/generators/base.py
index b57c52d1..4ec02a45 100644
--- a/garak/generators/base.py
+++ b/garak/generators/base.py
@@ -27,6 +27,7 @@ class Generator(Configurable):
 
     active = True
     generator_family_name = None
+    parallel_capable = True
 
     # support mainstream any-to-any large models
     # legal element for str list `modality['in']`: 'text', 'image', 'audio', 'video', '3d'
diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index f7bffa27..15a8bfe0 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -14,9 +14,11 @@
 https://huggingface.co/docs/api-inference/quicktour
 """
 
+import inspect
 import logging
+import os
 import re
-from typing import List, Union
+from typing import Callable, List, Union
 import warnings
 
 import backoff
@@ -25,22 +27,22 @@
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 
 from garak import _config
-from garak.exception import ModelNameMissingError
+from garak.exception import ModelNameMissingError, GarakException
 from garak.generators.base import Generator
 
 models_to_deprefix = ["gpt2"]
 
 
-class HFRateLimitException(Exception):
+class HFRateLimitException(GarakException):
     pass
 
 
-class HFLoadingException(Exception):
+class HFLoadingException(GarakException):
     pass
 
 
-class HFInternalServerError(Exception):
+class HFInternalServerError(GarakException):
     pass
 
 
@@ -50,53 +52,138 @@ def _set_hf_context_len(self, config):
         if isinstance(config.n_ctx, int):
             self.context_len = config.n_ctx
 
+    def _gather_hf_params(self, hf_constructor: Callable):
+        # this may be a bit too naive as it will pass any parameter valid for the pipeline signature
+        # this falls over when passed `from_pretrained` methods, as the callable model params are not explicit
+        params = self.hf_args
+        if params["device"] is None:
+            params["device"] = self.device
+
+        args = {}
+
+        params_to_process = inspect.signature(hf_constructor).parameters
+
+        if "model" in params_to_process:
+            args["model"] = self.name
+            # expand for
+            params_to_process = {"do_sample": True} | params_to_process
+        else:
+            # callable is for a Pretrained class; also map standard `pipeline` params
+            from transformers import pipeline
+
+            params_to_process = (
+                {"low_cpu_mem_usage": True}
+                | params_to_process
+                | inspect.signature(pipeline).parameters
+            )
+
+        for k in params_to_process:
+            if k == "model":
+                continue  # special case: `model` comes from `name` in the generator
+            if k in params:
+                val = params[k]
+                if k == "torch_dtype" and hasattr(torch, val):
+                    args[k] = getattr(
+                        torch, val
+                    )  # some model type specific classes do not yet support direct string representation
+                    continue
+                if (
+                    k == "device"
+                    and "device_map" in params_to_process
+                    and "device_map" in params
+                ):
+                    # per transformers convention, prefer `device_map` over `device`
+                    continue
+                args[k] = params[k]
+
+        return args
+
+    def _select_hf_device(self):
+        """Determine the most efficient device for tensor load, honoring any `device` already selected"""
+        import torch.cuda
+
+        selected_device = None
+        if self.hf_args.get("device", None) is not None:
+            if isinstance(self.hf_args["device"], int):
+                # this assumes that an integer-only device selection means `cuda`
+                if self.hf_args["device"] < 0:
+                    msg = f"device {self.hf_args['device']} requested but CUDA device numbering starts at zero. Use 'device: cpu' to request CPU."
+                    logging.critical(msg)
+                    raise ValueError(msg)
+                selected_device = torch.device("cuda:" + str(self.hf_args["device"]))
+            else:
+                selected_device = torch.device(self.hf_args["device"])
+
+        if selected_device is None:
+            selected_device = torch.device(
+                "cuda"
+                if torch.cuda.is_available()
+                else "mps" if torch.backends.mps.is_available() else "cpu"
+            )
+
+        if isinstance(selected_device, torch.device) and selected_device.type == "mps":
+            os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+            logging.debug("Enabled MPS fallback environment variable")
+
+        logging.debug(
+            "Using %s, based on torch environment evaluation", selected_device
+        )
+        return selected_device
+
 
 class Pipeline(Generator, HFCompatible):
     """Get text generations from a locally-run Hugging Face pipeline"""
 
+    DEFAULT_PARAMS = Generator.DEFAULT_PARAMS | {
+        "generations": 10,
+        "hf_args": {
+            "torch_dtype": "float16",
+            "do_sample": True,
+            "device": None,
+        },
+    }
     generator_family_name = "Hugging Face 🤗 pipeline"
     supports_multiple_generations = True
+    parallel_capable = False
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
+    def __init__(self, name="", config_root=_config):
         self.name = name
-        self.generations = generations
-        self.do_sample = do_sample
-        self.device = device
-        self._load_config(config_root)
-        super().__init__(
-            self.name, generations=self.generations, config_root=config_root
-        )
+        super().__init__(self.name, config_root=config_root)
+
+        import torch.multiprocessing as mp
+
+        mp.set_start_method("spawn", force=True)
+
+        self.device = self._select_hf_device()
+        self._load_client()
+
+    def _load_client(self):
+        if hasattr(self, "generator") and self.generator is not None:
+            return
 
         from transformers import pipeline, set_seed
 
         if _config.run.seed is not None:
             set_seed(_config.run.seed)
 
-        import torch.cuda
-
-        if not torch.cuda.is_available():
-            logging.debug("Using CPU, torch.cuda.is_available() returned False")
-            self.device = -1
-
-        self.generator = pipeline(
-            "text-generation",
-            model=self.name,
-            do_sample=self.do_sample,
-            device=self.device,
-        )
-        self.deprefix_prompt = self.name in models_to_deprefix
+        pipeline_kwargs = self._gather_hf_params(hf_constructor=pipeline)
+        self.generator = pipeline("text-generation", **pipeline_kwargs)
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
             if _config.run.deprefix is True:
                 self.deprefix_prompt = True
 
-        self._set_hf_context_len(self.generator.model.config)
+        self._set_hf_context_len(self.generator.model.config)
+
+    def _clear_client(self):
+        self.generator = None
 
     def _call_model(
         self, prompt: str, generations_this_call: int = 1
     ) -> List[Union[str, None]]:
+        self._load_client()
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=UserWarning)
             try:
@@ -135,21 +222,18 @@ class OptimumPipeline(Pipeline, HFCompatible):
     supports_multiple_generations = True
     doc_uri = "https://huggingface.co/blog/optimum-nvidia"
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
-        self.name = name
+    def _load_client(self):
+        if hasattr(self, "generator") and self.generator is not None:
+            return
 
-        super().__init__(
-            self.name,
-            do_sample=do_sample,
-            generations=generations,
-            device=device,
-            config_root=config_root,
-        )
-
-        from optimum.nvidia.pipelines import pipeline
-        from transformers import set_seed
+        try:
+            from optimum.nvidia.pipelines import pipeline
+            from transformers import set_seed
+        except Exception as e:
+            logging.exception(e)
+            raise GarakException(
+                f"Missing required dependencies for {self.__class__.__name__}"
+            )
 
         if _config.run.seed is not None:
             set_seed(_config.run.seed)
@@ -159,21 +243,17 @@ def __init__(
         if not torch.cuda.is_available():
             message = "OptimumPipeline needs CUDA, but torch.cuda.is_available() returned False; quitting"
             logging.critical(message)
-            raise ValueError(message)
+            raise GarakException(message)
 
-        use_fp8 = False
+        self.use_fp8 = False
         if _config.loaded:
             if "use_fp8" in _config.plugins.generators.OptimumPipeline:
-                use_fp8 = True
-
-        self.generator = pipeline(
-            "text-generation",
-            model=self.name,
-            do_sample=self.do_sample,
-            device=self.device,
-            use_fp8=use_fp8,
-        )
-        self.deprefix_prompt = name in models_to_deprefix
+                self.use_fp8 = True
+
+        pipeline_kwargs = self._gather_hf_params(hf_constructor=pipeline)
+        self.generator = pipeline("text-generation", **pipeline_kwargs)
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
             if _config.run.deprefix is True:
                 self.deprefix_prompt = True
@@ -181,45 +261,28 @@ def __init__(
         self._set_hf_context_len(self.generator.model.config)
 
 
-class ConversationalPipeline(Generator, HFCompatible):
+class ConversationalPipeline(Pipeline, HFCompatible):
     """Conversational text generation using HuggingFace pipelines"""
 
     generator_family_name = "Hugging Face 🤗 pipeline for conversations"
     supports_multiple_generations = True
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
-        self.name = name
-        self.do_sample = do_sample
-        self.generations = generations
-        self.device = device
-
-        super().__init__(
-            self.name, generations=self.generations, config_root=config_root
-        )
+    def _load_client(self):
+        if hasattr(self, "generator") and self.generator is not None:
+            return
 
         from transformers import pipeline, set_seed, Conversation
 
         if _config.run.seed is not None:
             set_seed(_config.run.seed)
 
-        import torch.cuda
-
-        if not torch.cuda.is_available():
-            logging.debug("Using CPU, torch.cuda.is_available() returned False")
-            self.device = -1
-
         # Note that with pipeline, in order to access the tokenizer, model, or device, you must get the attribute
         # directly from self.generator instead of from the ConversationalPipeline object itself.
-        self.generator = pipeline(
-            "conversational",
-            model=self.name,
-            do_sample=self.do_sample,
-            device=self.device,
-        )
+        pipeline_kwargs = self._gather_hf_params(hf_constructor=pipeline)
+        self.generator = pipeline("conversational", **pipeline_kwargs)
         self.conversation = Conversation()
-        self.deprefix_prompt = self.name in models_to_deprefix
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
             if _config.run.deprefix is True:
                 self.deprefix_prompt = True
@@ -236,12 +299,13 @@ def _call_model(
     ) -> List[Union[str, None]]:
         """Take a conversation as a list of dictionaries and feed it to the model"""
 
+        self._load_client()
         # If conversation is provided as a list of dicts, create the conversation.
        # Otherwise, maintain state in Generator
         if isinstance(prompt, str):
             self.conversation.add_message({"role": "user", "content": prompt})
             self.conversation = self.generator(self.conversation)
-            generations = [self.conversation[-1]["content"]]
+            generations = [self.conversation[-1]["content"]]  # what is this doing?
         elif isinstance(prompt, list):
             from transformers import Conversation
@@ -262,7 +326,7 @@ def _call_model(
         return [re.sub("^" + re.escape(prompt), "", _o) for _o in outputs]
 
 
-class InferenceAPI(Generator, HFCompatible):
+class InferenceAPI(Generator):
     """Get text generations from Hugging Face Inference API"""
 
     generator_family_name = "Hugging Face 🤗 Inference API"
@@ -391,7 +455,7 @@ def _pre_generate_hook(self):
         self.wait_for_model = False
 
 
-class InferenceEndpoint(InferenceAPI, HFCompatible):
+class InferenceEndpoint(InferenceAPI):
     """Interface for Hugging Face private endpoints
     Pass the model URL as the name, e.g. https://xxx.aws.endpoints.huggingface.cloud
     """
@@ -448,53 +512,40 @@ def _call_model(
         return [output]
 
 
-class Model(Generator, HFCompatible):
+class Model(Pipeline, HFCompatible):
     """Get text generations from a locally-run Hugging Face model"""
 
     generator_family_name = "Hugging Face 🤗 model"
     supports_multiple_generations = True
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
-        self.name = name
-        self.device = device
-        self.generations = generations
-
-        super().__init__(
-            self.name, generations=self.generations, config_root=config_root
-        )
+    def _load_client(self):
+        if hasattr(self, "model") and self.model is not None:
+            return
 
         import transformers
 
         if _config.run.seed is not None:
             transformers.set_seed(_config.run.seed)
 
-        self.init_device = "cuda:" + str(self.device)
-        import torch.cuda
-
-        if not torch.cuda.is_available():
-            logging.debug("Using CPU, torch.cuda.is_available() returned False")
-            self.device = -1
-            self.init_device = "cpu"
-
         trust_remote_code = self.name.startswith("mosaicml/mpt-")
+        model_kwargs = self._gather_hf_params(
+            hf_constructor=transformers.AutoConfig.from_pretrained
+        )  # will defer to device_map if device_map was `auto`; may not match self.device
+
         self.config = transformers.AutoConfig.from_pretrained(
-            self.name, trust_remote_code=trust_remote_code
-        )
-        self.config.init_device = (
-            self.init_device  # or "cuda:0" For fast initialization directly on GPU!
+            self.name, trust_remote_code=trust_remote_code, **model_kwargs
         )
         self._set_hf_context_len(self.config)
+        self.config.init_device = self.device  # determined by Pipeline `__init__`
 
         self.model = transformers.AutoModelForCausalLM.from_pretrained(
-            self.name,
-            config=self.config,
-        ).to(self.init_device)
+            self.name, config=self.config
+        ).to(self.device)
 
-        self.deprefix_prompt = self.name in models_to_deprefix
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
 
         if self.config.tokenizer_class:
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -505,18 +556,24 @@ def __init__(
                 self.name, padding_side="left"
             )
 
-        self.do_sample = do_sample
         self.generation_config = transformers.GenerationConfig.from_pretrained(
             self.name
         )
         self.generation_config.eos_token_id = self.model.config.eos_token_id
         self.generation_config.pad_token_id = self.model.config.eos_token_id
 
+    def _clear_client(self):
+        self.model = None
+        self.config = None
+        self.tokenizer = None
+        self.generation_config = None
+
     def _call_model(
         self, prompt: str, generations_this_call: int = 1
     ) -> List[Union[str, None]]:
+        self._load_client()
         self.generation_config.max_new_tokens = self.max_tokens
-        self.generation_config.do_sample = self.do_sample
+        self.generation_config.do_sample = self.hf_args["do_sample"]
         self.generation_config.num_return_sequences = generations_this_call
         if self.temperature is not None:
             self.generation_config.temperature = self.temperature
@@ -529,7 +586,7 @@ def _call_model(
             with torch.no_grad():
                 inputs = self.tokenizer(
                     prompt, truncation=True, return_tensors="pt"
-                ).to(self.init_device)
+                ).to(self.device)
 
                 try:
                     outputs = self.model.generate(
@@ -553,21 +610,23 @@ def _call_model(
         return [re.sub("^" + re.escape(prompt), "", i) for i in text_output]
 
 
-class LLaVA(Generator):
+class LLaVA(Generator, HFCompatible):
     """Get LLaVA ([ text + image ] -> text) generations"""
 
     DEFAULT_PARAMS = Generator.DEFAULT_PARAMS | {
+        "max_tokens": 4000,
         # "exist_tokens + max_new_tokens < 4K is the golden rule."
         # https://github.com/haotian-liu/LLaVA/issues/1095#:~:text=Conceptually%2C%20as%20long%20as%20the%20total%20tokens%20are%20within%204K%2C%20it%20would%20be%20fine%2C%20so%20exist_tokens%20%2B%20max_new_tokens%20%3C%204K%20is%20the%20golden%20rule.
-        "max_tokens": 4000,
-        # consider shifting below to kwargs or llava_kwargs that is a dict to allow more customization
-        "torch_dtype": torch.float16,
-        "low_cpu_mem_usage": True,
-        "device_map": "cuda:0",
+        "hf_args": {
+            "torch_dtype": "float16",
+            "low_cpu_mem_usage": True,
+            "device_map": "auto",
+        },
     }
 
     # rewrite modality setting
     modality = {"in": {"text", "image"}, "out": {"text"}}
+    parallel_capable = False
 
     # Support Image-Text-to-Text models
     # https://huggingface.co/llava-hf#:~:text=Llava-,Models,-9
@@ -582,20 +641,20 @@ def __init__(self, name="", generations=10, config_root=_config):
         super().__init__(name, generations=generations, config_root=config_root)
         if self.name not in self.supported_models:
             raise ModelNameMissingError(
-                f"Invalid modal name {self.name}, current support: {self.supported_models}."
+                f"Invalid model name {self.name}, current support: {self.supported_models}."
             )
+
+        self.device = self._select_hf_device()
+        model_kwargs = self._gather_hf_params(
+            hf_constructor=LlavaNextForConditionalGeneration.from_pretrained
+        )  # will defer to device_map if device_map was `auto`; may not match self.device
+
         self.processor = LlavaNextProcessor.from_pretrained(self.name)
         self.model = LlavaNextForConditionalGeneration.from_pretrained(
-            self.name,
-            torch_dtype=self.torch_dtype,
-            low_cpu_mem_usage=self.low_cpu_mem_usage,
+            self.name, **model_kwargs
         )
-        if torch.cuda.is_available():
-            self.model.to(self.device_map)
-        else:
-            raise RuntimeError(
-                "CUDA is not supported on this device. Please make sure CUDA is installed and configured properly."
-            )
+
+        self.model.to(self.device)
 
     def generate(
         self, prompt: str, generations_this_call: int = 1
@@ -609,7 +668,7 @@ def generate(
             raise Exception(e)
 
         inputs = self.processor(text_prompt, image_prompt, return_tensors="pt").to(
-            self.device_map
+            self.device
         )
         exist_token_number: int = inputs.data["input_ids"].shape[1]
         output = self.model.generate(
diff --git a/garak/probes/base.py b/garak/probes/base.py
index af0aa4c3..bc15a0bd 100644
--- a/garak/probes/base.py
+++ b/garak/probes/base.py
@@ -150,6 +150,7 @@ def _execute_attempt(self, this_attempt):
         return copy.deepcopy(this_attempt)
 
     def _execute_all(self, attempts) -> Iterable[garak.attempt.Attempt]:
+        """handles sending a set of attempts to the generator"""
         attempts_completed: Iterable[garak.attempt.Attempt] = []
 
         if (
@@ -157,6 +158,7 @@ def _execute_all(self, attempts) -> Iterable[garak.attempt.Attempt]:
             and _config.system.parallel_attempts > 1
             and self.parallelisable_attempts
             and len(attempts) > 1
+            and self.generator.parallel_capable
         ):
             from multiprocessing import Pool
 
diff --git a/tests/generators/test_huggingface.py b/tests/generators/test_huggingface.py
index f3381c94..6f6d19ec 100644
--- a/tests/generators/test_huggingface.py
+++ b/tests/generators/test_huggingface.py
@@ -1,11 +1,26 @@
+import pytest
 import transformers
 import garak.generators.huggingface
+from garak._config import GarakSubConfig
 
 DEFAULT_GENERATIONS_QTY = 10
 
 
 def test_pipeline():
-    g = garak.generators.huggingface.Pipeline("gpt2")
+    gen_config = {
+        "huggingface": {
+            "Pipeline": {
+                "name": "gpt2",
+                "hf_args": {
+                    "device": "cpu",
+                },
+            }
+        }
+    }
+    config_root = GarakSubConfig()
+    setattr(config_root, "generators", gen_config)
+
+    g = garak.generators.huggingface.Pipeline("gpt2", config_root=config_root)
     assert g.name == "gpt2"
     assert g.generations == DEFAULT_GENERATIONS_QTY
     assert isinstance(g.generator, transformers.pipelines.text_generation.Pipeline)
@@ -54,3 +69,26 @@ def test_model():
     assert len(output) == DEFAULT_GENERATIONS_QTY
     for item in output:
         assert item is None  # gpt2 is known raise exception returning `None`
+
+
+def test_select_hf_device():
+    from garak.generators.huggingface import HFCompatible
+    import torch
+
+    class mockHF(HFCompatible):
+        def __init__(self, key, value):
+            self.hf_args = {key: value}
+            pass
+
+    m = mockHF("device", -1)
+    with pytest.raises(ValueError) as exc_info:
+        device = m._select_hf_device()
+    assert "CUDA device numbering starts" in str(exc_info.value)
+
+    m = mockHF("device", "cpu")
+    device = m._select_hf_device()
+    assert device == torch.device("cpu")
+
+    m = mockHF("device_map", "auto")
+    device = m._select_hf_device()
+    assert isinstance(device, torch.device)
diff --git a/tests/test_config.py b/tests/test_config.py
index 8d33cef1..48aac522 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -19,23 +19,27 @@
 plugins:
   generators:
     huggingface:
-      dtype: general
-      gpu: 0
+      hf_args:
+        torch_dtype: float16
      Pipeline:
-        dtype: bfloat16
+        hf_args:
+          device: cuda
   probes:
     test:
       generators:
         huggingface:
           Pipeline:
-            dtype: for_probe
+            hf_args:
+              torch_dtype: float16
   detector:
     test:
       val: tests
       Blank:
        generators:
           huggingface:
-            gpu: 1
+            hf_args:
+              torch_dtype: float16
+              device: cuda:1
             Pipeline:
               dtype: for_detector
   buffs:
@@ -43,7 +47,8 @@
     Blank:
       generators:
         huggingface:
-          gpu: 1
+          hf_args:
+            device: cuda:0
           Pipeline:
             dtype: for_detector
 """.encode(
diff --git a/tests/test_configurable.py b/tests/test_configurable.py
index 4979beb1..7847f65b 100644
--- a/tests/test_configurable.py
+++ b/tests/test_configurable.py
@@ -24,7 +24,13 @@ class mockConfigurable(Configurable):
     # Configurable is coupled to hierarchy of plugin types
     __module__ = "garak.generators.mock"
 
-    DEFAULT_PARAMS = {"class_var": "from_class"}
+    DEFAULT_PARAMS = {
+        "class_var": "from_class",
+        "class_dict_var": {
+            "dict_a": "dict_val",
+            "dict_b": "dict_val",
+        },
+    }
 
     def __init__(
         self,
@@ -63,6 +69,17 @@ def test_param_provided(generator_sub_config):
 def test_class_vars_propagate_to_instance(generator_sub_config):
     m = mockConfigurable(config_root=generator_sub_config)
     assert m.class_var == m.DEFAULT_PARAMS["class_var"]
+    assert m.class_dict_var == m.DEFAULT_PARAMS["class_dict_var"]
+
+
+# when a default parameter dictionary is provided merge on the resulting object
+def test_class_dict_merge_to_instance(generator_sub_config):
+    config_dict_var = {"dict_a": "test_val", "dict_c": "test_val"}
+    generator_sub_config.generators["mock"]["class_dict_var"] = config_dict_var
+    m = mockConfigurable(config_root=generator_sub_config)
+    assert m.class_dict_var == m.DEFAULT_PARAMS["class_dict_var"] | config_dict_var
+    assert m.class_dict_var["dict_a"] == config_dict_var["dict_a"]
+    assert m.class_dict_var["dict_c"] == config_dict_var["dict_c"]
 
 
 # when a default parameter is provided and not config_root set on the resulting object
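
For reference, a minimal usage sketch of the reworked config path (illustrative only, not part of the patch): it mirrors `test_pipeline` and the `Configurable` dictionary merge above, so `hf_args` supplied in config are merged over the Pipeline `DEFAULT_PARAMS` rather than replacing them. Names and layout follow the tests in this patch and may differ in later garak versions.

# Illustrative sketch based on tests/generators/test_huggingface.py in this patch;
# assumes garak with this change applied.
from garak._config import GarakSubConfig
import garak.generators.huggingface

gen_config = {
    "huggingface": {
        "Pipeline": {
            "name": "gpt2",
            "hf_args": {
                "device": "cpu",  # omit to let _select_hf_device() pick cuda / mps / cpu
            },
        }
    }
}
config_root = GarakSubConfig()
setattr(config_root, "generators", gen_config)

g = garak.generators.huggingface.Pipeline("gpt2", config_root=config_root)
# keys not set in the config keep their DEFAULT_PARAMS values via the dict merge
assert g.hf_args["device"] == "cpu" and g.hf_args["do_sample"] is True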