From 97c5916925f30a6218ebfbce36a79813f0d85804 Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Fri, 7 Jun 2024 10:36:49 -0500
Subject: [PATCH 1/9] do not override config deprefix_prompt

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/generators/huggingface.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index e2b94f6f..eaea294d 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -87,7 +87,8 @@ def __init__(
             do_sample=self.do_sample,
             device=self.device,
         )
-        self.deprefix_prompt = self.name in models_to_deprefix
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
             if _config.run.deprefix is True:
                 self.deprefix_prompt = True
@@ -173,7 +174,8 @@ def __init__(
             device=self.device,
             use_fp8=use_fp8,
         )
-        self.deprefix_prompt = name in models_to_deprefix
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
             if _config.run.deprefix is True:
                 self.deprefix_prompt = True
@@ -219,7 +221,8 @@ def __init__(
             device=self.device,
         )
         self.conversation = Conversation()
-        self.deprefix_prompt = self.name in models_to_deprefix
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
             if _config.run.deprefix is True:
                 self.deprefix_prompt = True
@@ -494,7 +497,8 @@ def __init__(
             config=self.config,
         ).to(self.init_device)
 
-        self.deprefix_prompt = self.name in models_to_deprefix
+        if not hasattr(self, "deprefix_prompt"):
+            self.deprefix_prompt = self.name in models_to_deprefix
 
         if self.config.tokenizer_class:
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(

From 62a2269d9001799f66d3da5e6c9eccb975f0088f Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Thu, 6 Jun 2024 11:27:07 -0500
Subject: [PATCH 2/9] improve code reuse

* consolidate `__init__` where possible
* shift generator or model object creation to `_load_client()`

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/generators/huggingface.py | 65 ++++++++++++++-------------------
 1 file changed, 28 insertions(+), 37 deletions(-)

diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index eaea294d..843bf836 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -64,12 +64,17 @@ def __init__(
         self.generations = generations
         self.do_sample = do_sample
         self.device = device
-        self._load_config(config_root)
 
         super().__init__(
             self.name, generations=self.generations, config_root=config_root
         )
 
+        self._load_client()
+
+    def _load_client(self):
+        if hasattr(self, "generator") and self.generator is not None:
+            return
+
         from transformers import pipeline, set_seed
 
         if _config.run.seed is not None:
@@ -95,9 +100,13 @@ def __init__(
 
                 self._set_hf_context_len(self.generator.model.config)
 
+    def _clear_client():
+        self.generator = None
+
     def _call_model(
         self, prompt: str, generations_this_call: int = 1
     ) -> List[Union[str, None]]:
+        self._load_client()
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=UserWarning)
             try:
@@ -136,18 +145,9 @@ class OptimumPipeline(Pipeline, HFCompatible):
     supports_multiple_generations = True
     doc_uri = "https://huggingface.co/blog/optimum-nvidia"
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
-        self.name = name
-
-        super().__init__(
-            self.name,
-            do_sample=do_sample,
-            generations=generations,
-            device=device,
-            config_root=config_root,
-        )
+    def _load_client(self):
+        if hasattr(self, "generator") and self.generator is not None:
+            return
 
         from optimum.nvidia.pipelines import pipeline
         from transformers import set_seed
@@ -183,23 +183,15 @@ def __init__(
         self._set_hf_context_len(self.generator.model.config)
 
 
-class ConversationalPipeline(Generator, HFCompatible):
+class ConversationalPipeline(Pipeline, HFCompatible):
     """Conversational text generation using HuggingFace pipelines"""
 
     generator_family_name = "Hugging Face 🤗 pipeline for conversations"
     supports_multiple_generations = True
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
-        self.name = name
-        self.do_sample = do_sample
-        self.generations = generations
-        self.device = device
-
-        super().__init__(
-            self.name, generations=self.generations, config_root=config_root
-        )
+    def _load_client(self):
+        if hasattr(self, "generator") and self.generator is not None:
+            return
 
         from transformers import pipeline, set_seed, Conversation
 
@@ -239,6 +231,7 @@ def _call_model(
     ) -> List[Union[str, None]]:
         """Take a conversation as a list of dictionaries and feed it to the model"""
 
+        self._load_client()
         # If conversation is provided as a list of dicts, create the conversation.
         # Otherwise, maintain state in Generator
         if isinstance(prompt, str):
@@ -451,22 +444,15 @@ def _call_model(
         return [output]
 
 
-class Model(Generator, HFCompatible):
+class Model(Pipeline, HFCompatible):
     """Get text generations from a locally-run Hugging Face model"""
 
     generator_family_name = "Hugging Face 🤗 model"
     supports_multiple_generations = True
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
-        self.name = name
-        self.device = device
-        self.generations = generations
-
-        super().__init__(
-            self.name, generations=self.generations, config_root=config_root
-        )
+    def _load_client(self):
+        if hasattr(self, "model") and self.model is not None:
+            return
 
         import transformers
 
@@ -509,16 +495,21 @@ def __init__(
                 self.name, padding_side="left"
             )
 
-        self.do_sample = do_sample
         self.generation_config = transformers.GenerationConfig.from_pretrained(
             self.name
         )
         self.generation_config.eos_token_id = self.model.config.eos_token_id
         self.generation_config.pad_token_id = self.model.config.eos_token_id
 
+    def _clear_client(self):
+        self.model = None
+        self.config = None
+        self.generation_config = None
+
     def _call_model(
         self, prompt: str, generations_this_call: int = 1
     ) -> List[Union[str, None]]:
+        self._load_client()
         self.generation_config.max_new_tokens = self.max_tokens
         self.generation_config.do_sample = self.do_sample
         self.generation_config.num_return_sequences = generations_this_call

From 11ebb73dcb020ba7e42e788972bc9c798e9eae33 Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Fri, 7 Jun 2024 15:45:37 -0500
Subject: [PATCH 3/9] crude implementation of limitation on parallel generator
 call

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/generators/base.py        | 1 +
 garak/generators/huggingface.py | 7 ++++---
 garak/probes/base.py            | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/garak/generators/base.py b/garak/generators/base.py
index b57c52d1..4ec02a45 100644
--- a/garak/generators/base.py
+++ b/garak/generators/base.py
@@ -27,6 +27,7 @@ class Generator(Configurable):
 
     active = True
     generator_family_name = None
+    parallel_capable = True
 
     # support mainstream any-to-any large models
     # legal element for str list `modality['in']`: 'text', 'image', 'audio', 'video', '3d'
diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index 843bf836..5047bbec 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -56,6 +56,7 @@ class Pipeline(Generator, HFCompatible):
 
     generator_family_name = "Hugging Face 🤗 pipeline"
     supports_multiple_generations = True
+    parallel_capable = False
 
     def __init__(
         self, name="", do_sample=True, generations=10, device=0, config_root=_config
@@ -98,9 +99,9 @@ def _load_client(self):
             if _config.run.deprefix is True:
                 self.deprefix_prompt = True
 
-                self._set_hf_context_len(self.generator.model.config)
+        self._set_hf_context_len(self.generator.model.config)
 
-    def _clear_client():
+    def _clear_client(self):
         self.generator = None
 
     def _call_model(
@@ -237,7 +238,7 @@ def _call_model(
         if isinstance(prompt, str):
             self.conversation.add_message({"role": "user", "content": prompt})
             self.conversation = self.generator(self.conversation)
-            generations = [self.conversation[-1]["content"]]
+            generations = [self.conversation[-1]["content"]]  # what is this doing?
 
         elif isinstance(prompt, list):
             from transformers import Conversation
diff --git a/garak/probes/base.py b/garak/probes/base.py
index af0aa4c3..bc15a0bd 100644
--- a/garak/probes/base.py
+++ b/garak/probes/base.py
@@ -150,6 +150,7 @@ def _execute_attempt(self, this_attempt):
         return copy.deepcopy(this_attempt)
 
     def _execute_all(self, attempts) -> Iterable[garak.attempt.Attempt]:
+        """handles sending a set of attempt to the generator"""
         attempts_completed: Iterable[garak.attempt.Attempt] = []
 
         if (
@@ -157,6 +158,7 @@ def _execute_all(self, attempts) -> Iterable[garak.attempt.Attempt]:
             and _config.system.parallel_attempts > 1
             and self.parallelisable_attempts
             and len(attempts) > 1
+            and self.generator.parallel_capable
         ):
             from multiprocessing import Pool
 

From fd06da18e5d08bb1244224fab2f455a20292e3b0 Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Mon, 10 Jun 2024 16:00:16 -0500
Subject: [PATCH 4/9] add torch `mps` support & enable passed pipeline params

* detect cuda vs mps vs cpu in a common way
* guard import of OptimumPipeline

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/generators/huggingface.py | 91 +++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 33 deletions(-)

diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index 5047bbec..acd3be00 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -14,6 +14,7 @@
  https://huggingface.co/docs/api-inference/quicktour
 """
 
+import inspect
 import logging
 import re
 from typing import List, Union
@@ -25,7 +26,7 @@
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 
 from garak import _config
-from garak.exception import ModelNameMissingError
+from garak.exception import ModelNameMissingError, GarakException
 from garak.generators.base import Generator
 
 
@@ -70,6 +71,10 @@ def __init__(
             self.name, generations=self.generations, config_root=config_root
         )
 
+        import torch.multiprocessing as mp
+
+        mp.set_start_method("spawn", force=True)
+
         self._load_client()
 
     def _load_client(self):
@@ -83,16 +88,17 @@ def _load_client(self):
 
         import torch.cuda
 
-        if not torch.cuda.is_available():
-            logging.debug("Using CPU, torch.cuda.is_available() returned False")
-            self.device = -1
-
-        self.generator = pipeline(
-            "text-generation",
-            model=self.name,
-            do_sample=self.do_sample,
-            device=self.device,
+        # consider how this could be abstracted well
+        self.device = (
+            "cuda:" + str(self.device)
+            if torch.cuda.is_available()
+            else "mps" if torch.backends.mps.is_available() else "cpu"
         )
+
+        logging.debug("Using %s, based on torch environment evaluation", self.device)
+
+        pipline_kwargs = self._gather_pipeline_params(pipeline=pipeline)
+        self.generator = pipeline("text-generation", **pipline_kwargs)
         if not hasattr(self, "deprefix_prompt"):
             self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
@@ -104,6 +110,17 @@ def _load_client(self):
     def _clear_client(self):
         self.generator = None
 
+    def _gather_pipeline_params(self, pipeline):
+        # this may be a bit too naive as it will pass any parameter valid for the pipeline signature
+        args = {}
+        for k in inspect.signature(pipeline).parameters:
+            if k == "model":
+                # special case of known mapping as `model` may be reserved for the class
+                args[k] = self.name
+            if hasattr(self, k):
+                args[k] = getattr(self, k)
+        return args
+
     def _call_model(
         self, prompt: str, generations_this_call: int = 1
     ) -> List[Union[str, None]]:
@@ -150,8 +167,14 @@ def _load_client(self):
         if hasattr(self, "generator") and self.generator is not None:
             return
 
-        from optimum.nvidia.pipelines import pipeline
-        from transformers import set_seed
+        try:
+            from optimum.nvidia.pipelines import pipeline
+            from transformers import set_seed
+        except Exception as e:
+            logging.exception(e)
+            raise GarakException(
+                f"Missing required dependencies for {self.__class__.__name__}"
+            )
 
         if _config.run.seed is not None:
             set_seed(_config.run.seed)
@@ -161,20 +184,15 @@ def _load_client(self):
         if not torch.cuda.is_available():
             message = "OptimumPipeline needs CUDA, but torch.cuda.is_available() returned False; quitting"
             logging.critical(message)
-            raise ValueError(message)
+            raise GarakException(message)
 
-        use_fp8 = False
+        self.use_fp8 = False
         if _config.loaded:
             if "use_fp8" in _config.plugins.generators.OptimumPipeline:
-                use_fp8 = True
-
-        self.generator = pipeline(
-            "text-generation",
-            model=self.name,
-            do_sample=self.do_sample,
-            device=self.device,
-            use_fp8=use_fp8,
-        )
+                self.use_fp8 = True
+
+        pipline_kwargs = self._gather_pipeline_params(pipeline=pipeline)
+        self.generator = pipeline("text-generation", **pipline_kwargs)
         if not hasattr(self, "deprefix_prompt"):
             self.deprefix_prompt = self.name in models_to_deprefix
         if _config.loaded:
@@ -201,18 +219,19 @@ def _load_client(self):
 
         import torch.cuda
 
-        if not torch.cuda.is_available():
-            logging.debug("Using CPU, torch.cuda.is_available() returned False")
-            self.device = -1
+        # consider how this could be abstracted well
+        self.device = (
+            "cuda:" + str(self.device)
+            if torch.cuda.is_available()
+            else "mps" if torch.backends.mps.is_available() else "cpu"
+        )
+
+        logging.debug("Using %s, based on torch environment evaluation", self.device)
 
         # Note that with pipeline, in order to access the tokenizer, model, or device, you must get the attribute
         # directly from self.generator instead of from the ConversationalPipeline object itself.
-        self.generator = pipeline(
-            "conversational",
-            model=self.name,
-            do_sample=self.do_sample,
-            device=self.device,
-        )
+        pipline_kwargs = self._gather_pipeline_params(pipeline=pipeline)
+        self.generator = pipeline("conversational", **pipline_kwargs)
         self.conversation = Conversation()
         if not hasattr(self, "deprefix_prompt"):
             self.deprefix_prompt = self.name in models_to_deprefix
@@ -460,9 +479,15 @@ def _load_client(self):
         if _config.run.seed is not None:
             transformers.set_seed(_config.run.seed)
 
-        self.init_device = "cuda:" + str(self.device)
         import torch.cuda
 
+        # consider how this could be abstracted well
+        self.init_device = (
+            "cuda:" + str(self.device)
+            if torch.cuda.is_available()
+            else "mps" if torch.backends.mps.is_available() else "cpu"
+        )
+
         if not torch.cuda.is_available():
             logging.debug("Using CPU, torch.cuda.is_available() returned False")
             self.device = -1

From 62f91a9c1211ee898a1ff77b6c90390b7bbfd1f1 Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Mon, 17 Jun 2024 09:37:54 -0500
Subject: [PATCH 5/9] enable hf model or pipeline config in `hf_args`

* support all generic `pipeline` args at all times
* adds `do_sample` when `model` is a parameter to the `Callable`
* adds `low_cpu_mem_usage` and all `pipeline` params for `Callables` without `model`
* consolidates optimal device selection & set when not provided by config

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/generators/huggingface.py      | 213 ++++++++++++++++-----------
 tests/generators/test_huggingface.py |  16 +-
 2 files changed, 139 insertions(+), 90 deletions(-)

diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index acd3be00..f0972920 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -16,8 +16,9 @@
 
 import inspect
 import logging
+import os
 import re
-from typing import List, Union
+from typing import Callable, List, Union
 import warnings
 
 import backoff
@@ -33,15 +34,15 @@
 models_to_deprefix = ["gpt2"]
 
 
-class HFRateLimitException(Exception):
+class HFRateLimitException(GarakException):
     pass
 
 
-class HFLoadingException(Exception):
+class HFLoadingException(GarakException):
     pass
 
 
-class HFInternalServerError(Exception):
+class HFInternalServerError(GarakException):
     pass
 
 
@@ -51,30 +52,106 @@ def _set_hf_context_len(self, config):
             if isinstance(config.n_ctx, int):
                 self.context_len = config.n_ctx
 
+    def _gather_hf_params(self, hf_constructor: Callable):
+        # this may be a bit too naive as it will pass any parameter valid for the pipeline signature
+        # this falls over when passed `from_pretrained` methods as the callable model params are not explicit
+        params = self.hf_args
+        if params["device"] is None:
+            params["device"] = self.device
+
+        args = {}
+
+        parameters = inspect.signature(hf_constructor).parameters
+
+        if "model" in parameters:
+            args["model"] = self.name
+            # expand for
+            parameters = {"do_sample": True} | parameters
+        else:
+            # callable is for a Pretrained class also map standard `pipeline` params
+            from transformers import pipeline
+
+            parameters = (
+                {"low_cpu_mem_usage": True}
+                | parameters
+                | inspect.signature(pipeline).parameters
+            )
+
+        for k in parameters:
+            if k == "model":
+                continue  # special case `model` comes from `name` in the generator
+            if k in params:
+                val = params[k]
+                if k == "torch_dtype" and hasattr(torch, val):
+                    args[k] = getattr(
+                        torch, val
+                    )  # some model type specific classes do not yet support direct string representation
+                    continue
+                if (
+                    k == "device"
+                    and "device_map" in parameters
+                    and "device_map" in params
+                ):
+                    # per transformers convention hold `device_map` before `device`
+                    continue
+                args[k] = params[k]
+
+        return args
+
+    def _select_hf_device(self):
+        """Determine the most efficient device for tensor load, hold any existing `device` already selected"""
+        import torch.cuda
+
+        selected_device = None
+        if self.hf_args["device"] is not None:
+            if isinstance(self.hf_args["device"], int):
+                # this assumes that indexed only devices selections means `cuda`
+                selected_device = torch.device("cuda:" + str(self.hf_args["device"]))
+            else:
+                selected_device = torch.device(self.hf_args["device"])
+
+        if selected_device is None:
+            selected_device = torch.device(
+                "cuda"
+                if torch.cuda.is_available()
+                else "mps" if torch.backends.mps.is_available() else "cpu"
+            )
+
+        if isinstance(selected_device, torch.device) and selected_device.type == "mps":
+            os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+            logging.debug("Enabled MPS fallback environment variable")
+
+        logging.debug(
+            "Using %s, based on torch environment evaluation", selected_device
+        )
+        return selected_device
+
 
 class Pipeline(Generator, HFCompatible):
     """Get text generations from a locally-run Hugging Face pipeline"""
 
+    DEFAULT_PARAMS = Generator.DEFAULT_PARAMS | {
+        "generations": 10,
+        "hf_args": {
+            "torch_dtype": "float16",
+            "do_sample": True,
+            "device": None,
+        },
+    }
     generator_family_name = "Hugging Face 🤗 pipeline"
     supports_multiple_generations = True
     parallel_capable = False
 
-    def __init__(
-        self, name="", do_sample=True, generations=10, device=0, config_root=_config
-    ):
+    def __init__(self, name="", config_root=_config):
         self.name = name
-        self.generations = generations
-        self.do_sample = do_sample
-        self.device = device
 
-        super().__init__(
-            self.name, generations=self.generations, config_root=config_root
-        )
+        super().__init__(self.name, config_root=config_root)
 
         import torch.multiprocessing as mp
 
         mp.set_start_method("spawn", force=True)
 
+        self.device = self._select_hf_device()
         self._load_client()
 
     def _load_client(self):
@@ -86,18 +163,7 @@ def _load_client(self):
         if _config.run.seed is not None:
             set_seed(_config.run.seed)
 
-        import torch.cuda
-
-        # consider how this could be abstracted well
-        self.device = (
-            "cuda:" + str(self.device)
-            if torch.cuda.is_available()
-            else "mps" if torch.backends.mps.is_available() else "cpu"
-        )
-
-        logging.debug("Using %s, based on torch environment evaluation", self.device)
-
-        pipline_kwargs = self._gather_pipeline_params(pipeline=pipeline)
+        pipline_kwargs = self._gather_hf_params(hf_constructor=pipeline)
         self.generator = pipeline("text-generation", **pipline_kwargs)
         if not hasattr(self, "deprefix_prompt"):
             self.deprefix_prompt = self.name in models_to_deprefix
@@ -110,17 +176,6 @@ def _load_client(self):
     def _clear_client(self):
         self.generator = None
 
-    def _gather_pipeline_params(self, pipeline):
-        # this may be a bit too naive as it will pass any parameter valid for the pipeline signature
-        args = {}
-        for k in inspect.signature(pipeline).parameters:
-            if k == "model":
-                # special case of known mapping as `model` may be reserved for the class
-                args[k] = self.name
-            if hasattr(self, k):
-                args[k] = getattr(self, k)
-        return args
-
     def _call_model(
         self, prompt: str, generations_this_call: int = 1
     ) -> List[Union[str, None]]:
@@ -191,7 +246,7 @@ def _load_client(self):
             if "use_fp8" in _config.plugins.generators.OptimumPipeline:
                 self.use_fp8 = True
 
-        pipline_kwargs = self._gather_pipeline_params(pipeline=pipeline)
+        pipline_kwargs = self._gather_hf_params(hf_constructor=pipeline)
         self.generator = pipeline("text-generation", **pipline_kwargs)
         if not hasattr(self, "deprefix_prompt"):
             self.deprefix_prompt = self.name in models_to_deprefix
@@ -219,18 +274,9 @@ def _load_client(self):
 
         import torch.cuda
 
-        # consider how this could be abstracted well
-        self.device = (
-            "cuda:" + str(self.device)
-            if torch.cuda.is_available()
-            else "mps" if torch.backends.mps.is_available() else "cpu"
-        )
-
-        logging.debug("Using %s, based on torch environment evaluation", self.device)
-
         # Note that with pipeline, in order to access the tokenizer, model, or device, you must get the attribute
         # directly from self.generator instead of from the ConversationalPipeline object itself.
-        pipline_kwargs = self._gather_pipeline_params(pipeline=pipeline)
+        pipline_kwargs = self._gather_hf_params(hf_constructor=pipeline)
         self.generator = pipeline("conversational", **pipline_kwargs)
         self.conversation = Conversation()
         if not hasattr(self, "deprefix_prompt"):
@@ -278,7 +324,7 @@ def _call_model(
             return [re.sub("^" + re.escape(prompt), "", _o) for _o in outputs]
 
 
-class InferenceAPI(Generator, HFCompatible):
+class InferenceAPI(Generator):
     """Get text generations from Hugging Face Inference API"""
 
     generator_family_name = "Hugging Face 🤗 Inference API"
@@ -407,7 +453,7 @@ def _pre_generate_hook(self):
         self.wait_for_model = False
 
 
-class InferenceEndpoint(InferenceAPI, HFCompatible):
+class InferenceEndpoint(InferenceAPI):
     """Interface for Hugging Face private endpoints
     Pass the model URL as the name, e.g. https://xxx.aws.endpoints.huggingface.cloud
     """
@@ -479,35 +525,22 @@ def _load_client(self):
         if _config.run.seed is not None:
             transformers.set_seed(_config.run.seed)
 
-        import torch.cuda
-
-        # consider how this could be abstracted well
-        self.init_device = (
-            "cuda:" + str(self.device)
-            if torch.cuda.is_available()
-            else "mps" if torch.backends.mps.is_available() else "cpu"
-        )
-
-        if not torch.cuda.is_available():
-            logging.debug("Using CPU, torch.cuda.is_available() returned False")
-            self.device = -1
-            self.init_device = "cpu"
-
         trust_remote_code = self.name.startswith("mosaicml/mpt-")
 
+        model_kwargs = self._gather_hf_params(
+            hf_constructor=transformers.AutoConfig.from_pretrained
+        )  # will defer to device_map if device map was `auto` may not match self.device
+
         self.config = transformers.AutoConfig.from_pretrained(
-            self.name, trust_remote_code=trust_remote_code
-        )
-        self.config.init_device = (
-            self.init_device  # or "cuda:0" For fast initialization directly on GPU!
+            self.name, trust_remote_code=trust_remote_code, **model_kwargs
         )
 
         self._set_hf_context_len(self.config)
+        self.config.init_device = self.device  # determined by Pipeline `__init__``
 
         self.model = transformers.AutoModelForCausalLM.from_pretrained(
-            self.name,
-            config=self.config,
-        ).to(self.init_device)
+            self.name, config=self.config
+        ).to(self.device)
 
         if not hasattr(self, "deprefix_prompt"):
             self.deprefix_prompt = self.name in models_to_deprefix
@@ -537,7 +570,7 @@ def _call_model(
     ) -> List[Union[str, None]]:
         self._load_client()
         self.generation_config.max_new_tokens = self.max_tokens
-        self.generation_config.do_sample = self.do_sample
+        self.generation_config.do_sample = self.hf_args["do_sample"]
         self.generation_config.num_return_sequences = generations_this_call
         if self.temperature is not None:
             self.generation_config.temperature = self.temperature
@@ -550,7 +583,7 @@ def _call_model(
             with torch.no_grad():
                 inputs = self.tokenizer(
                     prompt, truncation=True, return_tensors="pt"
-                ).to(self.init_device)
+                ).to(self.device)
 
                 try:
                     outputs = self.model.generate(
@@ -574,21 +607,23 @@ def _call_model(
             return [re.sub("^" + re.escape(prompt), "", i) for i in text_output]
 
 
-class LLaVA(Generator):
+class LLaVA(Generator, HFCompatible):
     """Get LLaVA ([ text + image ] -> text) generations"""
 
     DEFAULT_PARAMS = Generator.DEFAULT_PARAMS | {
+        "max_tokens": 4000,
         # "exist_tokens + max_new_tokens < 4K is the golden rule."
         # https://github.com/haotian-liu/LLaVA/issues/1095#:~:text=Conceptually%2C%20as%20long%20as%20the%20total%20tokens%20are%20within%204K%2C%20it%20would%20be%20fine%2C%20so%20exist_tokens%20%2B%20max_new_tokens%20%3C%204K%20is%20the%20golden%20rule.
-        "max_tokens": 4000,
-        # consider shifting below to kwargs or llava_kwargs that is a dict to allow more customization
-        "torch_dtype": torch.float16,
-        "low_cpu_mem_usage": True,
-        "device_map": "cuda:0",
+        "hf_args": {
+            "torch_dtype": "float16",
+            "low_cpu_mem_usage": True,
+            "device_map": "auto",
+        },
     }
 
     # rewrite modality setting
     modality = {"in": {"text", "image"}, "out": {"text"}}
+    parallel_capable = False
 
     # Support Image-Text-to-Text models
     # https://huggingface.co/llava-hf#:~:text=Llava-,Models,-9
@@ -603,20 +638,20 @@ def __init__(self, name="", generations=10, config_root=_config):
         super().__init__(name, generations=generations, config_root=config_root)
         if self.name not in self.supported_models:
             raise ModelNameMissingError(
-                f"Invalid modal name {self.name}, current support: {self.supported_models}."
+                f"Invalid model name {self.name}, current support: {self.supported_models}."
             )
+
+        self.device = self._select_hf_device()
+        model_kwargs = self._gather_hf_params(
+            hf_constructor=LlavaNextForConditionalGeneration.from_pretrained
+        )  # will defer to device_map if device map was `auto` may not match self.device
+
         self.processor = LlavaNextProcessor.from_pretrained(self.name)
         self.model = LlavaNextForConditionalGeneration.from_pretrained(
-            self.name,
-            torch_dtype=self.torch_dtype,
-            low_cpu_mem_usage=self.low_cpu_mem_usage,
+            self.name, **model_kwargs
         )
-        if torch.cuda.is_available():
-            self.model.to(self.device_map)
-        else:
-            raise RuntimeError(
-                "CUDA is not supported on this device. Please make sure CUDA is installed and configured properly."
-            )
+
+        self.model.to(self.device)
 
     def generate(
         self, prompt: str, generations_this_call: int = 1
@@ -630,7 +665,7 @@ def generate(
             raise Exception(e)
 
         inputs = self.processor(text_prompt, image_prompt, return_tensors="pt").to(
-            self.device_map
+            self.device
         )
         exist_token_number: int = inputs.data["input_ids"].shape[1]
         output = self.model.generate(
diff --git a/tests/generators/test_huggingface.py b/tests/generators/test_huggingface.py
index f3381c94..d42c6c2a 100644
--- a/tests/generators/test_huggingface.py
+++ b/tests/generators/test_huggingface.py
@@ -1,11 +1,25 @@
 import transformers
 import garak.generators.huggingface
+from garak._config import GarakSubConfig
 
 DEFAULT_GENERATIONS_QTY = 10
 
 
 def test_pipeline():
-    g = garak.generators.huggingface.Pipeline("gpt2")
+    gen_config = {
+        "huggingface": {
+            "Pipeline": {
+                "name": "gpt2",
+                "hf_args": {
+                    "device": "cpu",
+                },
+            }
+        }
+    }
+    config_root = GarakSubConfig()
+    setattr(config_root, "generators", gen_config)
+
+    g = garak.generators.huggingface.Pipeline("gpt2", config_root=config_root)
     assert g.name == "gpt2"
     assert g.generations == DEFAULT_GENERATIONS_QTY
     assert isinstance(g.generator, transformers.pipelines.text_generation.Pipeline)

From 1fca1196dd69f11dd5dd909e988d2084e5f80d7e Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Mon, 17 Jun 2024 13:52:15 -0500
Subject: [PATCH 6/9] amend yaml config example

---
 tests/test_config.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tests/test_config.py b/tests/test_config.py
index 8d33cef1..48aac522 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -19,23 +19,27 @@
 plugins:
   generators:
     huggingface:
-      dtype: general
-      gpu: 0
+      hf_args:
+        torch_dtype: float16
       Pipeline:
-        dtype: bfloat16
+        hf_args:
+            device: cuda
   probes:
     test:
       generators:
         huggingface:
             Pipeline:
-              dtype: for_probe
+                hf_args:
+                    torch_dtype: float16
   detector:
       test:
         val: tests
         Blank:
           generators:
             huggingface:
-                gpu: 1
+                hf_args:
+                    torch_dtype: float16
+                    device: cuda:1
                 Pipeline:
                   dtype: for_detector
   buffs:
@@ -43,7 +47,8 @@
         Blank:
           generators:
             huggingface:
-                gpu: 1
+                hf_args:
+                    device: cuda:0
                 Pipeline:
                   dtype: for_detector
 """.encode(

From f4d77b65968ad43466259a69c2b16227c236afec Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Tue, 18 Jun 2024 09:45:53 -0500
Subject: [PATCH 7/9] support merged dictionary in `Configurable`

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/configurable.py      |  5 +++++
 tests/test_configurable.py | 19 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/garak/configurable.py b/garak/configurable.py
index 7ad768a7..efb9f566 100644
--- a/garak/configurable.py
+++ b/garak/configurable.py
@@ -88,6 +88,8 @@ def _apply_config(self, config):
                     )
                 ):
                     continue
+                if isinstance(v, dict):  # if value is an existing dictionary merge
+                    v = getattr(self, k) | v
             setattr(self, k, v)  # This will set attribute to the full dictionary value
 
     def _apply_missing_instance_defaults(self):
@@ -96,6 +98,9 @@ def _apply_missing_instance_defaults(self):
             for k, v in self.DEFAULT_PARAMS.items():
                 if not hasattr(self, k):
                     setattr(self, k, v)
+                elif isinstance(v, dict):
+                    v = v | getattr(self, k)
+                    setattr(self, k, v)
 
     def _validate_env_var(self):
         if hasattr(self, "key_env_var"):
diff --git a/tests/test_configurable.py b/tests/test_configurable.py
index 4979beb1..7847f65b 100644
--- a/tests/test_configurable.py
+++ b/tests/test_configurable.py
@@ -24,7 +24,13 @@ class mockConfigurable(Configurable):
     # Configurable is coupled to hierarchy of plugin types
     __module__ = "garak.generators.mock"
 
-    DEFAULT_PARAMS = {"class_var": "from_class"}
+    DEFAULT_PARAMS = {
+        "class_var": "from_class",
+        "class_dict_var": {
+            "dict_a": "dict_val",
+            "dict_b": "dict_val",
+        },
+    }
 
     def __init__(
         self,
@@ -63,6 +69,17 @@ def test_param_provided(generator_sub_config):
 def test_class_vars_propagate_to_instance(generator_sub_config):
     m = mockConfigurable(config_root=generator_sub_config)
     assert m.class_var == m.DEFAULT_PARAMS["class_var"]
+    assert m.class_dict_var == m.DEFAULT_PARAMS["class_dict_var"]
+
+
+# when a default parameter is a dictionary, config values are merged into it on the resulting object
+def test_class_dict_merge_to_instance(generator_sub_config):
+    config_dict_var = {"dict_a": "test_val", "dict_c": "test_val"}
+    generator_sub_config.generators["mock"]["class_dict_var"] = config_dict_var
+    m = mockConfigurable(config_root=generator_sub_config)
+    assert m.class_dict_var == m.DEFAULT_PARAMS["class_dict_var"] | config_dict_var
+    assert m.class_dict_var["dict_a"] == config_dict_var["dict_a"]
+    assert m.class_dict_var["dict_c"] == config_dict_var["dict_c"]
 
 
 # when a default parameter is provided and not config_root set on the resulting object

From 9f19c30c75611e0587447b39596ad3cebc6f243a Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Tue, 18 Jun 2024 10:56:09 -0500
Subject: [PATCH 8/9] free tokenizer in _clear_client

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/generators/huggingface.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index f0972920..7fb584a1 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -563,6 +563,7 @@ def _load_client(self):
     def _clear_client(self):
         self.model = None
         self.config = None
+        self.tokenizer = None
         self.generation_config = None
 
     def _call_model(

From 7b1382976d2f4e71c635230cb30a7db4c7cc8075 Mon Sep 17 00:00:00 2001
From: Jeffrey Martin <jemartin@nvidia.com>
Date: Wed, 19 Jun 2024 09:12:43 -0500
Subject: [PATCH 9/9] explicit device support

* raise error when passed negative device integer
* rename parameter tracking var
* remove unused import
* add tests for `_select_hf_device()`

Signed-off-by: Jeffrey Martin <jemartin@nvidia.com>
---
 garak/generators/huggingface.py      | 22 ++++++++++++----------
 tests/generators/test_huggingface.py | 24 ++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/garak/generators/huggingface.py b/garak/generators/huggingface.py
index 7fb584a1..2da3f9e9 100644
--- a/garak/generators/huggingface.py
+++ b/garak/generators/huggingface.py
@@ -61,23 +61,23 @@ def _gather_hf_params(self, hf_constructor: Callable):
 
         args = {}
 
-        parameters = inspect.signature(hf_constructor).parameters
+        params_to_process = inspect.signature(hf_constructor).parameters
 
-        if "model" in parameters:
+        if "model" in params_to_process:
             args["model"] = self.name
             # expand for
-            parameters = {"do_sample": True} | parameters
+            params_to_process = {"do_sample": True} | params_to_process
         else:
             # callable is for a Pretrained class also map standard `pipeline` params
             from transformers import pipeline
 
-            parameters = (
+            params_to_process = (
                 {"low_cpu_mem_usage": True}
-                | parameters
+                | params_to_process
                 | inspect.signature(pipeline).parameters
             )
 
-        for k in parameters:
+        for k in params_to_process:
             if k == "model":
                 continue  # special case `model` comes from `name` in the generator
             if k in params:
@@ -89,7 +89,7 @@ def _gather_hf_params(self, hf_constructor: Callable):
                     continue
                 if (
                     k == "device"
-                    and "device_map" in parameters
+                    and "device_map" in params_to_process
                     and "device_map" in params
                 ):
                     # per transformers convention hold `device_map` before `device`
@@ -103,9 +103,13 @@ def _select_hf_device(self):
         import torch.cuda
 
         selected_device = None
-        if self.hf_args["device"] is not None:
+        if self.hf_args.get("device", None) is not None:
             if isinstance(self.hf_args["device"], int):
                 # this assumes that indexed only devices selections means `cuda`
+                if self.hf_args["device"] < 0:
+                    msg = f"device {self.hf_args['device']} requested but CUDA device numbering starts at zero. Use 'device: cpu' to request CPU."
+                    logging.critical(msg)
+                    raise ValueError(msg)
                 selected_device = torch.device("cuda:" + str(self.hf_args["device"]))
             else:
                 selected_device = torch.device(self.hf_args["device"])
@@ -272,8 +276,6 @@ def _load_client(self):
         if _config.run.seed is not None:
             set_seed(_config.run.seed)
 
-        import torch.cuda
-
         # Note that with pipeline, in order to access the tokenizer, model, or device, you must get the attribute
         # directly from self.generator instead of from the ConversationalPipeline object itself.
         pipline_kwargs = self._gather_hf_params(hf_constructor=pipeline)
diff --git a/tests/generators/test_huggingface.py b/tests/generators/test_huggingface.py
index d42c6c2a..6f6d19ec 100644
--- a/tests/generators/test_huggingface.py
+++ b/tests/generators/test_huggingface.py
@@ -1,3 +1,4 @@
+import pytest
 import transformers
 import garak.generators.huggingface
 from garak._config import GarakSubConfig
@@ -68,3 +69,26 @@ def test_model():
     assert len(output) == DEFAULT_GENERATIONS_QTY
     for item in output:
         assert item is None  # gpt2 is known raise exception returning `None`
+
+
+def test_select_hf_device():
+    from garak.generators.huggingface import HFCompatible
+    import torch
+
+    class mockHF(HFCompatible):
+        def __init__(self, key, value):
+            self.hf_args = {key: value}
+            pass
+
+    m = mockHF("device", -1)
+    with pytest.raises(ValueError) as exc_info:
+        device = m._select_hf_device()
+    assert "CUDA device numbering starts" in str(exc_info.value)
+
+    m = mockHF("device", "cpu")
+    device = m._select_hf_device()
+    assert device == torch.device("cpu")
+
+    m = mockHF("device_map", "auto")
+    device = m._select_hf_device()
+    assert isinstance(device, torch.device)