Comparing changes

base repository: dottxt-ai/outlines
base: 0.1.11
head repository: dottxt-ai/outlines
compare: 0.1.12

  • 15 commits
  • 16 files changed
  • 9 contributors

Commits on Dec 16, 2024

  1. Fix image link in receipt-digitization example

    The current link points to an HTML page instead of a PNG, so after it is downloaded via `requests.get`, PIL is unable to recognize the format, and rightly so.
    rootAvish authored and rlouf committed Dec 16, 2024

    365b566
  2. Move generation of sampling params (#1340)

    This PR addresses #1335
    I have kept `SamplingParameters` in `outlines/generate/api.py` because
    it is imported in many places and moving it might break examples.
    Would you want to change that, @dgerlanc?
    Otherwise, if this looks good, I can make the docs changes if needed; let me know.
    sky-2002 authored Dec 16, 2024
    9b586db

Commits on Dec 20, 2024

  1. Fix unresolved imports in transformers doc

    TimZaman authored and rlouf committed Dec 20, 2024
    061f2ba

Commits on Dec 21, 2024

  1. Set docs docstring_style (#1343)

    Hi, thank you for this great library!

    It seems that the docstrings are not rendered correctly in the docs. I
    think we should explicitly set the `docstring_style`, because [it
    defaults to
    `"google"`](https://mkdocstrings.github.io/python/usage/configuration/docstrings/#docstring_style)
    but Outlines uses NumPy-style docstrings.

    Before:
    ![Screenshot 2024-12-16 at 23 00 26](https://github.com/user-attachments/assets/c752ee3d-519e-4098-b943-3aab43c8af25)

    After:
    ![Screenshot 2024-12-16 at 23 00 41](https://github.com/user-attachments/assets/5b5f524b-6921-4dbe-994d-72c079e677bc)

    There seem to be other issues in the docstrings:
    - for example,
    [`Properties`](https://github.com/dottxt-ai/outlines/blob/main/outlines/models/openai.py#L23)
    should be
    [`Attributes`](https://numpydoc.readthedocs.io/en/latest/format.html#parameters)
    - only the openai and transformers models are present in the [API
    reference](https://github.com/dottxt-ai/outlines/blob/main/docs/api/models.md)

    I'm happy to make follow-up PRs for those.

    Please let me know if I missed something; I couldn't find related
    issues/PRs.
    EdAbati authored Dec 21, 2024
    9744f00
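
    For context, the two docstring conventions differ mainly in how parameters are declared. A minimal illustration with hypothetical `add` functions (not code from this PR):

```python
def add_google(a: int, b: int) -> int:
    """Add two numbers (Google-style docstring, the mkdocstrings default).

    Args:
        a: First operand.
        b: Second operand.

    Returns:
        The sum of the operands.
    """
    return a + b


def add_numpy(a: int, b: int) -> int:
    """Add two numbers (NumPy-style docstring, which Outlines uses).

    Parameters
    ----------
    a : int
        First operand.
    b : int
        Second operand.

    Returns
    -------
    int
        The sum of the operands.
    """
    return a + b
```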

Commits on Dec 28, 2024

  1. Fix JSON structured import

    denadai2 authored and rlouf committed Dec 28, 2024
    6205c0b

Commits on Dec 31, 2024

  1. Update mkdocs.yml: jump to mamba section

    martin0258 authored and rlouf committed Dec 31, 2024
    fef70c0
  2. Update transformers.md: fix headings level for mkdocs toc

    martin0258 authored and rlouf committed Dec 31, 2024
    5920870

Commits on Jan 2, 2025

  1. remove non_blocking=True

    hoesler authored and rlouf committed Jan 2, 2025
    fddfc8f
  2. optimize tensor creation

    hoesler authored and rlouf committed Jan 2, 2025
    2f0740e
  3. add mps to processor benchmark

    hoesler authored and rlouf committed Jan 2, 2025
    6a8612b

Commits on Jan 3, 2025

  1. Skip vllm-related tests on non-linux platforms

    Fix #1356
    yvan-sraka authored and rlouf committed Jan 3, 2025
    d32dfde
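
    One conventional way to express such a platform guard in pytest (a sketch only; the exact marker used in the commit may differ):

```python
import sys

import pytest

# Skip every test in this module unless we are on Linux,
# since vLLM does not support other platforms.
pytestmark = pytest.mark.skipif(
    not sys.platform.startswith("linux"),
    reason="vLLM-related tests require Linux",
)
```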

Commits on Jan 5, 2025

  1. Fix json_schema imports in cot example

    tylerjthomas9 authored and rlouf committed Jan 5, 2025
    3cc399d

Commits on Jan 9, 2025

  1. 4456f3c
  2. Remove prompt.render method

    yvan-sraka authored and rlouf committed Jan 9, 2025
    bf27c77
  3. Apply a small fix suggested by linter: Ruff E721

    Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks
    yvan-sraka authored and rlouf committed Jan 9, 2025
    7149a5e
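
    For reference, the pattern Ruff E721 flags and its fixes look like this (hypothetical values, not the exact lines touched by the commit):

```python
x = "hello"

# Flagged by Ruff E721: comparing types with ==.
if type(x) == str:
    print("string (via ==)")

# Fix 1: identity comparison when the exact type matters.
if type(x) is str:
    print("string (via is)")

# Fix 2: isinstance() when subclasses should also match.
if isinstance(x, str):
    print("string (via isinstance)")
```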
18 changes: 9 additions & 9 deletions benchmarks/bench_processors.py
@@ -37,15 +37,12 @@ def get_mock_processor_inputs(array_library, num_tokens=30000):
     logits: (4, 30,000 ) dtype=float
     input_ids shape: (4, 2048) dtype=int
     """
-    if array_library == "torch":
-        logits = torch.rand((4, num_tokens), dtype=torch.float)
-        input_ids = torch.randint(
-            low=0, high=num_tokens, size=(4, 2048), dtype=torch.int
-        )
-    elif array_library == "torch_cuda":
-        logits = torch.rand((4, num_tokens), dtype=torch.float, device="cuda")
+    if array_library.startswith("torch"):
+        device = array_library.split("_")[1] if "_" in array_library else "cpu"
+
+        logits = torch.rand((4, num_tokens), dtype=torch.float, device=device)
         input_ids = torch.randint(
-            low=0, high=num_tokens, size=(4, 2048), dtype=torch.int, device="cuda"
+            low=0, high=num_tokens, size=(4, 2048), dtype=torch.int, device=device
         )
     elif array_library == "numpy":
         logits = np.random.rand(4, num_tokens).astype(np.float32)

@@ -88,6 +85,8 @@ class LogitsProcessorPassthroughBenchmark:
         params += ["mlx"]
     if torch.cuda.is_available():
         params += ["torch_cuda"]
+    if torch.mps.is_available():
+        params += ["torch_mps"]
     if is_jax_allowed():
         params += ["jax"]

@@ -108,9 +107,10 @@ class LogitsProcessorStructuredBenchmark:
     array_libraries = ["torch", "numpy"]
     if is_mlx_lm_allowed():
         array_libraries += ["mlx"]
-    # PR TODO
     if torch.cuda.is_available():
         array_libraries += ["torch_cuda"]
+    if torch.mps.is_available():
+        array_libraries += ["torch_mps"]

     # accept very many or very few tokens, respectively
     patterns = [r"[^Z]*", "Z*"]
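
The refactor above collapses the per-device branches by deriving the torch device from the benchmark label. A self-contained sketch of the resulting behavior (the helper name `pick_device` is ours, for illustration):

```python
import torch

def pick_device(array_library: str) -> str:
    # "torch" -> "cpu", "torch_cuda" -> "cuda", "torch_mps" -> "mps"
    return array_library.split("_")[1] if "_" in array_library else "cpu"

num_tokens = 30000
device = pick_device("torch")  # swap in "torch_cuda" or "torch_mps" if available
logits = torch.rand((4, num_tokens), dtype=torch.float, device=device)
input_ids = torch.randint(
    low=0, high=num_tokens, size=(4, 2048), dtype=torch.int, device=device
)
print(logits.shape, input_ids.shape)  # torch.Size([4, 30000]) torch.Size([4, 2048])
```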
4 changes: 2 additions & 2 deletions docs/cookbook/chain_of_thought.md
@@ -80,8 +80,8 @@ json_schema = Reasoning.model_json_schema()
 We could generate a response using the json schema but for a change we will use the regex:

 ```python
-from outlines.integrations.utils import convert_json_schema_to_str
-from outlines.fsm.json_schema import build_regex_from_schema
+from outlines.fsm.json_schema import convert_json_schema_to_str
+from outlines_core.fsm.json_schema import build_regex_from_schema

 schema_str = convert_json_schema_to_str(json_schema=json_schema)
 regex_str = build_regex_from_schema(schema_str)
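
With the corrected imports, the cookbook snippet becomes self-contained along these lines (the `Reasoning` model here is a stand-in for the cookbook's own Pydantic schema):

```python
from pydantic import BaseModel

from outlines.fsm.json_schema import convert_json_schema_to_str
from outlines_core.fsm.json_schema import build_regex_from_schema

class Reasoning(BaseModel):  # stand-in for the cookbook's schema
    reasoning: str
    conclusion: str

schema_str = convert_json_schema_to_str(json_schema=Reasoning.model_json_schema())
regex_str = build_regex_from_schema(schema_str)
print(regex_str)  # a regex matched by any JSON document conforming to the schema
```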
2 changes: 1 addition & 1 deletion docs/cookbook/receipt-digitization.md
@@ -117,7 +117,7 @@ Here's what the image looks like:

 ```python
 # Path to the image
-image_path = "https://dottxt-ai.github.io/outlines/main/cookbook/images/trader-joes-receipt.png"
+image_path = "https://raw.githubusercontent.com/dottxt-ai/outlines/refs/heads/main/docs/cookbook/images/trader-joes-receipt.jpg"

 # Download the image
 response = requests.get(image_path)
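
Combined with the first commit's note about PIL, the fixed URL can be verified end to end with a sketch like the following (PIL sniffs the image format from the raw bytes, which is why the old HTML response failed):

```python
import io

import requests
from PIL import Image

image_path = "https://raw.githubusercontent.com/dottxt-ai/outlines/refs/heads/main/docs/cookbook/images/trader-joes-receipt.jpg"

response = requests.get(image_path)
response.raise_for_status()

# Image.open() detects the format (JPEG here) from the bytes themselves;
# an HTML document at this point would raise UnidentifiedImageError.
image = Image.open(io.BytesIO(response.content))
print(image.format, image.size)
```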
12 changes: 6 additions & 6 deletions docs/reference/models/transformers.md
@@ -30,7 +30,7 @@ tokenizer = AutoTokenizer.from_pretrained("gpt2")
 model = models.Transformers(llm, tokenizer)
 ```

-# Using Logits Processors
+## Using Logits Processors

 There are two ways to use Outlines Structured Generation with HuggingFace Transformers:

@@ -39,7 +39,7 @@ There are two ways to use Outlines Structured Generation with HuggingFace Transf

 Outlines supports a myriad of logits processors for structured generation. In these example, we will use the `RegexLogitsProcessor` which guarantees generated text matches the specified pattern.

-## Using `outlines.models.transformers`
+### Using `outlines.models.transformers`

 ```python
 import outlines

@@ -54,7 +54,7 @@ print(output)
 # 2:30 pm
 ```

-## Using models initialized via the `transformers` library
+### Using models initialized via the `transformers` library

 ```python
 import outlines

@@ -85,7 +85,7 @@ print(output)
 [transformers]: https://github.com/huggingface/transformers


-# Alternative Model Classes
+## Alternative Model Classes

 `outlines.models.transformers` defaults to `transformers.AutoModelForCausalLM`, which is the appropriate class for most standard large language models, including Llama 3, Mistral, Phi-3, etc.

@@ -133,15 +133,15 @@ T5 Example:
 import outlines
 from transformers import AutoModelForSeq2SeqLM

-model_pile_t5 = models.transformers(
+model_pile_t5 = outlines.models.transformers(
     model_name="EleutherAI/pile-t5-large",
     model_class=AutoModelForSeq2SeqLM,
 )
 ```

 Bart Example:
 ```python
-model_bart = models.transformers(
+model_bart = outlines.models.transformers(
     model_name="facebook/bart-large",
     model_class=AutoModelForSeq2SeqLM,
 )
2 changes: 1 addition & 1 deletion examples/vllm_integration.py
@@ -3,7 +3,7 @@
 import vllm
 from pydantic import BaseModel

-from outlines.integrations.vllm import JSONLogitsProcessor
+from outlines.processors.structured import JSONLogitsProcessor


 class Person(BaseModel):
3 changes: 2 additions & 1 deletion mkdocs.yml
@@ -87,6 +87,7 @@ plugins:
       handlers:
         python:
           options:
+            docstring_style: numpy
             show_submodules: true
   - search
   - section-index

@@ -160,7 +161,7 @@ nav:
       - TGI: reference/models/tgi.md
       - ExllamaV2: reference/models/exllamav2.md
       - MLX: reference/models/mlxlm.md
-      - Mamba: reference/models/transformers.md
+      - Mamba: reference/models/transformers/#mamba
       - API:
           - OpenAI: reference/models/openai.md
   - API Reference:
4 changes: 3 additions & 1 deletion outlines/__init__.py
@@ -1,4 +1,5 @@
 """Outlines is a Generative Model Programming Framework."""
+
 import outlines.generate
 import outlines.grammars
 import outlines.models

@@ -7,14 +8,15 @@
 from outlines.base import vectorize
 from outlines.caching import clear_cache, disable_cache, get_cache
 from outlines.function import Function
-from outlines.prompts import prompt
+from outlines.prompts import Prompt, prompt

 __all__ = [
     "clear_cache",
     "disable_cache",
     "get_cache",
     "Function",
     "prompt",
+    "Prompt",
     "vectorize",
     "grammars",
 ]
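
With this change both the `prompt` decorator and the `Prompt` class are importable from the package root. A quick sketch of the decorator usage (the greeting template is ours, for illustration):

```python
import outlines
from outlines import Prompt  # now re-exported at the package root

@outlines.prompt
def greet(name):
    """Hello {{ name }}!"""

# The docstring is treated as a Jinja2 template.
rendered = greet("world")
print(rendered)  # Hello world!
```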
28 changes: 7 additions & 21 deletions outlines/generate/api.py
@@ -4,7 +4,6 @@
 from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Union

 from outlines.generate.generator import sequence_generator
-from outlines.samplers import BeamSearchSampler, GreedySampler, MultinomialSampler

 if TYPE_CHECKING:
     import torch

@@ -353,11 +352,13 @@ def token_generator() -> Iterator[Union[List[str], str, List[List[str]]]]:
                 ]

                 generated_sequences = [
-                    self.format_sequence(
-                        self.strip_stop_sequences(sequence, stop_sequences)
+                    (
+                        self.format_sequence(
+                            self.strip_stop_sequences(sequence, stop_sequences)
+                        )
+                        if stop
+                        else sequence
                     )
-                    if stop
-                    else sequence
                     for sequence, stop in zip(
                         generated_sequences, is_stop_at_reached
                     )

@@ -428,22 +429,7 @@ def __init__(self, model, logits_processor, sampler):
         self.model = model
         self.logits_processor = logits_processor

-        if isinstance(sampler, MultinomialSampler):
-            self.sampling_params = SamplingParameters(
-                "multinomial",
-                sampler.samples,
-                sampler.top_p,
-                sampler.top_k,
-                sampler.temperature,
-            )
-        elif isinstance(sampler, GreedySampler):
-            self.sampling_params = SamplingParameters(
-                "greedy", sampler.samples, None, None, 0.0
-            )
-        elif isinstance(sampler, BeamSearchSampler):
-            self.sampling_params = SamplingParameters(
-                "beam_search", sampler.samples, None, None, 1.0
-            )
+        self.sampling_params = sampler.sampling_params

     def prepare_generation_parameters(
         self,
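
The deleted `isinstance` dispatch implies each sampler now exposes its own `sampling_params`. A minimal sketch of that pattern, mirroring the values from the removed branches (the property implementation is our assumption, not the PR's exact code):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class SamplingParameters:
    sampler: str
    num_samples: int
    top_p: Optional[float]
    top_k: Optional[int]
    temperature: Optional[float]

class MultinomialSampler:
    def __init__(self, samples=1, *, top_p=None, top_k=None, temperature=1.0):
        self.samples = samples
        self.top_p, self.top_k, self.temperature = top_p, top_k, temperature

    @property
    def sampling_params(self) -> SamplingParameters:
        # The sampler describes itself, so callers no longer
        # need isinstance() dispatch.
        return SamplingParameters(
            "multinomial", self.samples, self.top_p, self.top_k, self.temperature
        )

print(MultinomialSampler().sampling_params)
```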
11 changes: 4 additions & 7 deletions outlines/processors/structured.py
@@ -103,22 +103,19 @@ def process_logits(

         sequence_states.append(self._guide_states[curr_state_key])

-        mask = torch.ones_like(logits, dtype=torch.bool)
-
         allowed_tokens_batch = []
         batch_indices = []
         for i, guide_state in enumerate(sequence_states):
-            allowed_tokens = self.guide.get_next_instruction(guide_state).tokens.to(
-                mask.device, non_blocking=True
-            )
+            allowed_tokens = self.guide.get_next_instruction(guide_state).tokens
             allowed_tokens_batch.append(allowed_tokens)
             batch_indices.append(
                 torch.full_like(allowed_tokens, i)
             )  # Store batch index for each allowed token

-        allowed_tokens_concat = torch.cat(allowed_tokens_batch)
-        batch_indices_concat = torch.cat(batch_indices)
+        allowed_tokens_concat = torch.cat(allowed_tokens_batch).to(logits.device)
+        batch_indices_concat = torch.cat(batch_indices).to(logits.device)

+        mask = torch.ones_like(logits, dtype=torch.bool)
         mask[batch_indices_concat, allowed_tokens_concat] = False
         logits.masked_fill_(mask, float("-inf"))