Merged
125 commits
5d049c4
moved setup python
bryce13950 Feb 5, 2025
b08c241
added PR action
bryce13950 Feb 5, 2025
23a7be8
temporarily hardcoded version number
bryce13950 Feb 5, 2025
9a3d869
moved poetry
bryce13950 Feb 5, 2025
64e009c
Revert "temporarily hardcoded version number"
bryce13950 Feb 5, 2025
32e5c2f
Revert "added PR action"
bryce13950 Feb 5, 2025
fd38e0f
Merge pull request #855 from TransformerLensOrg/ci-release-update
bryce13950 Feb 5, 2025
d7f9eb1
Merge pull request #866 from TransformerLensOrg/dev
bryce13950 Feb 13, 2025
5e328e9
Merge pull request #870 from TransformerLensOrg/dev
bryce13950 Feb 18, 2025
e65fafb
Merge pull request #874 from TransformerLensOrg/dev
bryce13950 Feb 20, 2025
3212375
added full hf token authentication (#916)
bryce13950 Apr 30, 2025
d2f3f15
Fix LLama RoPE (#910)
mntss Apr 30, 2025
cae0a67
added conditional check for hugging face (#919)
bryce13950 May 5, 2025
c8f8bb4
created a separate list of models to test for public PRs (#920)
bryce13950 May 5, 2025
9f8f4d4
added alternative when hf token is not included (#921)
bryce13950 May 5, 2025
a8fd24f
shrunk loss test (#922)
bryce13950 May 5, 2025
4f97497
Fix broken test, per issue #913 (#914)
JasonBenn May 5, 2025
f6af70f
Fix loading on specific device (#906)
mntss May 5, 2025
26f9e9f
changed dictionary keys to work with the new model loading
bryce13950 May 8, 2025
c1f3c49
restored old loading
bryce13950 May 8, 2025
ca01763
moved new weight conversion module to new area
bryce13950 May 8, 2025
4eb017a
added new boot process with bridge
bryce13950 May 8, 2025
79bb036
isolated all adapters to their own directory
bryce13950 May 8, 2025
5bebdd0
simplified factory
bryce13950 May 8, 2025
e3e8c25
got things to boot properly
bryce13950 May 8, 2025
16803be
updated component mapping
bryce13950 May 8, 2025
a4f7b75
updated bridge printing
bryce13950 May 8, 2025
bb2e6d3
created a way to print out information of the model
bryce13950 May 9, 2025
bb1cd63
added gemma 3
bryce13950 May 9, 2025
2e98787
added initial generalized component base
bryce13950 May 9, 2025
88e204f
added new import
bryce13950 May 9, 2025
20aeba3
overrode forward and generate functions
bryce13950 May 9, 2025
d4d9e9a
finished setting up component adapters
bryce13950 May 9, 2025
baf0b8c
updated naming of bridge components
bryce13950 May 9, 2025
5d3ec52
renamed some things
bryce13950 May 9, 2025
5968549
added more components
bryce13950 May 9, 2025
4b77de4
made repr a bit cleaner
bryce13950 May 9, 2025
7e1131c
added some more wrapper functionality
bryce13950 May 11, 2025
ba13bac
got gemma 3 to run
bryce13950 May 11, 2025
0710900
generalized output
bryce13950 May 12, 2025
cbab325
added run with cache
bryce13950 May 13, 2025
12601de
added blocks
bryce13950 May 14, 2025
442005c
added some typing
bryce13950 May 14, 2025
7fdf11e
created test for translation
bryce13950 May 14, 2025
3e91c29
got mapping to work properly
bryce13950 May 14, 2025
30a371f
resolved string issue
bryce13950 May 14, 2025
5a1d7ea
generalized things a bit more
bryce13950 May 14, 2025
4a00195
injected adapter to generalized component
bryce13950 May 15, 2025
043fcfb
allowed returning the last part of the path
bryce13950 May 15, 2025
a3d2a27
got the model to run again with more generalized components
bryce13950 May 15, 2025
c3ec869
passed input through hook point
bryce13950 May 15, 2025
0217fb6
removed some print statements
bryce13950 May 15, 2025
aa3213b
injected bridge
bryce13950 May 15, 2025
b3bb122
updated typing
bryce13950 May 16, 2025
7bb77e9
translated some more architectures
bryce13950 May 16, 2025
c7b0aa0
created moe
bryce13950 May 16, 2025
0025f96
converted bert
bryce13950 May 16, 2025
55cd13a
added remaining component mapping
bryce13950 May 16, 2025
32403e8
removed extra functions
bryce13950 May 16, 2025
49868d7
cleaned up a bit
bryce13950 May 16, 2025
0bb9f78
cleaned up more conversions
bryce13950 May 16, 2025
c7ed3b7
migrated more architectures
bryce13950 May 16, 2025
8fcdcbc
imported some more components
bryce13950 May 16, 2025
846189a
additional improvements for new system
bryce13950 May 16, 2025
e4c8e87
separated types
bryce13950 May 16, 2025
44a81ba
removed tl config from new booting
bryce13950 May 16, 2025
5b3ccc1
added default config for some models
bryce13950 May 16, 2025
9aaa38b
registered additional architectures
bryce13950 May 16, 2025
1f92f4c
finished generate
bryce13950 May 17, 2025
6af4a5c
updated architecture
bryce13950 May 19, 2025
ca44f53
updated test
bryce13950 May 19, 2025
36717c9
created base config class
bryce13950 May 19, 2025
bdb18a8
fixed some tests
bryce13950 May 19, 2025
32d6a88
reverted type changes
bryce13950 May 19, 2025
77d7abb
fixed param in test
bryce13950 May 19, 2025
e0fa659
removed extra test and added config file
bryce13950 May 19, 2025
ee1306a
moved mock to centralized location
bryce13950 May 19, 2025
9a9ee92
moved factory to correct spot
bryce13950 May 19, 2025
d1829be
removed extra dataclass
bryce13950 May 19, 2025
c791b38
removed transformers coupling from bridge
bryce13950 May 19, 2025
969ac97
exposed block bridge from directory init
bryce13950 May 19, 2025
9df58e3
removed extra comments
bryce13950 May 20, 2025
f67afc9
removed abc parent class
bryce13950 May 20, 2025
613b47f
moved a couple things around
bryce13950 May 22, 2025
e8e1053
renamed directory
bryce13950 May 22, 2025
bbd9420
fixed some refactor issues
bryce13950 May 22, 2025
038a4ee
fixed some more refactor issues
bryce13950 May 22, 2025
fc223f6
fixed some more issues
bryce13950 May 22, 2025
737712c
resolved issue
bryce13950 May 22, 2025
72860b3
made transformer lens config closer to hugging face
bryce13950 May 22, 2025
66849ff
fixed some imports
bryce13950 May 22, 2025
b81e232
fixed some more tests
bryce13950 May 22, 2025
efe980c
removed extra test
bryce13950 May 22, 2025
163a921
fixed test
bryce13950 May 23, 2025
00a381b
removed old class
bryce13950 May 23, 2025
aa49459
restarted config and boot
bryce13950 May 23, 2025
9c48310
fixed default names
bryce13950 May 23, 2025
f6663a3
removed post init
bryce13950 May 24, 2025
dbfddfa
removed transformer lens config
bryce13950 May 24, 2025
c227699
reverted some changes
bryce13950 May 24, 2025
fe51e4a
removed old conversion step
bryce13950 May 24, 2025
f05bd0a
removed extra lines
bryce13950 May 24, 2025
007cbdc
removed extra line
bryce13950 May 24, 2025
b0eb198
removed extra params
bryce13950 May 24, 2025
896f7fe
removed extra config
bryce13950 May 24, 2025
f5e7153
ran format
bryce13950 May 24, 2025
7f808f5
fixed test
bryce13950 May 25, 2025
d848e2f
ran format
bryce13950 May 25, 2025
34ea8c4
fixed test
bryce13950 May 25, 2025
37f1c79
ran format
bryce13950 May 25, 2025
eef1278
fixed docstring
bryce13950 May 25, 2025
83a6278
ran format
bryce13950 May 25, 2025
e19bd4f
fixed some type issues
bryce13950 May 25, 2025
f05d2af
fixed some more typing issues
bryce13950 May 26, 2025
64b453d
fixed more mypy errors
bryce13950 May 26, 2025
7d6bc1a
ran format
bryce13950 May 26, 2025
585a070
fixed more typings
bryce13950 May 26, 2025
8159af4
ran format
bryce13950 May 26, 2025
3240a14
fixed more mypy issues
bryce13950 May 26, 2025
b758c81
ran format
bryce13950 May 26, 2025
3dc45b7
removed extra test
bryce13950 May 26, 2025
5df4570
fixed test
bryce13950 May 26, 2025
68c7687
fixed some typing
bryce13950 May 26, 2025
6d9cbda
ran format
bryce13950 May 26, 2025
e36b98c
Merge branch 'dev' into feature-model-adapter
bryce13950 May 26, 2025
21 changes: 21 additions & 0 deletions .github/workflows/checks.yml
@@ -69,6 +69,13 @@ jobs:
         run: |
           poetry check --lock
           poetry install --with dev
+      - name: Authenticate HuggingFace CLI
+        if: env.HF_TOKEN != ''
+        run: |
+          pip install huggingface_hub
+          huggingface-cli login --token $HF_TOKEN
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
       - name: Unit Test
         run: make unit-test
         env:
@@ -108,6 +115,13 @@ jobs:
         run: make docstring-test
       - name: Type check
         run: poetry run mypy .
+      - name: Authenticate HuggingFace CLI
+        if: env.HF_TOKEN != ''
+        run: |
+          pip install huggingface_hub
+          huggingface-cli login --token $HF_TOKEN
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
       - name: Test Suite with Coverage Report
         run: make coverage-report-test
         env:
@@ -198,6 +212,13 @@ jobs:
         with:
           name: test-coverage
           path: docs/source/_static/coverage
+      - name: Authenticate HuggingFace CLI
+        if: env.HF_TOKEN != ''
+        run: |
+          pip install huggingface_hub
+          huggingface-cli login --token $HF_TOKEN
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
       - name: Build Docs
         run: poetry run build-docs
         env:
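The added step is gated so that forks without the secret still pass: it only runs when HF_TOKEN is non-empty. The same conditional login can be reproduced locally in Python. A minimal sketch, assuming HF_TOKEN is exported in the shell; login() is the huggingface_hub equivalent of the huggingface-cli call above:

    import os

    from huggingface_hub import login

    # Mirror the workflow's `if: env.HF_TOKEN != ''` guard: only authenticate
    # when a token is actually present, otherwise fall back to public models.
    token = os.environ.get("HF_TOKEN", "")
    if token:
        login(token=token)  # same effect as: huggingface-cli login --token $HF_TOKEN
    else:
        print("HF_TOKEN not set; gated models will be unavailable.")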
2 changes: 1 addition & 1 deletion tests/acceptance/test_evals.py
@@ -6,7 +6,7 @@
 
 @pytest.fixture(scope="module")
 def model():
-    return HookedTransformer.from_pretrained("gpt2-small")
+    return HookedTransformer.from_pretrained("gpt2-small", device="cpu")
 
 
 def test_basic_ioi_eval(model):
46 changes: 42 additions & 4 deletions tests/acceptance/test_hooked_transformer.py
@@ -21,7 +21,29 @@
 
 PYTHIA_MODEL_NAMES = [name for name in OFFICIAL_MODEL_NAMES if name.startswith("EleutherAI/pythia")]
 
-model_names = [
+# Small subsets for basic testing
+TINY_STORIES_SMALL_MODELS = ["roneneldan/TinyStories-1M"]
+PYTHIA_SMALL_MODELS = ["EleutherAI/pythia-70m"]
+
+# Use full lists if HF_TOKEN is available, otherwise use small subsets
+TINY_STORIES_TEST_MODELS = (
+    TINY_STORIES_MODEL_NAMES if os.environ.get("HF_TOKEN", "") else TINY_STORIES_SMALL_MODELS
+)
+PYTHIA_TEST_MODELS = PYTHIA_MODEL_NAMES if os.environ.get("HF_TOKEN", "") else PYTHIA_SMALL_MODELS
+
+# Small models for basic testing
+PUBLIC_MODEL_NAMES = [
+    "attn-only-demo",
+    "gpt2-small",
+    "opt-125m",
+    "pythia-70m",
+    "tiny-stories-33M",
+    "microsoft/phi-1",
+    "google/gemma-2b",
+]
+
+# Full set of models to test
+FULL_MODEL_NAMES = [
     "attn-only-demo",
     "gpt2-small",
     "opt-125m",
@@ -42,7 +64,12 @@
     "google/gemma-2b",
     "google/gemma-7b",
 ]
+
+# Use full model list if HF_TOKEN is available, otherwise use public models only
+model_names = FULL_MODEL_NAMES if os.environ.get("HF_TOKEN", "") else PUBLIC_MODEL_NAMES
+
 text = "Hello world!"
+
 """
 # Code to regenerate loss store
 store = {}
@@ -52,7 +79,15 @@
     store[name] = loss.item()
 print(store)
 """
-loss_store = {
+
+# Loss values for minimal testing
+SMALL_LOSS_STORE = {
+    "gpt2-small": 5.331855773925781,
+    "pythia-70m": 4.659344673156738,
+}
+
+# Full set of loss values
+FULL_LOSS_STORE = {
     "attn-only-demo": 5.701841354370117,
     "gpt2-small": 5.331855773925781,
     "opt-125m": 6.159054279327393,
@@ -69,6 +104,9 @@
     "bloom-560m": 5.237126350402832,
 }
 
+# Use full store if HF_TOKEN is available, otherwise use small store
+loss_store = FULL_LOSS_STORE if os.environ.get("HF_TOKEN", "") else SMALL_LOSS_STORE
+
 no_processing = [
     ("solu-1l", 5.256411552429199),
     (
@@ -534,7 +572,7 @@ def edit_pos_embed(z, hook):
 
 
 def test_all_tinystories_models_exist():
-    for model in TINY_STORIES_MODEL_NAMES:
+    for model in TINY_STORIES_TEST_MODELS:
         try:
             AutoConfig.from_pretrained(model)
         except OSError:
@@ -545,7 +583,7 @@
 
 
 def test_all_pythia_models_exist():
-    for model in PYTHIA_MODEL_NAMES:
+    for model in PYTHIA_TEST_MODELS:
         try:
             AutoConfig.from_pretrained(model)
         except OSError:
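The env-gated lists above shrink the test matrix when no token is available. The same guard can also be applied per test. A minimal sketch of that pattern, assuming pytest; the marker name requires_hf_token is illustrative and not part of this PR:

    import os

    import pytest

    from transformer_lens import HookedTransformer

    # Skip token-dependent tests outright when no HuggingFace token is available.
    requires_hf_token = pytest.mark.skipif(
        not os.environ.get("HF_TOKEN", ""),
        reason="HF_TOKEN not set; gated models cannot be downloaded",
    )


    @requires_hf_token
    def test_gated_model_loads():
        model = HookedTransformer.from_pretrained("google/gemma-2b")
        assert model is not None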
24 changes: 21 additions & 3 deletions tests/acceptance/test_multi_gpu.py
@@ -43,7 +43,10 @@ def test_device_separation_and_cache(gpt2_medium_on_1_device, n_devices):
     loss_n_devices = model_n_devices(model_description_text, return_type="loss")
     elapsed_time_n_devices = time.time() - start_time_n_devices
 
-    gpt2_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
+    gpt2_text = (
+        "Natural language processing tasks, such as question answering, machine translation, reading comprehension, "
+        "and summarization, are typically approached with supervised learning on taskspecific datasets."
+    )
     gpt2_tokens = model_1_device.to_tokens(gpt2_text)
 
     gpt2_logits_1_device, gpt2_cache_1_device = model_1_device.run_with_cache(
@@ -55,7 +58,7 @@
 
     # Make sure the tensors in cache remain on their respective devices
     for i in range(model_n_devices.cfg.n_layers):
-        expected_device = get_best_available_device(model_n_devices.cfg.device)
+        expected_device = get_best_available_device(model_n_devices.cfg)
         cache_device = gpt2_cache_n_devices[f"blocks.{i}.mlp.hook_post"].device
         assert cache_device == expected_device
 
@@ -80,10 +83,25 @@
     assert prop_device == pytest.approx(expected_prop_device, rel=0.20)
 
     print(
-        f"Number of devices: {n_devices}, Model loss (1 device): {loss_1_device}, Model loss ({n_devices} devices): {loss_n_devices}, Time taken (1 device): {elapsed_time_1_device:.4f} seconds, Time taken ({n_devices} devices): {elapsed_time_n_devices:.4f} seconds"
+        f"Number of devices: {n_devices}, Model loss (1 device): {loss_1_device}, Model loss ({n_devices} devices): {loss_n_devices}, "
+        f"Time taken (1 device): {elapsed_time_1_device:.4f} seconds, Time taken ({n_devices} devices): {elapsed_time_n_devices:.4f} seconds"
     )
 
 
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires at least 2 CUDA devices")
+def test_load_model_on_target_device():
+    model = HookedTransformer.from_pretrained("gpt2-small", device="cuda:1")
+    assert model.cfg.device == "cuda:1"
+
+    for name, param in model.named_parameters():
+        assert param.device == torch.device(
+            "cuda:1"
+        ), f"Parameter {name} is on {param.device} instead of cuda:1"
+
+    output = model("Hello world")
+    assert output.device == torch.device("cuda:1")
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires at least 2 CUDA devices")
 def test_cache_device():
     model = HookedTransformer.from_pretrained("gpt2-small", device="cuda:1")
14 changes: 12 additions & 2 deletions tests/acceptance/test_tokenizer_special_tokens.py
@@ -1,10 +1,15 @@
+import os
+
 from transformers import AutoTokenizer
 
 import transformer_lens.loading_from_pretrained as loading
 from transformer_lens import HookedTransformer, HookedTransformerConfig
 
-# Get's tedious typing these out everytime I want to sweep over all the distinct small models
-MODEL_TESTING_LIST = [
+# Small models for basic testing
+PUBLIC_MODEL_TESTING_LIST = ["gpt2-small", "opt-125m", "pythia-70m"]
+
+# Full set of models to test when HF_TOKEN is available
+FULL_MODEL_TESTING_LIST = [
     "solu-1l",
     "gpt2-small",
     "gpt-neo-125M",
@@ -14,6 +19,11 @@
     "pythia-70m",
 ]
 
+# Use full model list if HF_TOKEN is available, otherwise use public models only
+MODEL_TESTING_LIST = (
+    FULL_MODEL_TESTING_LIST if os.environ.get("HF_TOKEN", "") else PUBLIC_MODEL_TESTING_LIST
+)
+
 
 def test_d_vocab_from_tokenizer():
     cfg = HookedTransformerConfig(
114 changes: 114 additions & 0 deletions tests/integration/model_bridge/test_bridge.py
@@ -0,0 +1,114 @@
"""Integration tests for the model bridge functionality.

This module contains tests that verify the core functionality of the model bridge,
including model initialization, text generation, hooks, and caching.
"""

import pytest
import torch

from transformer_lens.boot import boot


def test_model_initialization():
"""Test that the model can be initialized correctly."""
model_name = "gpt2" # Use a smaller model for testing
bridge = boot(model_name)

assert bridge is not None, "Bridge should be initialized"
assert bridge.tokenizer is not None, "Tokenizer should be initialized"
assert isinstance(bridge.model, torch.nn.Module), "Model should be a PyTorch module"


def test_text_generation():
"""Test basic text generation functionality."""
model_name = "gpt2" # Use a smaller model for testing
bridge = boot(model_name)

prompt = "The quick brown fox jumps over the lazy dog"
output = bridge.generate(prompt, max_new_tokens=10)

assert isinstance(output, str), "Output should be a string"
assert len(output) > len(prompt), "Generated text should be longer than the prompt"


def test_hooks():
"""Test that hooks can be added and removed correctly."""
model_name = "gpt2" # Use a smaller model for testing
bridge = boot(model_name)

# Track if hook was called
hook_called = False

def test_hook(tensor, hook):
nonlocal hook_called
hook_called = True
return tensor

# Add hook to first attention layer
hook_name = "blocks.0.attn"
bridge.blocks[0].attn.add_hook(test_hook)

# Run model
prompt = "Test prompt"
bridge.generate(prompt, max_new_tokens=1)

# Verify hook was called
assert hook_called, "Hook should have been called"

# Remove hook
bridge.blocks[0].attn.remove_hooks()
hook_called = False

# Run model again
bridge.generate(prompt, max_new_tokens=1)

# Verify hook was not called
assert not hook_called, "Hook should not have been called after removal"


def test_cache():
"""Test that the cache functionality works correctly."""
model_name = "gpt2" # Use a smaller model for testing
bridge = boot(model_name)

prompt = "Test prompt"
output, cache = bridge.run_with_cache(prompt)

# Verify output and cache
assert isinstance(output, torch.Tensor), "Output should be a tensor"
assert isinstance(cache, dict), "Cache should be a dictionary"
assert len(cache) > 0, "Cache should contain activations"

# Verify cache contains some expected keys (using actual HuggingFace model structure)
# The exact keys depend on the model architecture, but we should have some basic ones
cache_keys = list(cache.keys())
assert any("wte" in key for key in cache_keys), "Cache should contain word token embeddings"
assert any("ln_f" in key for key in cache_keys), "Cache should contain final layer norm"
assert any("lm_head" in key for key in cache_keys), "Cache should contain language model head"

# Verify that cached tensors are actually tensors
for key, value in cache.items():
assert isinstance(value, torch.Tensor), f"Cache value for {key} should be a tensor"


def test_component_access():
"""Test that model components can be accessed correctly."""
model_name = "gpt2" # Use a smaller model for testing
bridge = boot(model_name)

# Test accessing various components
assert hasattr(bridge, "embed"), "Bridge should have embed component"
assert hasattr(bridge, "blocks"), "Bridge should have blocks component"
assert hasattr(bridge, "unembed"), "Bridge should have unembed component"

# Test accessing block components
block = bridge.blocks[0]
assert hasattr(block, "attn"), "Block should have attention component"
assert hasattr(block, "mlp"), "Block should have MLP component"
assert hasattr(block, "ln1"), "Block should have first layer norm"
assert hasattr(block, "ln2"), "Block should have second layer norm"


if __name__ == "__main__":
pytest.main([__file__])
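Taken together, these tests outline the intended user workflow for the bridge. A condensed usage sketch, limited to the API surface exercised above (boot, generate, run_with_cache, and per-component add_hook/remove_hooks); exact cache keys depend on the wrapped architecture:

    import torch

    from transformer_lens.boot import boot

    bridge = boot("gpt2")

    # Plain generation through the bridged HuggingFace model.
    print(bridge.generate("The quick brown fox", max_new_tokens=10))

    # Capture an attention activation with a component-level hook.
    captured = {}

    def save_attn(tensor, hook):
        captured["blocks.0.attn"] = tensor.detach()
        return tensor

    bridge.blocks[0].attn.add_hook(save_attn)
    logits, cache = bridge.run_with_cache("Hello world")
    bridge.blocks[0].attn.remove_hooks()

    assert isinstance(logits, torch.Tensor)
    print(list(cache.keys())[:5])  # e.g. keys containing wte, ln_f, lm_head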

This file was deleted.

36 changes: 36 additions & 0 deletions tests/mocks/models.py
@@ -0,0 +1,36 @@
"""Mock models for testing."""

import torch.nn as nn


class MockGemma3Model(nn.Module):
"""A mock implementation of the Gemma 3 model architecture for testing purposes.

This mock model replicates the key architectural components of Gemma 3:
- Embedding layer (embed_tokens)
- Multiple transformer layers with:
- Input and post-attention layer norms
- Self-attention with Q, K, V, O projections
- MLP with up, gate, and down projections
- Final layer norm
"""

def __init__(self):
super().__init__()
self.model = nn.Module()
self.model.embed_tokens = nn.Embedding(1000, 512)
self.model.layers = nn.ModuleList([nn.Module() for _ in range(2)])
for layer in self.model.layers:
layer.input_layernorm = nn.LayerNorm(512)
layer.post_attention_layernorm = nn.LayerNorm(512)
layer.self_attn = nn.Module()
layer.self_attn.q_proj = nn.Linear(512, 512)
layer.self_attn.k_proj = nn.Linear(512, 512)
layer.self_attn.v_proj = nn.Linear(512, 512)
layer.self_attn.o_proj = nn.Linear(512, 512)
layer.mlp = nn.Module()
layer.mlp.up_proj = nn.Linear(512, 2048)
layer.mlp.gate_proj = nn.Linear(512, 2048)
layer.mlp.down_proj = nn.Linear(2048, 512)
self.model.norm = nn.LayerNorm(512)
self.embed_tokens = self.model.embed_tokens # For shared embedding/unembedding
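A short sketch of how the mock might be exercised in a unit test; the test name and assertions are illustrative, only the module layout comes from the file above:

    from tests.mocks.models import MockGemma3Model

    def test_mock_gemma3_layout():
        model = MockGemma3Model()

        # Two transformer layers mirroring Gemma 3's HuggingFace module paths.
        assert len(model.model.layers) == 2
        first = model.model.layers[0]
        assert first.self_attn.q_proj.in_features == 512
        assert first.mlp.down_proj.out_features == 512
        # Embedding is shared between embedding and unembedding.
        assert model.embed_tokens is model.model.embed_tokens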