From 87dc6bf7692eed2b92ed3de51bcabc0ab0fc89c4 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Thu, 26 Jun 2025 15:57:17 +0100
Subject: [PATCH 01/13] Add UV scripts guide introduction and example

- Explain what UV is and how it works with hfjobs
- Add concrete hello world example with cowsay
- Document key benefits for ML workflows
- Set up document structure for future sections
---
 docs/uv_scripts.md | 143 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 docs/uv_scripts.md

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
new file mode 100644
index 0000000..3c13344
--- /dev/null
+++ b/docs/uv_scripts.md
@@ -0,0 +1,143 @@
+# Using UV to Run Scripts with hfjobs
+
+This guide explains how to use uv to run scripts with hfjobs.
+
+## What is UV?
+
+UV is a Python package manager that can run Python scripts directly. The simplest way to use UV with hfjobs is to run any Python script:
+
+```bash
+# Run a script from a URL
+hfjobs run ghcr.io/astral-sh/uv:debian-slim uv run https://example.com/script.py
+```
+
+This works with any Python script - no special setup required!
+
+On its own, this isn't very exciting, you can also run a python script directly with Python! One of the things that makes uv more powerful is the ability to declare dependencies directly in your Python scripts, which allows you to run them without needing to install anything manually.
+
+### UV Scripts: Adding Dependencies
+
+Let's look at a simple example of a Python script with dependencies. This script relies on the `cowsay` library to print a message:
+
+```python
+# /// script
+# requires-python = ">=3.8"
+# dependencies = [
+#     "cowsay",
+# ]
+# ///
+"""A simple UV script example for hfjobs.
+This script demonstrates how UV scripts can specify their dependencies
+inline, making them perfect for running with hfjobs.
+"""
+
+import cowsay
+import sys
+
+
+def main():
+    message = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else "Hello from hfjobs!"
+    cowsay.cow(message)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+If the script is saved as `hello_world_uv.py`, you can run it locally (assuming you have uv installed) like this:
+
+```bash
+uv run hello_world_uv.py "Hello from my CLI!"
+```
+
+We can also run uv scripts via a URL:
+
+```bash
+uv run https://raw.githubusercontent.com/davanstrien/hfjobs/refs/heads/quickstart-only/docs/examples/hello_world_uv.py "Hello from my CLI, I arrived from the internet via a URL!"
+```
+
+Now, to run it on Hugging Face infrastructure using hfjobs we would simply need to instead run:
+
+<!-- TODO update URLs once examples are published -->
+
+```bash
+hfjobs run ghcr.io/astral-sh/uv:debian-slim uv run https://raw.githubusercontent.com/davanstrien/hfjobs/refs/heads/quickstart-only/docs/examples/hello_world_uv.py "Hello from hfjobs!"
+```
+
+This command runs your script on Hugging Face's infrastructure, automatically installing cowsay in an isolated environment. We'll explain how this works in detail later, but the key point is that you can run any Python script with dependencies on Hugging Face infrastructure using a single command!
+
+### Why UV Scripts + hfjobs?
+
+UV scripts solve a fundamental challenge when running code on remote infrastructure: dependency management. Instead of building Docker images or manually installing packages, UV scripts let you declare dependencies right in your Python file.
+
+**Key benefits for hfjobs users:**
+
+- **Zero setup**: Your script runs anywhere with just a URL - no Docker knowledge needed
+- **Self-contained**: Dependencies travel with your code, ensuring reproducibility
+- **Instant iteration**: Change dependencies without rebuilding containers
+- **Perfect for sharing**: Send colleagues a single command that just works
+
+**Ideal for ML workflows:**
+
+UV scripts are particularly powerful for machine learning tasks. That `train.py` script you've been working on? Add a UV header with your dependencies, and it's ready to run on GPUs with hfjobs. When your script includes a CLI (using argparse or click), you get a flexible tool that can handle different datasets, models, and hyperparameters - we'll show examples of this pattern throughout the guide.
+
+You can think of UV scripts as "portable cloud functions" - your Python script becomes a complete, runnable unit that hfjobs can execute on any hardware with one command.
+
+## Understanding UV Scripts
+
+### Script Header Format
+
+### Dependency Declaration
+
+### Python Version Requirements
+
+## Running Scripts with UV and hfjobs
+
+### Making the script available (TODO better name)
+
+- Running a public script
+- uploading script to HF
+
+### Basic Command Pattern
+
+### Choosing Docker Images
+
+### Environment Setup
+
+## Examples
+
+### Example 1: Simple Script (CPU)
+
+### Example 2: Data Processing with Dependencies
+
+### Example 3: GPU Workload with ML Libraries
+
+### Example 4: Production vLLM Example
+
+## Best Practices
+
+### Script Design for Cloud
+
+### Error Handling
+
+### Resource Management
+
+## Common Patterns
+
+### Data Input/Output
+
+### Authentication
+
+### Monitoring Progress
+
+## Debugging and Troubleshooting
+
+### Common Issues
+
+### Testing Locally vs Cloud
+
+## Reference
+
+### Quick Command Templates
+
+### Links to More Examples

From bbf764389221e4b6372d7daadc0ae420492e4e90 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Thu, 26 Jun 2025 17:19:26 +0100
Subject: [PATCH 02/13] Complete Understanding UV Scripts section

- Add uv init --script command for creating templates
- Show example output of generated script
- Focus on uv add --script as recommended approach
- Document alternative package indexes with vLLM example
- Simplify Python version requirements section
- Add links to official UV documentation
---
 docs/uv_scripts.md | 124 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index 3c13344..7842285 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -85,12 +85,132 @@ You can think of UV scripts as "portable cloud functions" - your Python script b
 
 ## Understanding UV Scripts
 
+In this section, we'll cover the basics of uv scripts. To avoid duplicating the official [uv documentation for scripts](https://docs.astral.sh/uv/guides/scripts/#declaring-script-dependencies), we'll focus on the key aspects that are relevant for running scripts with hfjobs.
+
+UV scripts are Python files that include a special header to declare dependencies and metadata. We can create a template UV script using the `uv init` command with the `--script` flag. This command initializes a new Python script with the necessary UV header:
+
+```bash
+uv init --script example.py
+```
+
+This creates a file named `example.py` with the following header:
+
+```python
+# /// script
+# requires-python = ">=3.12"
+# dependencies = []
+# ///
+```
+
 ### Script Header Format
 
+UV scripts use a special comment block at the top of your Python file to declare metadata. This header follows a specific format:
+
+```python
+# /// script
+# dependencies = [
+#     "package1",
+#     "package2",
+# ]
+# ///
+```
+
+Key points:
+
+- The header starts with `# /// script` and ends with `# ///`
+- Everything between these markers uses TOML format
+- The `dependencies` field is required (even if empty)
+- All lines must be prefixed with `#` and a space
+
+A minimal UV script looks like this:
+
+```python
+# /// script
+# dependencies = []
+# ///
+
+print("Hello, world!")
+```
+
 ### Dependency Declaration
 
+The easiest way to add dependencies to your UV script is using the `uv add` command:
+
+```bash
+# Add a single package
+uv add --script script.py numpy
+
+# Add multiple packages
+uv add --script script.py pandas polars requests
+
+# Add packages with version constraints
+uv add --script script.py "torch>=2.0" "transformers<5.0"
+
+# Add from a requirements file
+uv add --script script.py --requirements requirements.txt
+```
+
+This automatically updates your script header with the dependencies:
+
+```python
+# /// script
+# dependencies = [
+#     "numpy",
+#     "pandas",
+#     "polars",
+#     "requests",
+#     "torch>=2.0",
+#     "transformers<5.0",
+# ]
+# ///
+```
+
+**Understanding the syntax:**
+
+Dependencies work like `requirements.txt` entries:
+
+- `"numpy"` - Latest version
+- `"pandas>=2.0.0"` - Minimum version
+- `"torch==2.1.0"` - Exact version
+- `"transformers>=4.30,<5.0"` - Version range
+
+### Using alternative package indexes
+
+Quite often in an ML context, you may want to use a package index other than PyPI, such as the vLLM wheels index. You can specify an alternative index using the `--index` flag with `uv add`:
+
+```bash
+uv add --index "https://wheels.vllm.ai/nightly" --script example.py vllm
+```
+
+This will result in adding the following to your script header:
+
+```python
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "vllm",
+# ]
+#
+# [[tool.uv.index]]
+# url = "https://wheels.vllm.ai/nightly"
+# ///
+```
+
+This will let uv know to use the specified index when installing dependencies for this script.
+
+See [uv docs](https://docs.astral.sh/uv/guides/scripts/#using-alternative-package-indexes) for more details on using alternative package indexes.
+
 ### Python Version Requirements
 
+You can specify which Python version your script requires using the `requires-python` field:
+
+```python
+# /// script
+# requires-python = ">=3.8"
+# dependencies = ["numpy", "pandas"]
+# ///
+```
+
 ## Running Scripts with UV and hfjobs
 
 ### Making the script available (TODO better name)
@@ -141,3 +261,7 @@ You can think of UV scripts as "portable cloud functions" - your Python script b
 ### Quick Command Templates
 
 ### Links to More Examples
+
+```
+
+```

From 415ff7b3886778baace5c3cd777c0e8c53d5e70f Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Thu, 26 Jun 2025 18:12:25 +0100
Subject: [PATCH 03/13] draft simple uv usage guide

---
 docs/uv_scripts.md | 251 ++++++++++++++++++++++++++-------------------
 1 file changed, 144 insertions(+), 107 deletions(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index 7842285..f524566 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -83,185 +83,222 @@ UV scripts are particularly powerful for machine learning tasks. That `train.py`
 
 You can think of UV scripts as "portable cloud functions" - your Python script becomes a complete, runnable unit that hfjobs can execute on any hardware with one command.
 
-## Understanding UV Scripts
+## Getting Started
 
-In this section, we'll cover the basics of uv scripts. To avoid duplicating the official [uv documentation for scripts](https://docs.astral.sh/uv/guides/scripts/#declaring-script-dependencies), we'll focus on the key aspects that are relevant for running scripts with hfjobs.
+Let's create and run your first UV script on Hugging Face's infrastructure.
 
-UV scripts are Python files that include a special header to declare dependencies and metadata. We can create a template UV script using the `uv init` command with the `--script` flag. This command initializes a new Python script with the necessary UV header:
+### 1. Create a UV Script
+
+First, create a new UV script using the `uv init` command:
 
 ```bash
-uv init --script example.py
+uv init --script process_data.py
 ```
 
-This creates a file named `example.py` with the following header:
+This creates a template script:
 
 ```python
 # /// script
 # requires-python = ">=3.12"
 # dependencies = []
 # ///
-```
-
-### Script Header Format
 
-UV scripts use a special comment block at the top of your Python file to declare metadata. This header follows a specific format:
-
-```python
-# /// script
-# dependencies = [
-#     "package1",
-#     "package2",
-# ]
-# ///
-```
-
-Key points:
-
-- The header starts with `# /// script` and ends with `# ///`
-- Everything between these markers uses TOML format
-- The `dependencies` field is required (even if empty)
-- All lines must be prefixed with `#` and a space
-
-A minimal UV script looks like this:
-
-```python
-# /// script
-# dependencies = []
-# ///
+def main():
+    print("Hello from UV!")
 
-print("Hello, world!")
+if __name__ == "__main__":
+    main()
 ```
 
-### Dependency Declaration
+### 2. Add Dependencies
 
-The easiest way to add dependencies to your UV script is using the `uv add` command:
+Add the packages your script needs:
 
 ```bash
-# Add a single package
-uv add --script script.py numpy
-
-# Add multiple packages
-uv add --script script.py pandas polars requests
+# For data processing
+uv add --script process_data.py pandas pyarrow requests
 
-# Add packages with version constraints
-uv add --script script.py "torch>=2.0" "transformers<5.0"
-
-# Add from a requirements file
-uv add --script script.py --requirements requirements.txt
+# For machine learning
+uv add --script process_data.py torch transformers datasets
 ```
 
-This automatically updates your script header with the dependencies:
+Your script header now includes the dependencies:
 
 ```python
 # /// script
+# requires-python = ">=3.12"
 # dependencies = [
-#     "numpy",
 #     "pandas",
-#     "polars",
+#     "pyarrow",
 #     "requests",
-#     "torch>=2.0",
-#     "transformers<5.0",
+#     "torch",
+#     "transformers",
+#     "datasets",
 # ]
 # ///
 ```
 
-**Understanding the syntax:**
+### 3. Test Locally
 
-Dependencies work like `requirements.txt` entries:
+Make sure your script works:
 
-- `"numpy"` - Latest version
-- `"pandas>=2.0.0"` - Minimum version
-- `"torch==2.1.0"` - Exact version
-- `"transformers>=4.30,<5.0"` - Version range
+```bash
+uv run process_data.py
+```
 
-### Using alternative package indexes
+### 4. Upload to Hugging Face Hub
 
-Quite often in an ML context, you may want to use a package index other than PyPI, such as the vLLM wheels index. You can specify an alternative index using the `--index` flag with `uv add`:
+Create a dataset repository for your scripts:
 
 ```bash
-uv add --index "https://wheels.vllm.ai/nightly" --script example.py vllm
+# Create a dataset repo (only needed once)
+huggingface-cli repo create my-uv-scripts --type dataset
+
+# Upload your script
+huggingface-cli upload my-uv-scripts process_data.py scripts/process_data.py --repo-type dataset
 ```
 
-This will result in adding the following to your script header:
+### 5. Run with hfjobs
 
-```python
-# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-#     "vllm",
-# ]
-#
-# [[tool.uv.index]]
-# url = "https://wheels.vllm.ai/nightly"
-# ///
+Now run your script on HF infrastructure:
+
+```bash
+# CPU execution
+hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{your-username}/my-uv-scripts/raw/main/scripts/process_data.py"
+
+# GPU execution
+hfjobs run --flavor gpu-nvidia-small ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{your-username}/my-uv-scripts/raw/main/scripts/process_data.py"
 ```
 
-This will let uv know to use the specified index when installing dependencies for this script.
+That's it! Your script is running on Hugging Face's infrastructure with all dependencies automatically installed.
 
-See [uv docs](https://docs.astral.sh/uv/guides/scripts/#using-alternative-package-indexes) for more details on using alternative package indexes.
+## Key Concepts for Running UV Scripts
 
-### Python Version Requirements
+### Basic Command Pattern
 
-You can specify which Python version your script requires using the `requires-python` field:
+The pattern for running UV scripts with hfjobs is:
 
-```python
-# /// script
-# requires-python = ">=3.8"
-# dependencies = ["numpy", "pandas"]
-# ///
+```bash
+hfjobs run <docker_image> /bin/bash -c "uv run <script_url> <args>"
 ```
 
-## Running Scripts with UV and hfjobs
+For most cases, use the lightweight UV image:
+- **`ghcr.io/astral-sh/uv:debian-slim`** - Fast startup, includes UV and Python
 
-### Making the script available (TODO better name)
+### Common Options
 
-- Running a public script
-- uploading script to HF
+**Running on GPU:**
+```bash
+hfjobs run --flavor gpu-nvidia-small ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{username}/my-scripts/raw/main/train.py"
+```
 
-### Basic Command Pattern
+**Passing secrets (like HF token):**
+```bash
+hfjobs run --secret HF_TOKEN=$HF_TOKEN ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{username}/my-scripts/raw/main/upload.py"
+```
+
+**Setting environment variables:**
+```bash
+hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "export HOME=/tmp && uv run your_script.py"
+```
+
+For advanced topics like Docker image selection, environment setup, and system dependencies, see the [advanced guide](./uv_scripts_advanced.md).
 
-### Choosing Docker Images
+## Example: Process a Hugging Face Dataset
 
-### Environment Setup
+Here's a complete example that downloads and analyzes a dataset:
 
-## Examples
+```python
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "datasets",
+#     "pandas",
+# ]
+# ///
 
-### Example 1: Simple Script (CPU)
+import argparse
+from datasets import load_dataset
+import pandas as pd
 
-### Example 2: Data Processing with Dependencies
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("dataset", help="Dataset name (e.g., 'imdb')")
+    parser.add_argument("--max-samples", type=int, default=100)
+    args = parser.parse_args()
+    
+    # Load dataset
+    print(f"Loading {args.dataset}...")
+    ds = load_dataset(args.dataset, split=f"train[:{args.max_samples}]")
+    
+    # Basic analysis
+    df = pd.DataFrame(ds)
+    print(f"\nDataset shape: {df.shape}")
+    print(f"Columns: {list(df.columns)}")
+    print(f"\nFirst example:")
+    print(df.iloc[0].to_dict())
 
-### Example 3: GPU Workload with ML Libraries
+if __name__ == "__main__":
+    main()
+```
 
-### Example 4: Production vLLM Example
+Run this example:
+```bash
+# Upload to HF Hub
+huggingface-cli upload my-uv-scripts analyze.py scripts/analyze.py --repo-type dataset
 
-## Best Practices
+# Run on CPU
+hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{username}/my-uv-scripts/raw/main/scripts/analyze.py imdb --max-samples 1000"
+```
 
-### Script Design for Cloud
+## Quick Reference
 
-### Error Handling
+### Essential Commands
 
-### Resource Management
+```bash
+# Create UV script
+uv init --script myscript.py
 
-## Common Patterns
+# Add dependencies  
+uv add --script myscript.py pandas torch
 
-### Data Input/Output
+# Test locally
+uv run myscript.py
 
-### Authentication
+# Upload to HF
+huggingface-cli upload my-uv-scripts myscript.py scripts/myscript.py --repo-type dataset
 
-### Monitoring Progress
+# Run on CPU
+hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{username}/my-uv-scripts/raw/main/scripts/myscript.py"
 
-## Debugging and Troubleshooting
+# Run on GPU
+hfjobs run --flavor gpu-nvidia-small ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{username}/my-uv-scripts/raw/main/scripts/myscript.py"
 
-### Common Issues
+# With secrets
+hfjobs run --secret HF_TOKEN=$HF_TOKEN ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run your_script.py"
+```
 
-### Testing Locally vs Cloud
+### Getting Help
 
-## Reference
+- **UV documentation**: https://docs.astral.sh/uv/
+- **hfjobs documentation**: https://github.com/huggingface/hfjobs
+- **This guide (advanced topics)**: [uv_scripts_advanced.md](./uv_scripts_advanced.md)
 
-### Quick Command Templates
+## Next Steps
 
-### Links to More Examples
+You now have everything you need to run UV scripts on Hugging Face's infrastructure! Try:
 
-```
+1. Modifying the example for your use case
+2. Exploring GPU options for ML workloads  
+3. Building a collection of reusable scripts
 
-```
+Happy scripting! 🚀

From ac6f0c630cef3b25cd5f59a1910303537ef53c07 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 09:11:43 +0100
Subject: [PATCH 04/13] grammar

---
 docs/uv_scripts.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index f524566..74d9ef0 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -4,7 +4,7 @@ This guide explains how to use uv to run scripts with hfjobs.
 
 ## What is UV?
 
-UV is a Python package manager that can run Python scripts directly. The simplest way to use UV with hfjobs is to run any Python script:
+UV is a Python package manager that can run Python scripts. The simplest way to use UV with hfjobs is to run any Python script:
 
 ```bash
 # Run a script from a URL
@@ -13,7 +13,7 @@ hfjobs run ghcr.io/astral-sh/uv:debian-slim uv run https://example.com/script.py
 
 This works with any Python script - no special setup required!
 
-On its own, this isn't very exciting, you can also run a python script directly with Python! One of the things that makes uv more powerful is the ability to declare dependencies directly in your Python scripts, which allows you to run them without needing to install anything manually.
+On its own, this isn't very exciting; you can also run a Python script directly with Python! One of the features that makes UV more powerful is the ability to declare dependencies directly in your Python scripts, which allows you to run them without needing to install any dependencies manually.
 
 ### UV Scripts: Adding Dependencies
 
@@ -185,23 +185,27 @@ hfjobs run <docker_image> /bin/bash -c "uv run <script_url> <args>"
 ```
 
 For most cases, use the lightweight UV image:
+
 - **`ghcr.io/astral-sh/uv:debian-slim`** - Fast startup, includes UV and Python
 
 ### Common Options
 
 **Running on GPU:**
+
 ```bash
 hfjobs run --flavor gpu-nvidia-small ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
   "uv run https://huggingface.co/datasets/{username}/my-scripts/raw/main/train.py"
 ```
 
 **Passing secrets (like HF token):**
+
 ```bash
 hfjobs run --secret HF_TOKEN=$HF_TOKEN ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
   "uv run https://huggingface.co/datasets/{username}/my-scripts/raw/main/upload.py"
 ```
 
 **Setting environment variables:**
+
 ```bash
 hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
   "export HOME=/tmp && uv run your_script.py"
@@ -231,11 +235,11 @@ def main():
     parser.add_argument("dataset", help="Dataset name (e.g., 'imdb')")
     parser.add_argument("--max-samples", type=int, default=100)
     args = parser.parse_args()
-    
+
     # Load dataset
     print(f"Loading {args.dataset}...")
     ds = load_dataset(args.dataset, split=f"train[:{args.max_samples}]")
-    
+
     # Basic analysis
     df = pd.DataFrame(ds)
     print(f"\nDataset shape: {df.shape}")
@@ -248,6 +252,7 @@ if __name__ == "__main__":
 ```
 
 Run this example:
+
 ```bash
 # Upload to HF Hub
 huggingface-cli upload my-uv-scripts analyze.py scripts/analyze.py --repo-type dataset
@@ -265,7 +270,7 @@ hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
 # Create UV script
 uv init --script myscript.py
 
-# Add dependencies  
+# Add dependencies
 uv add --script myscript.py pandas torch
 
 # Test locally
@@ -298,7 +303,7 @@ hfjobs run --secret HF_TOKEN=$HF_TOKEN ghcr.io/astral-sh/uv:debian-slim /bin/bas
 You now have everything you need to run UV scripts on Hugging Face's infrastructure! Try:
 
 1. Modifying the example for your use case
-2. Exploring GPU options for ML workloads  
+2. Exploring GPU options for ML workloads
 3. Building a collection of reusable scripts
 
 Happy scripting! 🚀

From 2240f1770e79a11a25ac439730efd94cc95fbe04 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 09:12:13 +0100
Subject: [PATCH 05/13] grammar

---
 docs/uv_scripts.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index 74d9ef0..58cab1e 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -56,7 +56,7 @@ We can also run uv scripts via a URL:
 uv run https://raw.githubusercontent.com/davanstrien/hfjobs/refs/heads/quickstart-only/docs/examples/hello_world_uv.py "Hello from my CLI, I arrived from the internet via a URL!"
 ```
 
-Now, to run it on Hugging Face infrastructure using hfjobs we would simply need to instead run:
+Now, to run it on Hugging Face infrastructure using hfjobs we would simply need to run instead:
 
 <!-- TODO update URLs once examples are published -->
 

From e1d8e931927efe60b20efbca727fb60fcedde9e9 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 09:14:02 +0100
Subject: [PATCH 06/13] add link

---
 docs/uv_scripts.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index 58cab1e..508502a 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -17,7 +17,7 @@ On its own, this isn't very exciting; you can also run a Python script directly
 
 ### UV Scripts: Adding Dependencies
 
-Let's look at a simple example of a Python script with dependencies. This script relies on the `cowsay` library to print a message:
+Let's look at a simple example of a Python script with dependencies. This script relies on the [`cowsay`](https://pypi.org/project/cowsay/) library to print a message:
 
 ```python
 # /// script

From cc153b9c4732a3325d0148969958c48d5c33344f Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 09:15:04 +0100
Subject: [PATCH 07/13] install link

---
 docs/uv_scripts.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index 508502a..7dfcdf0 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -4,7 +4,7 @@ This guide explains how to use uv to run scripts with hfjobs.
 
 ## What is UV?
 
-UV is a Python package manager that can run Python scripts. The simplest way to use UV with hfjobs is to run any Python script:
+[UV](https://docs.astral.sh/uv) is a Python package manager that can run Python scripts. The simplest way to use UV with hfjobs is to run any Python script:
 
 ```bash
 # Run a script from a URL
@@ -15,6 +15,10 @@ This works with any Python script - no special setup required!
 
 On its own, this isn't very exciting; you can also run a Python script directly with Python! One of the features that makes UV more powerful is the ability to declare dependencies directly in your Python scripts, which allows you to run them without needing to install any dependencies manually.
 
+### Install UV
+
+See [the UV documentation](https://docs.astral.sh/uv/installation/) for up-to-date installation instructions.
+
 ### UV Scripts: Adding Dependencies
 
 Let's look at a simple example of a Python script with dependencies. This script relies on the [`cowsay`](https://pypi.org/project/cowsay/) library to print a message:

From 78a2f185a79bff8212f464f51b58067e9935b846 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 10:03:53 +0100
Subject: [PATCH 08/13] better example

---
 docs/uv_scripts.md | 63 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index 7dfcdf0..3a55285 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -62,7 +62,7 @@ uv run https://raw.githubusercontent.com/davanstrien/hfjobs/refs/heads/quickstar
 
 Now, to run it on Hugging Face infrastructure using hfjobs we would simply need to run instead:
 
-<!-- TODO update URLs once examples are published -->
+<!-- TODO: Update these URLs to point to the main branch once the examples are merged and published -->
 
 ```bash
 hfjobs run ghcr.io/astral-sh/uv:debian-slim uv run https://raw.githubusercontent.com/davanstrien/hfjobs/refs/heads/quickstart-only/docs/examples/hello_world_uv.py "Hello from hfjobs!"
@@ -188,6 +188,8 @@ The pattern for running UV scripts with hfjobs is:
 hfjobs run <docker_image> /bin/bash -c "uv run <script_url> <args>"
 ```
 
+The `/bin/bash -c` wrapper allows us to run shell commands (like setting environment variables) before executing the UV script.
+
 For most cases, use the lightweight UV image:
 
 - **`ghcr.io/astral-sh/uv:debian-slim`** - Fast startup, includes UV and Python
@@ -223,7 +225,7 @@ Here's a complete example that downloads and analyzes a dataset:
 
 ```python
 # /// script
-# requires-python = ">=3.12"
+# requires-python = ">=3.11"
 # dependencies = [
 #     "datasets",
 #     "pandas",
@@ -266,6 +268,62 @@ hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
   "uv run https://huggingface.co/datasets/{username}/my-uv-scripts/raw/main/scripts/analyze.py imdb --max-samples 1000"
 ```
 
+You should see output like:
+
+```python
+Loading imdb...
+train-00000-of-00001.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21.0M/21.0M [00:00<00:00, 64.4MB/s]
+...
+Dataset shape: (100, 2)
+Columns: ['text', 'label']
+First example:
+...
+```
+
+## Saving Your Results
+
+When your script runs on Hugging Face infrastructure, any output to stdout is displayed in your terminal. Often though, you don't just want to print results; you want to save them to a file or upload them somewhere.
+
+You can do this in a few ways:
+
+### Option 1: Use existing push_to_hub functionality
+
+The Transformers, TRL, and datasets libraries (and many more!) can push results to the Hugging Face Hub directly using their built-in `push_to_hub` functionality. This is the recommended way to save models, datasets, and other artifacts: you can use the same code you would use locally to save your results, and it will work seamlessly on Hugging Face's infrastructure.
+
+### Option 2: Upload results using the `huggingface-hub` library
+
+Add the `huggingface-hub` library to your script and upload results directly:
+
+```python
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "datasets",
+#     "pandas",
+#     "huggingface-hub",
+# ]
+# ///
+
+from huggingface_hub import HfApi
+import os
+
+# Your processing code here...
+
+# Upload results
+api = HfApi()
+api.upload_file(
+    path_or_fileobj="results.csv",
+    path_in_repo="outputs/results.csv",
+    repo_id="username/my-results",
+    repo_type="dataset",
+    token=os.environ.get("HF_TOKEN")
+)
+```
+
+### Option 3: Use a directory to store results
+
+You can also write results to a directory and then upload that directory as a dataset. For example, if you were saving multiple checkpoints or filtered versions of a dataset to an `output` directory, you could use `upload_folder` to upload it to the Hub (or use `upload_large_folder` if you are uploading a large amount of data).
+
 ## Quick Reference
 
 ### Essential Commands
@@ -300,7 +358,6 @@ hfjobs run --secret HF_TOKEN=$HF_TOKEN ghcr.io/astral-sh/uv:debian-slim /bin/bas
 
 - **UV documentation**: https://docs.astral.sh/uv/
 - **hfjobs documentation**: https://github.com/huggingface/hfjobs
-- **This guide (advanced topics)**: [uv_scripts_advanced.md](./uv_scripts_advanced.md)
 
 ## Next Steps
 

From 59db8fbcda5ebfb008ad80b7f12ea8229e050baf Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 10:17:13 +0100
Subject: [PATCH 09/13] make consistent

---
 docs/uv_scripts.md | 42 ++++++++++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index 3a55285..ac17d25 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -103,7 +103,7 @@ This creates a template script:
 
 ```python
 # /// script
-# requires-python = ">=3.12"
+# requires-python = ">=3.8"
 # dependencies = []
 # ///
 
@@ -130,7 +130,7 @@ Your script header now includes the dependencies:
 
 ```python
 # /// script
-# requires-python = ">=3.12"
+# requires-python = ">=3.8"
 # dependencies = [
 #     "pandas",
 #     "pyarrow",
@@ -169,11 +169,11 @@ Now run your script on HF infrastructure:
 ```bash
 # CPU execution
 hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
-  "uv run https://huggingface.co/datasets/{your-username}/my-uv-scripts/raw/main/scripts/process_data.py"
+  "uv run https://huggingface.co/datasets/{username}/my-uv-scripts/raw/main/scripts/process_data.py"
 
 # GPU execution
 hfjobs run --flavor gpu-nvidia-small ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
-  "uv run https://huggingface.co/datasets/{your-username}/my-uv-scripts/raw/main/scripts/process_data.py"
+  "uv run https://huggingface.co/datasets/{username}/my-uv-scripts/raw/main/scripts/process_data.py"
 ```
 
 That's it! Your script is running on Hugging Face's infrastructure with all dependencies automatically installed.
@@ -225,7 +225,7 @@ Here's a complete example that downloads and analyzes a dataset:
 
 ```python
 # /// script
-# requires-python = ">=3.11"
+# requires-python = ">=3.8"
 # dependencies = [
 #     "datasets",
 #     "pandas",
@@ -270,14 +270,17 @@ hfjobs run ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
 
 You should see output like:
 
-```python
+```
 Loading imdb...
-train-00000-of-00001.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21.0M/21.0M [00:00<00:00, 64.4MB/s]
-...
+Downloading readme: 100%|██████████| 7.83k/7.83k [00:00<00:00, 3.91MB/s]
+Downloading data: 100%|████████████| 21.0M/21.0M [00:00<00:00, 64.4MB/s]
+Generating train split: 25000 examples [00:00, 68054.94 examples/s]
+
 Dataset shape: (100, 2)
 Columns: ['text', 'label']
+
 First example:
-...
+{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967...', 'label': 0}
 ```
 
 ## Saving Your Results
@@ -296,7 +299,7 @@ Add the `huggingface-hub` library to your script and upload results directly:
 
 ```python
 # /// script
-# requires-python = ">=3.12"
+# requires-python = ">=3.8"
 # dependencies = [
 #     "datasets",
 #     "pandas",
@@ -324,6 +327,24 @@ api.upload_file(
 
 You can also write results to a directory and then upload that directory as a dataset. For example if you were saving multiple checkpoints or filtered version of a dataset to a `output` directory you could use `upload_folder` to upload to the hub (or use `upload_large_folder` if you are uploading a large amount of data).
 
+```python
+from huggingface_hub import HfApi
+import os
+
+# Your processing that creates multiple files...
+# e.g., saving to output/checkpoint1.pt, output/checkpoint2.pt, etc.
+
+# Upload the entire directory
+api = HfApi()
+api.upload_folder(
+    folder_path="./output",
+    path_in_repo="experiment_results",
+    repo_id="username/my-experiments",
+    repo_type="dataset",
+    token=os.environ.get("HF_TOKEN")
+)
+```
+
 ## Quick Reference
 
 ### Essential Commands
@@ -358,6 +379,7 @@ hfjobs run --secret HF_TOKEN=$HF_TOKEN ghcr.io/astral-sh/uv:debian-slim /bin/bas
 
 - **UV documentation**: https://docs.astral.sh/uv/
 - **hfjobs documentation**: https://github.com/huggingface/hfjobs
+- **This guide (advanced topics)**: [uv_scripts_advanced.md](./uv_scripts_advanced.md)
 
 ## Next Steps
 

From 6227756d8aa5df7b48be1adafbdf016bea69c059 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 10:18:24 +0100
Subject: [PATCH 10/13] remove link to advanced guide for now

---
 docs/uv_scripts.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/uv_scripts.md b/docs/uv_scripts.md
index ac17d25..0090516 100644
--- a/docs/uv_scripts.md
+++ b/docs/uv_scripts.md
@@ -379,7 +379,6 @@ hfjobs run --secret HF_TOKEN=$HF_TOKEN ghcr.io/astral-sh/uv:debian-slim /bin/bas
 
 - **UV documentation**: https://docs.astral.sh/uv/
 - **hfjobs documentation**: https://github.com/huggingface/hfjobs
-- **This guide (advanced topics)**: [uv_scripts_advanced.md](./uv_scripts_advanced.md)
 
 ## Next Steps
 

From ca5ce6ec78e49357ba20acf636a18a2e8a371413 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 14:25:28 +0100
Subject: [PATCH 11/13] first example scripts

---
 examples/README.md                            |  47 ++++
 examples/dataset-deduplication/README.md      | 104 +++++++
 .../dataset-deduplication/semantic-dedupe.py  | 263 ++++++++++++++++++
 3 files changed, 414 insertions(+)
 create mode 100644 examples/README.md
 create mode 100644 examples/dataset-deduplication/README.md
 create mode 100644 examples/dataset-deduplication/semantic-dedupe.py

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..6e897e4
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,47 @@
+# hfjobs Examples
+
+Production-ready examples for running workloads on Hugging Face infrastructure.
+
+## Available Examples
+
+### [Dataset Deduplication](./dataset-deduplication/)
+
+Remove duplicate samples from datasets using semantic similarity. Includes examples for cleaning training data and preventing train/test leakage.
+
+### Coming Soon
+
+- **Training** - Multi-node training examples
+- **vLLM Inference** - Run optimized inference at scale
+- **Synthetic Data Generation** - Generate high-quality synthetic datasets
+- **Data Processing Pipelines** - ETL workflows for ML data
+
+## Quick Start
+
+1. **Install hfjobs**:
+
+   ```bash
+   pip install hfjobs
+   ```
+
+2. **Set your HF token**:
+
+   ```bash
+   export HF_TOKEN=$(python -c "from huggingface_hub import HfFolder; print(HfFolder.get_token())")
+   ```
+
+3. **Browse the examples** above for your use case
+
+## Simple Examples
+
+Looking for basic hfjobs usage? Check out [docs/examples/](../docs/examples/) for pedagogical examples focused on learning the basics.
+
+## Contributing
+
+To add a new example:
+
+1. Create a task-focused directory (e.g., `model-quantization/`)
+2. Include a comprehensive README with use cases and benchmarks
+3. Provide runnable scripts with clear documentation
+4. Add performance metrics and cost estimates
+
+Each example should solve a real problem users face when scaling ML workloads.
diff --git a/examples/dataset-deduplication/README.md b/examples/dataset-deduplication/README.md
new file mode 100644
index 0000000..07cf9cc
--- /dev/null
+++ b/examples/dataset-deduplication/README.md
@@ -0,0 +1,104 @@
+# Dataset Deduplication with hfjobs
+
+Remove duplicate samples from datasets at scale using Hugging Face infrastructure.
+
+## Overview
+
+This example demonstrates how to deduplicate datasets using semantic similarity. Unlike exact matching, semantic deduplication identifies samples that have the same meaning even if worded differently.
+
+## Use Cases
+
+- **Clean training data**: Remove redundant samples that can lead to overfitting
+- **Prevent train/test leakage**: Ensure no semantic overlap between splits
+- **Improve data quality**: Remove near-duplicates while preserving diversity
+
+## Available Scripts
+
+### semantic-dedupe.py
+
+Uses [SemHash](https://github.com/MinishLab/semhash) for semantic deduplication. Supports multiple methods:
+
+- `deduplicate`: Remove semantic duplicates (default)
+- `filter_outliers`: Remove anomalous samples
+- `find_representative`: Select diverse representative samples
+
+## Running on HF Infrastructure
+
+### Prerequisites
+
+<!-- TODO make sure we are always using the same approach to tokens so we don't confuse users -->
+
+```bash
+export HF_TOKEN=$(python -c "from huggingface_hub import HfFolder; print(HfFolder.get_token())")
+```
+
+### Basic Usage
+
+<!-- TODO update URL to GitHub repo (for now) -->
+
+```bash
+hfjobs run --secret HF_TOKEN=$HF_TOKEN \
+  ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/{username}/my-scripts/raw/main/semantic-dedupe.py \
+  <dataset_id> <column> <output_repo>"
+```
+
+### Examples
+
+**Small dataset (<100k samples)**:
+
+```bash
+hfjobs run --secret HF_TOKEN=$HF_TOKEN \
+  ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/davanstrien/hfjobs-examples/raw/main/semantic-dedupe.py \
+  imdb text davanstrien/imdb-deduplicated"
+```
+
+**Large dataset (use cpu-upgrade)**:
+
+```bash
+hfjobs run --flavor cpu-upgrade --secret HF_TOKEN=$HF_TOKEN \
+  ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/davanstrien/hfjobs-examples/raw/main/semantic-dedupe.py \
+  nvidia/Nemotron-Personas persona davanstrien/Personas-deduplicated"
+```
+
+**With custom threshold**:
+
+```bash
+hfjobs run --secret HF_TOKEN=$HF_TOKEN \
+  ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/davanstrien/hfjobs-examples/raw/main/semantic-dedupe.py \
+  squad question davanstrien/squad-dedup --threshold 0.9"
+```
+
+**Filter outliers instead**:
+
+```bash
+hfjobs run --secret HF_TOKEN=$HF_TOKEN \
+  ghcr.io/astral-sh/uv:debian-slim /bin/bash -c \
+  "uv run https://huggingface.co/datasets/davanstrien/hfjobs-examples/raw/main/semantic-dedupe.py \
+  ag_news text davanstrien/ag-news-filtered --method filter_outliers"
+```
+
+## Performance Tips
+
+1. **Test with small samples first**: Use `--max-samples 1000` to verify your setup
+2. **Choose appropriate thresholds**: Lower = more aggressive deduplication
+3. **Monitor progress**: Use `hfjobs logs <job_id>` to track progress
+
+## Output
+
+The script creates a new dataset repository with:
+
+- Deduplicated dataset in parquet format
+- Dataset card with deduplication statistics
+- Metadata about the deduplication process
+
+Example output repository: [davanstrien/imdb-deduplicated](https://huggingface.co/datasets/davanstrien/imdb-deduplicated)
+
+## Cost Optimization
+
+- Semantic deduplication is CPU-bound (embedding generation)
+- GPU not required unless using custom embedding models
+- For very large datasets (>10M), consider chunking the process
diff --git a/examples/dataset-deduplication/semantic-dedupe.py b/examples/dataset-deduplication/semantic-dedupe.py
new file mode 100644
index 0000000..aff4ef3
--- /dev/null
+++ b/examples/dataset-deduplication/semantic-dedupe.py
@@ -0,0 +1,263 @@
+# /// script
+# requires-python = ">=3.9"
+# dependencies = [
+#     "semhash",
+#     "datasets",
+#     "huggingface-hub",
+#     "hf-transfer",
+#     "hf-xet",
+# ]
+# ///
+"""Deduplicate a Hugging Face dataset using SemHash.
+
+This script uses semantic deduplication to remove duplicate entries from a dataset
+based on a specified text column, then pushes the results to a new dataset repository.
+"""
+
+import argparse
+import os
+import sys
+from datetime import datetime
+from typing import Optional
+
+# Must precede the huggingface_hub/datasets imports: the flag is read at import time.
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+from datasets import Dataset, load_dataset  # noqa: E402
+from huggingface_hub import DatasetCard, login  # noqa: E402
+from semhash import SemHash  # noqa: E402
+
+HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token from the environment
+if not HF_TOKEN:  # explicit check; a bare `assert` is stripped under `python -O`
+    raise SystemExit("HF_TOKEN environment variable must be set for authentication")
+login(HF_TOKEN)
+
+
+def parse_args():
+    """Build the command-line interface and return the parsed namespace.
+
+    Three positionals are required (source dataset, text column, target repo);
+    every flag is optional and falls back to the default shown in its help.
+    """
+    cli = argparse.ArgumentParser(
+        description="Deduplicate a Hugging Face dataset using semantic similarity"
+    )
+
+    # Required positionals, registered in the order expected on the command line.
+    positional_help = {
+        "dataset_id": "Source dataset ID (e.g., 'imdb', 'squad', 'username/dataset-name')",
+        "column": "Column name to deduplicate on (e.g., 'text', 'question', 'context')",
+        "repo_id": "Target repository ID for deduplicated dataset (e.g., 'username/my-deduplicated-dataset')",
+    }
+    for arg_name, arg_help in positional_help.items():
+        cli.add_argument(arg_name, type=str, help=arg_help)
+
+    # Optional flags: each entry is (flag, keyword arguments for add_argument).
+    optional_specs = [
+        (
+            "--split",
+            dict(type=str, default="train", help="Dataset split to process (default: train)"),
+        ),
+        (
+            "--threshold",
+            dict(
+                type=float,
+                default=None,
+                help="Similarity threshold for deduplication (0-1, default: auto)",
+            ),
+        ),
+        (
+            "--method",
+            dict(
+                type=str,
+                choices=["deduplicate", "filter_outliers", "find_representative"],
+                default="deduplicate",
+                help="Deduplication method to use (default: deduplicate)",
+            ),
+        ),
+        ("--private", dict(action="store_true", help="Make the output dataset private")),
+        (
+            "--max-samples",
+            dict(type=int, default=None, help="Maximum number of samples to process (for testing)"),
+        ),
+    ]
+    for flag, options in optional_specs:
+        cli.add_argument(flag, **options)
+
+    return cli.parse_args()
+
+
+def create_dataset_card(
+    original_dataset_id: str,
+    column: str,
+    method: str,
+    duplicate_ratio: float,
+    original_size: int,
+    deduplicated_size: int,
+    threshold: Optional[float] = None,
+) -> str:
+    """Return the Markdown dataset card (YAML tag header included) for the output repo.
+
+    Omits the threshold bullet when ``threshold`` is None; reports 0% for empty input.
+    """
+    card_content = f"""---
+tags:
+- deduplicated
+- semhash
+- semantic-deduplication
+- hfjobs
+---
+
+# Deduplicated {original_dataset_id}
+
+This dataset is a deduplicated version of [{original_dataset_id}](https://huggingface.co/datasets/{original_dataset_id}) 
+using semantic deduplication with [SemHash](https://github.com/MinishLab/semhash).
+
+## Deduplication Details
+
+- **Method**: {method}
+- **Column**: `{column}`
+- **Original size**: {original_size:,} samples
+- **Deduplicated size**: {deduplicated_size:,} samples
+- **Duplicate ratio**: {duplicate_ratio:.2%}
+- **Reduction**: {((1 - deduplicated_size / original_size) if original_size else 0.0):.2%}
+"""
+    if threshold is not None:
+        card_content += f"- **Similarity threshold**: {threshold}\n"
+    card_content += f"""
+- **Date processed**: {datetime.now().strftime("%Y-%m-%d")}
+
+## How to use
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("{original_dataset_id.split("/")[-1]}-deduplicated")
+```
+
+## Processing script
+
+This dataset was created using the following script:
+
+```bash
+uv run semantic-dedupe.py {original_dataset_id} {column} <repo_id> --method {method}
+```
+
+## About semantic deduplication
+
+Unlike exact deduplication, semantic deduplication identifies and removes samples that are 
+semantically similar even if they use different words. This helps create cleaner training 
+datasets and prevents data leakage between train/test splits.
+"""
+    return card_content
+
+
+def main():
+    """Run the CLI pipeline: load, deduplicate, push the dataset, then push its card."""
+    args = parse_args()
+
+    # Check for HF token
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        print(
+            "Warning: HF_TOKEN not found in environment. You may not be able to push to private repos."
+        )
+
+    # Load dataset
+    print(f"Loading dataset '{args.dataset_id}' (split: {args.split})...")
+    try:
+        if args.max_samples:
+            dataset = load_dataset(
+                args.dataset_id, split=f"{args.split}[:{args.max_samples}]", token=token
+            )
+        else:
+            dataset = load_dataset(args.dataset_id, split=args.split, token=token)
+    except Exception as e:
+        print(f"Error loading dataset: {e}")
+        sys.exit(1)
+
+    # Validate column exists
+    if args.column not in dataset.column_names:
+        print(f"Error: Column '{args.column}' not found in dataset.")
+        print(f"Available columns: {', '.join(dataset.column_names)}")
+        sys.exit(1)
+
+    # Convert dataset to records for semhash
+    print(f"Preparing dataset for deduplication on column '{args.column}'...")
+    records = [dict(row) for row in dataset]
+    original_size = len(records)
+    print(f"Found {original_size:,} samples")
+
+    # Initialize SemHash with the specific column
+    print("Initializing SemHash with default model...")
+    semhash = SemHash.from_records(records=records, columns=[args.column])
+
+    # Apply selected method
+    print(f"Applying {args.method} method...")
+    if args.method == "deduplicate":
+        if args.threshold is not None:  # an explicit 0.0 must not fall back to auto
+            result = semhash.self_deduplicate(threshold=args.threshold)
+        else:
+            result = semhash.self_deduplicate()
+    elif args.method == "filter_outliers":
+        result = semhash.self_filter_outliers()
+    else:  # "find_representative"; argparse `choices` excludes anything else
+        result = semhash.self_find_representative()
+
+    # Get the kept records and the fraction of the input that was removed
+    deduplicated_records = result.selected
+    deduplicated_size = len(deduplicated_records)
+    # NOTE(review): size-based so it does not rely on every result type having `duplicate_ratio`
+    removed_ratio = (
+        (original_size - deduplicated_size) / original_size if original_size else 0.0
+    )
+
+    # Print statistics
+    print("\nDeduplication complete!")
+    print(f"Original size: {original_size:,}")
+    print(f"Deduplicated size: {deduplicated_size:,}")
+    print(f"Removed: {original_size - deduplicated_size:,} ({removed_ratio:.2%})")
+
+    # Create new dataset from deduplicated records
+    print("\nCreating deduplicated dataset...")
+    deduplicated_dataset = Dataset.from_list(deduplicated_records)
+
+    # Push dataset to hub first (this creates the repo)
+    print(f"\nPushing deduplicated dataset to '{args.repo_id}'...")
+    try:
+        deduplicated_dataset.push_to_hub(
+            args.repo_id,
+            private=args.private,
+            token=token,
+            commit_message=f"Add deduplicated version of {args.dataset_id}",
+        )
+        print("Dataset pushed successfully!")
+
+        # Create and push dataset card
+        print("Creating and pushing dataset card...")
+        card_content = create_dataset_card(
+            original_dataset_id=args.dataset_id,
+            column=args.column,
+            method=args.method,
+            duplicate_ratio=removed_ratio,
+            original_size=original_size,
+            deduplicated_size=deduplicated_size,
+            threshold=args.threshold,
+        )
+        card = DatasetCard(card_content)
+        card.push_to_hub(
+            repo_id=args.repo_id,
+            repo_type="dataset",
+            token=token,
+            commit_message="Add dataset card",
+        )
+        print(
+            f"\nSuccess! Dataset available at: https://huggingface.co/datasets/{args.repo_id}"
+        )
+    except Exception as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()

From eabfe64790261069c2dd331dc8039523a4eb474e Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 15:24:37 +0100
Subject: [PATCH 12/13] vllm classification example

---
 examples/text-classification/vllm-classify.py | 276 ++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100644 examples/text-classification/vllm-classify.py

diff --git a/examples/text-classification/vllm-classify.py b/examples/text-classification/vllm-classify.py
new file mode 100644
index 0000000..1c3d66d
--- /dev/null
+++ b/examples/text-classification/vllm-classify.py
@@ -0,0 +1,276 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "datasets",
+#     "httpx",
+#     "huggingface-hub",
+#     "setuptools",
+#     "toolz",
+#     "transformers",
+#     "vllm",
+# ]
+#
+# [[tool.uv.index]]
+# url = "https://wheels.vllm.ai/nightly"
+# ///
+
+import logging
+import os
+from typing import Optional
+
+import httpx
+import torch
+import torch.nn.functional as F
+from datasets import load_dataset
+from huggingface_hub import hf_hub_url, login
+from toolz import concat, partition_all, keymap
+from tqdm.auto import tqdm
+from vllm import LLM
+import vllm
+import os
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# log vllm version
+print(vllm.__version__)
+
+
+def get_model_id2label(hub_model_id: str) -> Optional[dict[int, str]]:
+    """Return the ``id2label`` map (int keys) from the model's ``config.json``, or None."""
+    # httpx does not follow redirects by default; a redirected URL would look missing.
+    config_url = hf_hub_url(hub_model_id, filename="config.json")
+    response = httpx.get(config_url, follow_redirects=True)
+    if response.status_code != 200:
+        logger.error(f"Could not fetch config.json: HTTP {response.status_code}")
+        return None
+    try:
+        data = response.json()
+        logger.info(f"Config: {data}")
+        id2label = data.get("id2label")
+        if id2label is None:
+            logger.error("id2label is not found in config.json")
+            return None
+        # JSON object keys are strings; convert them so lookups by class index work.
+        return keymap(int, id2label)
+    except Exception as e:
+        logger.error(f"Failed to parse config.json: {e}")
+        return None
+
+
+def get_top_label(output, label_map: Optional[dict[int, str]] = None):
+    """Return ``(label, confidence)`` for the most probable class of a classify output.
+
+    ``output.outputs.probs`` is taken as already-normalised probabilities (re-applying
+    softmax would flatten the confidence). ``label_map`` is keyed by class index.
+    """
+    probs = torch.tensor(output.outputs.probs)
+    top_idx = torch.argmax(probs).item()
+    top_prob = probs[top_idx].item()
+    label = label_map.get(top_idx) if label_map is not None else top_idx
+    return label, top_prob
+
+
+def format_prompts(dataset, inference_column, inference_columns, prompt_template, column_separator):
+    """Format prompts based on the provided arguments."""
+    
+    if inference_columns:
+        # Multiple columns specified
+        columns = [col.strip() for col in inference_columns.split(',')]
+        
+        # Validate columns exist
+        for col in columns:
+            if col not in dataset.column_names:
+                raise ValueError(f"Column '{col}' not found in dataset. Available: {dataset.column_names}")
+        
+        if prompt_template:
+            # Use template formatting
+            prompts = []
+            for row in dataset:
+                format_dict = {col: row[col] for col in columns}
+                try:
+                    # Replace \\n with actual newlines in the template
+                    template = prompt_template.replace('\\n', '\n')
+                    prompt = template.format(**format_dict)
+                    prompts.append(prompt)
+                except KeyError as e:
+                    raise ValueError(f"Template placeholder {e} not found in columns: {columns}")
+        else:
+            # Join columns with separator
+            prompts = [
+                column_separator.join(str(row[col]) for col in columns)
+                for row in dataset
+            ]
+    else:
+        # Single column (backward compatible)
+        if inference_column not in dataset.column_names:
+            raise ValueError(f"Column '{inference_column}' not found in dataset")
+        prompts = dataset[inference_column]
+    
+    return prompts
+
+
+def main(
+    hub_model_id: str,
+    src_dataset_hub_id: str,
+    output_dataset_hub_id: str,
+    inference_column: str = "text",
+    inference_columns: Optional[str] = None,
+    prompt_template: Optional[str] = None,
+    column_separator: str = " ",
+    batch_size: int = 10_000,
+    hf_token: Optional[str] = None,
+):
+    HF_TOKEN = hf_token or os.environ.get("HF_TOKEN")
+    if HF_TOKEN is not None:
+        login(token=HF_TOKEN)
+    else:
+        raise ValueError("HF_TOKEN is not set")
+    llm = LLM(model=hub_model_id, task="classify")
+    id2label = get_model_id2label(hub_model_id)
+    dataset = load_dataset(src_dataset_hub_id, split="train")
+    
+    # Format prompts based on arguments
+    prompts = format_prompts(dataset, inference_column, inference_columns, prompt_template, column_separator)
+    logger.info(f"Formatted {len(prompts)} prompts")
+    if prompts:
+        logger.info(f"Example prompt: {prompts[0][:200]}...")
+    all_results = []
+    all_results.extend(
+        llm.classify(batch) for batch in tqdm(list(partition_all(batch_size, prompts)))
+    )
+    outputs = list(concat(all_results))
+    if id2label is not None:
+        labels_and_probs = [get_top_label(output, id2label) for output in outputs]
+        dataset = dataset.add_column("label", [label for label, _ in labels_and_probs])
+        dataset = dataset.add_column("prob", [prob for _, prob in labels_and_probs])
+    else:
+        # just append raw label index and probs
+        dataset = dataset.add_column(
+            "label", [output.outputs.label for output in outputs]
+        )
+        dataset = dataset.add_column(
+            "prob", [output.outputs.probs for output in outputs]
+        )
+    dataset.push_to_hub(output_dataset_hub_id, token=HF_TOKEN)
+    
+    # Create and push dataset card
+    from huggingface_hub import DatasetCard
+    
+    card_content = f"""---
+tags:
+- text-classification
+- vllm
+---
+
+# {output_dataset_hub_id}
+
+This dataset was created by classifying [{src_dataset_hub_id}](https://huggingface.co/datasets/{src_dataset_hub_id}) 
+using [{hub_model_id}](https://huggingface.co/{hub_model_id}).
+
+## Prompt Format
+"""
+    
+    if inference_columns:
+        card_content += f"Columns used: `{inference_columns}`\n\n"
+        if prompt_template:
+            card_content += f"Template:\n```\n{prompt_template}\n```\n\n"
+        else:
+            card_content += f"Columns joined with: `{column_separator}`\n\n"
+    else:
+        card_content += f"Column used: `{inference_column}`\n\n"
+    
+    if id2label:
+        card_content += f"\n## Labels\n\n{', '.join([f'`{label}`' for label in id2label.values()])}\n"
+    
+    card_content += f"\n## Processing Details\n\n- Batch size: {batch_size:,}\n- Date: {os.popen('date').read().strip()}\n"
+    
+    card = DatasetCard(card_content)
+    card.push_to_hub(output_dataset_hub_id, repo_type="dataset", token=HF_TOKEN)
+    logger.info(f"Dataset and card pushed to: https://huggingface.co/datasets/{output_dataset_hub_id}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        prog="main.py",
+        description="Classify a dataset using a Hugging Face model and save results to Hugging Face Hub",
+    )
+    parser.add_argument(
+        "hub_model_id",
+        type=str,
+        help="Hugging Face model ID to use for classification",
+    )
+    parser.add_argument(
+        "src_dataset_hub_id",
+        type=str,
+        help="Source dataset ID on Hugging Face Hub",
+    )
+    parser.add_argument(
+        "output_dataset_hub_id",
+        type=str,
+        help="Output dataset ID on Hugging Face Hub",
+    )
+    parser.add_argument(
+        "--inference-column",
+        type=str,
+        default="text",
+        help="Column name containing text to classify (default: text)",
+    )
+    parser.add_argument(
+        "--inference-columns",
+        type=str,
+        help="Comma-separated list of columns to combine (e.g., 'title,abstract')"
+    )
+    parser.add_argument(
+        "--prompt-template",
+        type=str,
+        help="Template string with placeholders (e.g., 'Title: {title}\\nAbstract: {abstract}')"
+    )
+    parser.add_argument(
+        "--column-separator",
+        type=str,
+        default=" ",
+        help="Separator when joining columns without template (default: space)"
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=10_000,
+        help="Batch size for inference (default: 10000)",
+    )
+    parser.add_argument(
+        "--hf-token",
+        type=str,
+        default=None,
+        help="Hugging Face token (default: None)",
+    )
+
+    args = parser.parse_args()
+    main(
+        hub_model_id=args.hub_model_id,
+        src_dataset_hub_id=args.src_dataset_hub_id,
+        output_dataset_hub_id=args.output_dataset_hub_id,
+        inference_column=args.inference_column,
+        inference_columns=args.inference_columns,
+        prompt_template=args.prompt_template,
+        column_separator=args.column_separator,
+        batch_size=args.batch_size,
+        hf_token=args.hf_token,
+    )
+
+# hfjobs run --flavor l4x1 \
+#         --secret HF_TOKEN=hf_*** \
+#         ghcr.io/astral-sh/uv:debian \
+#         /bin/bash -c "
+#       export HOME=/tmp && \
+#       export USER=dummy && \
+#       export TORCHINDUCTOR_CACHE_DIR=/tmp/torch-inductor && \
+#       uv run https://huggingface.co/datasets/davanstrien/dataset-creation-scripts/raw/main/vllm-bert-classify-dataset/main.py \
+#         davanstrien/ModernBERT-base-is-new-arxiv-dataset \
+#         davanstrien/testarxiv \
+#         davanstrien/testarxiv-out \
+#         --inference-column prompt \
+#         --batch-size 100000" \
+#         --project vllm-classify \
+#         --name testarxiv-classify

From 4efe10a782600009aa9c89633009b5bdae4c9552 Mon Sep 17 00:00:00 2001
From: Daniel van Strien <davanstrien@gmail.com>
Date: Fri, 27 Jun 2025 15:55:11 +0100
Subject: [PATCH 13/13] add support for multiple gpus

---
 examples/text-classification/vllm-classify.py | 76 ++++++++++++-------
 1 file changed, 48 insertions(+), 28 deletions(-)

diff --git a/examples/text-classification/vllm-classify.py b/examples/text-classification/vllm-classify.py
index 1c3d66d..aee8e14 100644
--- a/examples/text-classification/vllm-classify.py
+++ b/examples/text-classification/vllm-classify.py
@@ -2,9 +2,10 @@
 # requires-python = ">=3.10"
 # dependencies = [
 #     "datasets",
+#     "hf-transfer",
+#     "hf-xet",
 #     "httpx",
 #     "huggingface-hub",
-#     "setuptools",
 #     "toolz",
 #     "transformers",
 #     "vllm",
@@ -29,6 +30,7 @@
 import vllm
 import os
 
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # log vllm version
@@ -70,18 +72,22 @@ def get_top_label(output, label_map: Optional[dict[str, str]] = None):
     return label, top_prob
 
 
-def format_prompts(dataset, inference_column, inference_columns, prompt_template, column_separator):
+def format_prompts(
+    dataset, inference_column, inference_columns, prompt_template, column_separator
+):
     """Format prompts based on the provided arguments."""
-    
+
     if inference_columns:
         # Multiple columns specified
-        columns = [col.strip() for col in inference_columns.split(',')]
-        
+        columns = [col.strip() for col in inference_columns.split(",")]
+
         # Validate columns exist
         for col in columns:
             if col not in dataset.column_names:
-                raise ValueError(f"Column '{col}' not found in dataset. Available: {dataset.column_names}")
-        
+                raise ValueError(
+                    f"Column '{col}' not found in dataset. Available: {dataset.column_names}"
+                )
+
         if prompt_template:
             # Use template formatting
             prompts = []
@@ -89,23 +95,24 @@ def format_prompts(dataset, inference_column, inference_columns, prompt_template
                 format_dict = {col: row[col] for col in columns}
                 try:
                     # Replace \\n with actual newlines in the template
-                    template = prompt_template.replace('\\n', '\n')
+                    template = prompt_template.replace("\\n", "\n")
                     prompt = template.format(**format_dict)
                     prompts.append(prompt)
                 except KeyError as e:
-                    raise ValueError(f"Template placeholder {e} not found in columns: {columns}")
+                    raise ValueError(
+                        f"Template placeholder {e} not found in columns: {columns}"
+                    ) from e
         else:
             # Join columns with separator
             prompts = [
                 column_separator.join(str(row[col]) for col in columns)
                 for row in dataset
             ]
-    else:
-        # Single column (backward compatible)
-        if inference_column not in dataset.column_names:
-            raise ValueError(f"Column '{inference_column}' not found in dataset")
+    elif inference_column in dataset.column_names:
         prompts = dataset[inference_column]
-    
+
+    else:
+        raise ValueError(f"Column '{inference_column}' not found in dataset")
     return prompts
 
 
@@ -125,12 +132,23 @@ def main(
         login(token=HF_TOKEN)
     else:
         raise ValueError("HF_TOKEN is not set")
-    llm = LLM(model=hub_model_id, task="classify")
+    # Auto-detect number of GPUs
+    num_gpus = torch.cuda.device_count()
+    logger.info(f"Detected {num_gpus} GPU(s)")
+
+    # Initialize LLM with tensor parallel size equal to number of GPUs
+    llm = LLM(
+        model=hub_model_id,
+        task="classify",
+        tensor_parallel_size=num_gpus if num_gpus > 0 else 1,
+    )
     id2label = get_model_id2label(hub_model_id)
     dataset = load_dataset(src_dataset_hub_id, split="train")
-    
+
     # Format prompts based on arguments
-    prompts = format_prompts(dataset, inference_column, inference_columns, prompt_template, column_separator)
+    prompts = format_prompts(
+        dataset, inference_column, inference_columns, prompt_template, column_separator
+    )
     logger.info(f"Formatted {len(prompts)} prompts")
     if prompts:
         logger.info(f"Example prompt: {prompts[0][:200]}...")
@@ -152,10 +170,10 @@ def main(
             "prob", [output.outputs.probs for output in outputs]
         )
     dataset.push_to_hub(output_dataset_hub_id, token=HF_TOKEN)
-    
+
     # Create and push dataset card
     from huggingface_hub import DatasetCard
-    
+
     card_content = f"""---
 tags:
 - text-classification
@@ -169,7 +187,7 @@ def main(
 
 ## Prompt Format
 """
-    
+
     if inference_columns:
         card_content += f"Columns used: `{inference_columns}`\n\n"
         if prompt_template:
@@ -178,15 +196,17 @@ def main(
             card_content += f"Columns joined with: `{column_separator}`\n\n"
     else:
         card_content += f"Column used: `{inference_column}`\n\n"
-    
+
     if id2label:
         card_content += f"\n## Labels\n\n{', '.join([f'`{label}`' for label in id2label.values()])}\n"
-    
-    card_content += f"\n## Processing Details\n\n- Batch size: {batch_size:,}\n- Date: {os.popen('date').read().strip()}\n"
-    
+
+    card_content += f"\n## Processing Details\n\n- Batch size: {batch_size:,}\n- GPUs used: {num_gpus}\n- Date: {os.popen('date').read().strip()}\n"
+
     card = DatasetCard(card_content)
     card.push_to_hub(output_dataset_hub_id, repo_type="dataset", token=HF_TOKEN)
-    logger.info(f"Dataset and card pushed to: https://huggingface.co/datasets/{output_dataset_hub_id}")
+    logger.info(
+        f"Dataset and card pushed to: https://huggingface.co/datasets/{output_dataset_hub_id}"
+    )
 
 
 if __name__ == "__main__":
@@ -220,18 +240,18 @@ def main(
     parser.add_argument(
         "--inference-columns",
         type=str,
-        help="Comma-separated list of columns to combine (e.g., 'title,abstract')"
+        help="Comma-separated list of columns to combine (e.g., 'title,abstract')",
     )
     parser.add_argument(
         "--prompt-template",
         type=str,
-        help="Template string with placeholders (e.g., 'Title: {title}\\nAbstract: {abstract}')"
+        help="Template string with placeholders (e.g., 'Title: {title}\\nAbstract: {abstract}')",
     )
     parser.add_argument(
         "--column-separator",
         type=str,
         default=" ",
-        help="Separator when joining columns without template (default: space)"
+        help="Separator when joining columns without template (default: space)",
     )
     parser.add_argument(
         "--batch-size",