diff --git a/docs/index.md b/docs/index.md index f6266980..82dd2518 100644 --- a/docs/index.md +++ b/docs/index.md @@ -69,10 +69,16 @@ we recommend creating a dedicated Python environment for each model. pip install "maestro[qwen_2_5_vl]" ``` +=== "SmolVLM2" + + ```bash + pip install "maestro[smolvlm2]" + ``` + ### CLI Kick off fine-tuning with our command-line interface, which leverages the configuration -and training routines defined in each model’s core module. Simply specify key parameters such as +and training routines defined in each model's core module. Simply specify key parameters such as the dataset location, number of epochs, batch size, optimization strategy, and metrics. === "Florence-2" @@ -108,6 +114,17 @@ the dataset location, number of epochs, batch size, optimization strategy, and m --metrics "edit_distance" ``` +=== "SmolVLM2" + + ```bash + maestro smolvlm2 train \ + --dataset "dataset/location" \ + --epochs 10 \ + --batch-size 4 \ + --optimization_strategy "lora" \ + --metrics "edit_distance" + ``` + ### Python For greater control, use the Python API to fine-tune your models. @@ -148,7 +165,6 @@ and training setup. ``` === "Qwen2.5-VL" - ```python from maestro.trainer.models.qwen_2_5_vl.core import train @@ -162,3 +178,18 @@ and training setup. train(config) ``` + +=== "SmolVLM2" + ```python + from maestro.trainer.models.smolvlm2.core import train + + config = { + "dataset": "dataset/location", + "epochs": 10, + "batch_size": 4, + "optimization_strategy": "lora", + "metrics": ["edit_distance"], + } + + train(config) + ``` diff --git a/docs/models/smolvlm2.md b/docs/models/smolvlm2.md new file mode 100644 index 00000000..cebf95f7 --- /dev/null +++ b/docs/models/smolvlm2.md @@ -0,0 +1,99 @@ +--- +comments: true +--- + +## Overview + +SmolVLM2 is a lightweight vision-language model developed by Smol AI. It offers impressive capabilities for multimodal understanding while maintaining a compact size compared to larger VLMs. 
The model excels at tasks such as image captioning, visual question answering, and object detection, making it accessible for applications with limited computational resources.
+
+Built to balance performance and efficiency, SmolVLM2 provides a valuable option for developers seeking to implement vision-language capabilities without the overhead of larger models. The 500M parameter variant delivers practical results while being significantly more resource-friendly than multi-billion parameter alternatives.
+
+## Install
+
+```bash
+pip install "maestro[smolvlm2]"
+```
+
+## Train
+
+The training routines support various optimization strategies such as LoRA, QLoRA, and freezing the vision encoder. Customize your fine-tuning process via CLI or Python to align with your dataset and task requirements.
+
+### CLI
+
+Kick off training from the command line by running the command below. Be sure to replace the dataset path and adjust the hyperparameters (such as epochs and batch size) to suit your needs.
+
+```bash
+maestro smolvlm2 train \
+    --dataset "dataset/location" \
+    --epochs 10 \
+    --batch-size 4 \
+    --optimization-strategy "qlora" \
+    --metrics "edit_distance"
+```
+
+### Python
+
+For more control, you can fine-tune SmolVLM2 using the Python API. Create a configuration dictionary with your training parameters and pass it to the train function to integrate the process into your custom workflow.
+
+```python
+from maestro.trainer.models.smolvlm2.core import train
+
+config = {
+    "dataset": "dataset/location",
+    "epochs": 10,
+    "batch_size": 4,
+    "optimization_strategy": "qlora",
+    "metrics": ["edit_distance"],
+}
+
+results = train(config)
+```
+
+## Inference
+
+Use SmolVLM2 for inference on images using either the CLI or Python API.
+ +### CLI + +```bash +maestro smolvlm2 predict \ + --image "path/to/image.jpg" \ + --prompt "Describe this image" +``` + +### Python + +```python +from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2 + +model = SmolVLM2() +result = model.generate( + images="path/to/image.jpg", + prompt="Describe this image", + max_new_tokens=512 +) + +print(result["text"]) +``` + +## Object Detection + +SmolVLM2 can perform object detection on images, identifying and localizing objects with bounding boxes. + +```python +from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2 +from maestro.trainer.models.smolvlm2.detection import result_to_detections_formatter + +model = SmolVLM2() +result = model.generate( + images="path/to/image.jpg", + prompt="Detect the following objects: person, car, dog" +) + +# Convert text output to detections format +boxes, class_ids = result_to_detections_formatter( + text=result["text"], + resolution_wh=(640, 480), + classes=["person", "car", "dog"] +) +``` diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py index 086a831b..95368685 100644 --- a/maestro/cli/introspection.py +++ b/maestro/cli/introspection.py @@ -28,6 +28,13 @@ def find_training_recipes(app: typer.Typer) -> None: except Exception: _warn_about_recipe_import_error(model_name="Qwen2.5-VL") + try: + from maestro.trainer.models.smolvlm2.entrypoint import smolvlm2_app + + app.add_typer(smolvlm2_app, name="smolvlm2") + except Exception: + _warn_about_recipe_import_error(model_name="SmolVLM2") + def _warn_about_recipe_import_error(model_name: str) -> None: disable_warnings = str2bool( diff --git a/maestro/trainer/models/smolvlm2/__init__.py b/maestro/trainer/models/smolvlm2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py new file mode 100644 index 00000000..87d1aea8 --- /dev/null +++ b/maestro/trainer/models/smolvlm2/checkpoints.py @@ -0,0 
def save_checkpoint(
    model: "AutoModelForVision2Seq", processor: "AutoProcessor", path: str, metadata: Optional[dict] = None
) -> None:
    """Persist a model, its processor and optional metadata to ``path``.

    Args:
        model: Model to save (anything exposing ``save_pretrained``).
        processor: Processor to save alongside the model.
        path: Target directory; created if it does not exist.
        metadata: Optional dictionary written as ``metadata.pt`` next to the weights.
    """
    os.makedirs(path, exist_ok=True)

    model.save_pretrained(path)
    processor.save_pretrained(path)

    if metadata is not None:
        torch.save(metadata, os.path.join(path, "metadata.pt"))


def load_checkpoint(path: str, device: str = "cuda" if torch.cuda.is_available() else "cpu") -> dict:
    """Load a checkpoint previously written by :func:`save_checkpoint`.

    Args:
        path: Checkpoint directory.
        device: Device the model is moved to after loading.

    Returns:
        Dict with keys ``model``, ``processor`` and ``metadata`` (``None`` when
        no ``metadata.pt`` file exists).
    """
    model = AutoModelForVision2Seq.from_pretrained(path)
    model.to(device)

    processor = AutoProcessor.from_pretrained(path)

    metadata_path = os.path.join(path, "metadata.pt")
    # map_location keeps metadata tensors loadable on machines without the GPU the
    # checkpoint was saved on; without it, CUDA-saved tensors fail to load on CPU-only hosts.
    metadata = torch.load(metadata_path, map_location="cpu") if os.path.exists(metadata_path) else None

    return {"model": model, "processor": processor, "metadata": metadata}
class SmolVLM2Core:
    """Core SmolVLM2 wrapper: model loading, preprocessing, generation, decoding."""

    def __init__(
        self,
        model_name: str = "smol-ai/smolvlm2-500m",
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        **kwargs,
    ):
        """
        Initialize SmolVLM2 model.

        Args:
            model_name: Name or path of the model to load
            device: Device to run the model on
            **kwargs: Additional arguments to pass to the model
        """
        self.model_name = model_name
        self.device = device

        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForVision2Seq.from_pretrained(model_name)
        self.model.to(device)

    def process_inputs(self, images: Union[str, list[str]], prompt: Optional[str] = None) -> dict:
        """Process input images and optional prompt into model-ready tensors on ``self.device``."""
        if isinstance(images, str):
            images = [images]

        return self.processor(images=images, text=prompt if prompt else "", return_tensors="pt").to(self.device)

    def generate(self, inputs: dict, max_new_tokens: int = 512, **kwargs) -> torch.Tensor:
        """Generate token ids from already-processed inputs."""
        return self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs)

    def decode_outputs(self, outputs: torch.Tensor, skip_special_tokens: bool = True) -> list[str]:
        """Decode generated token ids back to text."""
        return self.processor.batch_decode(outputs, skip_special_tokens=skip_special_tokens)


def _build_model(model_name: str, strategy: str):
    """Instantiate the base model configured for one optimization strategy.

    Supported strategies: ``qlora`` (4-bit base + LoRA adapters), ``lora``
    (full-precision base + LoRA adapters), ``freeze_vision`` (frozen vision
    encoder, language side trainable).

    Raises:
        ValueError: If ``strategy`` is not one of the supported values.
    """
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from transformers import BitsAndBytesConfig

    def _lora_config() -> "LoraConfig":
        # Shared LoRA hyperparameters for the qlora and lora paths.
        return LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

    if strategy == "qlora":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForVision2Seq.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
        model = prepare_model_for_kbit_training(model)
        return get_peft_model(model, _lora_config())

    if strategy == "lora":
        model = AutoModelForVision2Seq.from_pretrained(model_name)
        return get_peft_model(model, _lora_config())

    if strategy == "freeze_vision":
        model = AutoModelForVision2Seq.from_pretrained(model_name)
        # NOTE(review): assumes the loaded architecture exposes `vision_model` — confirm
        # for the concrete SmolVLM2 checkpoint.
        for param in model.vision_model.parameters():
            param.requires_grad = False
        return model

    raise ValueError(f"Unsupported optimization strategy: {strategy}")


def train(config: dict) -> dict:
    """
    Train SmolVLM2 model with provided configuration.

    Args:
        config: Dictionary containing training configuration
            - dataset: Path to dataset directory or file
            - epochs: Number of training epochs
            - batch_size: Training batch size
            - optimization_strategy: Strategy for optimization (qlora, lora, freeze_vision)
            - metrics: List of metrics to evaluate during training
            - output_dir: Directory to save trained model

    Returns:
        Dictionary containing training results and metrics (or an ``error`` key
        when the dataset cannot be resolved).
    """
    from transformers import TrainingArguments

    from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
    from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn

    dataset_location = resolve_dataset_path(config["dataset"])
    if dataset_location is None:
        return {"error": "Dataset not found"}

    model_name = config.get("model_name", "smol-ai/smolvlm2-500m")
    strategy = config.get("optimization_strategy", "qlora")
    model = _build_model(model_name, strategy)

    processor = AutoProcessor.from_pretrained(model_name)

    def process_batch(batch):
        # Run the processor per item; the collate functions stack the resulting tensors.
        return [processor(images=item.get("image"), text=item.get("text", ""), return_tensors="pt") for item in batch]

    batch_size = config.get("batch_size", 4)
    eval_batch_size = config.get("val_batch_size", batch_size)
    train_loader, valid_loader, test_loader = create_data_loaders(
        dataset_location=dataset_location,
        train_batch_size=batch_size,
        train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)),
        train_num_workers=config.get("num_workers", 0),
        test_batch_size=eval_batch_size,
        test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)),
        test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),
    )

    output_dir = config.get("output_dir", "./smolvlm2-finetuned")
    os.makedirs(output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=config.get("epochs", 10),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=eval_batch_size,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=100,
        save_strategy="epoch",
        save_total_limit=2,
        logging_steps=10,
        # `evaluation_strategy` was removed from TrainingArguments in transformers 4.46+
        # (this project pins transformers>=4.49.0); the supported name is `eval_strategy`.
        eval_strategy="epoch",
        load_best_model_at_end=True,
        remove_unused_columns=False,
    )

    # Loaders may legitimately be None (e.g. no validation split).
    train_dataset = train_loader.dataset if train_loader is not None else None
    eval_dataset = valid_loader.dataset if valid_loader is not None else None

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=lambda batch: train_collate_fn(process_batch(batch)),
    )

    trainer.train()

    model.save_pretrained(output_dir)
    processor.save_pretrained(output_dir)

    return {
        "model_path": output_dir,
        "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"},
        "status": "Training completed",
    }
def result_to_detections_formatter(
    text: str, resolution_wh: tuple[int, int], classes: Optional[list[str]] = None
) -> tuple[np.ndarray, np.ndarray]:
    """Parse SmolVLM2 detection text into ``(boxes, class_ids)`` arrays.

    The model emits spans like ``"... car [x1, y1, x2, y2]"``; the word
    immediately preceding each bracketed box is taken as the class name.

    Args:
        text: SmolVLM2 output text.
        resolution_wh: Target image resolution (width, height).
            NOTE(review): currently unused — coordinates are returned exactly as
            emitted; confirm whether clipping/rescaling to this resolution is intended.
        classes: Optional list of valid class names. When given, boxes whose class
            name is not in the list are dropped and ``class_ids`` indexes into it;
            when omitted, every box is kept with class id ``-1``.

    Returns:
        Tuple of (boxes, class_ids) where:
            - boxes is a float32 array of shape (N, 4) with xyxy coordinates
            - class_ids is an int32 array of shape (N,) with class IDs
    """
    # Accept integer or decimal coordinates, e.g. "[10, 20, 30, 40]" or "[10.5, 20, ...]"
    # (the original pattern matched integers only, although values were cast to float).
    box_pattern = r"\[(\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)\]"

    name_to_index = {cls_name: idx for idx, cls_name in enumerate(classes)} if classes is not None else None

    boxes_list: list[list[float]] = []
    class_ids_list: list[int] = []

    for match in re.finditer(box_pattern, text):
        coords = [float(group) for group in match.groups()]

        # The class name is the last word before the bounding box.
        text_before = text[: match.start()].strip()
        class_name = text_before.split()[-1] if text_before else "unknown"

        if name_to_index is None:
            current_class_id = -1
        elif class_name in name_to_index:
            current_class_id = name_to_index[class_name]
        else:
            continue  # drop boxes labelled with classes outside the whitelist

        boxes_list.append(coords)
        class_ids_list.append(current_class_id)

    boxes = np.array(boxes_list, dtype=np.float32).reshape(-1, 4)
    class_ids = np.array(class_ids_list, dtype=np.int32)

    return boxes, class_ids
def detections_to_text_formatter(
    xyxy: np.ndarray, class_id: np.ndarray, classes: list[str], resolution_wh: tuple[int, int]
) -> str:
    """Render detections as SmolVLM2-style text: ``"name [x1, y1, x2, y2]" ...``.

    Args:
        xyxy: Bounding boxes in xyxy format, shape (N, 4); coordinates are
            truncated to ints.
        class_id: Class IDs for each box, indexing into ``classes``.
        classes: List of class names.
        resolution_wh: Image resolution (width, height).
            NOTE(review): currently unused — confirm whether coordinates should be
            normalized to this resolution.

    Returns:
        Space-joined detection string (empty when there are no boxes).
    """
    text_parts = []
    for box, cls_idx in zip(xyxy, class_id):
        x_min, y_min, x_max, y_max = (int(coord) for coord in box)
        text_parts.append(f"{classes[cls_idx]} [{x_min}, {y_min}, {x_max}, {y_max}]")

    return " ".join(text_parts)


def format_prompt_for_detection(
    prompt: str,
    xyxy: Optional[np.ndarray] = None,
    class_id: Optional[np.ndarray] = None,
    classes: Optional[list[str]] = None,
    resolution_wh: Optional[tuple[int, int]] = None,
) -> str:
    """Append serialized detections to ``prompt`` when all detection args are given.

    Args:
        prompt: Base prompt.
        xyxy: Optional bounding boxes.
        class_id: Optional class IDs.
        classes: Optional class names.
        resolution_wh: Optional image resolution.

    Returns:
        ``"<prompt> <detections>"`` when every optional argument is provided,
        otherwise ``prompt`` unchanged.
    """
    if all(arg is not None for arg in (xyxy, class_id, classes, resolution_wh)):
        # The guard above proves none of these are None, so the original
        # per-argument `x if x is not None else ...` fallbacks were dead code.
        detection_text = detections_to_text_formatter(xyxy, class_id, classes, resolution_wh)
        return f"{prompt} {detection_text}"
    return prompt
smolvlm2_app = typer.Typer()


class SmolVLM2:
    """High-level entrypoint wrapping :class:`SmolVLM2Inference`."""

    def __init__(
        self,
        model_name: str = "smol-ai/smolvlm2-500m",
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        **kwargs,
    ):
        """Initialize SmolVLM2 by constructing the underlying inference engine."""
        self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs)

    def generate(
        self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs
    ) -> dict:
        """
        Generate text from images.

        Args:
            images: Path(s) to image(s)
            prompt: Optional prompt to guide generation
            max_new_tokens: Maximum number of tokens to generate
            **kwargs: Additional generation parameters

        Returns:
            Dictionary containing generated text and other outputs
            (see ``SmolVLM2Inference.generate`` for the exact keys).
        """
        return self.inference.generate(images=images, prompt=prompt, max_new_tokens=max_new_tokens, **kwargs)


@smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model")
def info() -> None:
    """Print basic information about the SmolVLM2 model."""
    try:
        model = SmolVLM2()
        details = model.inference.get_model_info()
        typer.echo(f"Model Name: {details['model_name']}")
        typer.echo(f"Model Size: {details['model_size']}")
        typer.echo(f"Device: {details['device']}")
        # get_model_info() exposes parameter counts, not a "tokenizer" entry;
        # the original lookup of details['tokenizer'] always raised KeyError.
        typer.echo(f"Total Parameters: {details['total_parameters']}")
        typer.echo(f"Trainable Parameters: {details['trainable_parameters']}")
    except Exception as e:
        typer.echo(f"Error retrieving model info: {e!s}", err=True)
        raise typer.Exit(code=1)


@smolvlm2_app.command(name="predict", help="Run inference on one or more images")
def predict(
    image: list[Path] = typer.Option(..., "--image", "-i", help="Path to image(s) for prediction"),
    prompt: Optional[str] = typer.Option(None, "--prompt", "-p", help="Optional prompt to guide generation"),
    max_new_tokens: int = typer.Option(512, "--max-new-tokens", help="Maximum new tokens to generate"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output file path to save results"),
) -> None:
    """Run inference on images using SmolVLM2."""
    try:
        model = SmolVLM2()
        result = model.generate(images=[str(img) for img in image], prompt=prompt, max_new_tokens=max_new_tokens)

        if output:
            import json

            # The raw "model_outputs" tensor is not JSON-serializable; persist only the text.
            with open(output, "w") as f:
                json.dump({"generated_text": result["generated_text"]}, f, indent=2)
            typer.echo(f"Results saved to {output}")
        else:
            # SmolVLM2Inference.generate returns the decoded strings under
            # "generated_text"; the original result['text'] raised KeyError.
            typer.echo(f"Generated text: {result['generated_text']}")

    except Exception as e:
        typer.echo(f"Error during prediction: {e!s}", err=True)
        raise typer.Exit(code=1)


@smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model")
def train(
    dataset: Path = typer.Option(..., "--dataset", "-d", help="Path to dataset directory or file"),
    epochs: int = typer.Option(10, "--epochs", "-e", help="Number of training epochs"),
    batch_size: int = typer.Option(4, "--batch-size", "-b", help="Training batch size"),
    optimization_strategy: str = typer.Option(
        "qlora", "--optimization-strategy", "-o", help="Optimization strategy (qlora, lora, freeze_vision)"
    ),
    metrics: list[str] = typer.Option(["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"),
    output_dir: Optional[Path] = typer.Option(None, "--output-dir", help="Directory to save trained model"),
) -> None:
    """Fine-tune the SmolVLM2 model on a dataset."""
    try:
        typer.echo("Starting SmolVLM2 fine-tuning...")

        if output_dir is None:
            import tempfile

            output_dir = Path(tempfile.mkdtemp())
            typer.echo(f"No output directory specified, using temporary directory: {output_dir}")

        # Assemble the configuration consumed by core.train.
        config = {
            "dataset": str(dataset),
            "epochs": epochs,
            "batch_size": batch_size,
            "optimization_strategy": optimization_strategy,
            "metrics": metrics,
            "output_dir": str(output_dir),
        }

        # Imported lazily to avoid circular imports with the CLI package.
        from .core import train as train_model

        results = train_model(config)

        typer.echo(f"Training complete! Model saved to {output_dir}")
        typer.echo(f"Final metrics: {results.get('metrics', {})}")

    except Exception as e:
        typer.echo(f"Error during training: {e!s}", err=True)
        raise typer.Exit(code=1)
+ + Returns: + Dictionary containing model information + """ + # Extract model size from model name (e.g., smolvlm2-500m -> 500M) + size_info = "unknown" + if "-" in self.model_name: + parts = self.model_name.split("-") + if len(parts) > 1 and parts[-1].endswith("m"): + size_info = parts[-1].upper() + + # Get total parameters + total_params = sum(p.numel() for p in self.model.parameters()) + trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad) + + return { + "model_name": self.model_name, + "model_size": size_info, + "device": self.device, + "total_parameters": f"{total_params:,}", + "trainable_parameters": f"{trainable_params:,}", + "architecture": "Vision-Language Model (VLM)", + "framework": "PyTorch/Transformers", + } + + def generate( + self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs + ) -> dict: + """ + Generate text from images. + + Args: + images: Path(s) to image(s) + prompt: Optional prompt to guide generation + max_new_tokens: Maximum number of tokens to generate + **kwargs: Additional generation parameters + + Returns: + Dictionary containing generated text and other outputs + """ + # Process inputs + inputs = self.processor(images=images, text=prompt if prompt else "", return_tensors="pt") + + # Generate + outputs = self.model.generate( + input_ids=inputs["input_ids"].to(self.device), + pixel_values=inputs["pixel_values"].to(self.device), + max_new_tokens=max_new_tokens, + **kwargs, + ) + + # Decode outputs + generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True) + + return {"generated_text": generated_text, "model_outputs": outputs} + + +def predict_with_inputs( + model: AutoModelForVision2Seq, + processor: AutoProcessor, + input_ids: torch.Tensor, + pixel_values: torch.Tensor, + device: Union[str, torch.device], + max_new_tokens: int = 512, + **kwargs, +) -> list[str]: + """ + Generate text predictions using the model. 
+ + Args: + model: The SmolVLM2 model + processor: The model's processor + input_ids: Input token IDs + pixel_values: Input image pixel values + device: Device to run inference on + max_new_tokens: Maximum number of tokens to generate + **kwargs: Additional generation parameters + + Returns: + List of generated text strings + """ + model.eval() + with torch.no_grad(): + outputs = model.generate( + input_ids=input_ids.to(device), + pixel_values=pixel_values.to(device), + max_new_tokens=max_new_tokens, + **kwargs, + ) + return processor.batch_decode(outputs, skip_special_tokens=True) + + +def predict_with_images( + model: AutoModelForVision2Seq, + processor: AutoProcessor, + images: Union[str, list[str]], + prompt: Optional[str] = None, + device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", + max_new_tokens: int = 512, + **kwargs, +) -> list[str]: + """ + Generate text predictions from images. + + Args: + model: The SmolVLM2 model + processor: The model's processor + images: Path(s) to image(s) + prompt: Optional prompt to guide generation + device: Device to run inference on + max_new_tokens: Maximum number of tokens to generate + **kwargs: Additional generation parameters + + Returns: + List of generated text strings + """ + if isinstance(images, str): + images = [images] + + inputs = processor(images=images, text=prompt if prompt else "", return_tensors="pt") + + return predict_with_inputs( + model=model, + processor=processor, + input_ids=inputs["input_ids"], + pixel_values=inputs["pixel_values"], + device=device, + max_new_tokens=max_new_tokens, + **kwargs, + ) diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py new file mode 100644 index 00000000..a99af163 --- /dev/null +++ b/maestro/trainer/models/smolvlm2/loaders.py @@ -0,0 +1,98 @@ +from typing import Optional + +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from transformers import 
class SmolVLM2Dataset(Dataset):
    """Image(/text) dataset that optionally runs the model processor per item."""

    def __init__(
        self,
        image_paths: list[str],
        texts: Optional[list[str]] = None,
        processor: Optional["AutoProcessor"] = None,
    ):
        """
        Initialize dataset.

        Args:
            image_paths: List of paths to images
            texts: Optional list of corresponding texts (parallel to ``image_paths``)
            processor: Model processor for preprocessing; when set, items are
                returned pre-tokenized instead of as raw image/text pairs
        """
        self.image_paths = image_paths
        self.texts = texts
        self.processor = processor

    def __len__(self) -> int:
        return len(self.image_paths)

    def __getitem__(self, idx: int) -> dict:
        """Return one sample, processed when a processor is configured."""
        # NOTE(review): Image.open without .convert("RGB") — confirm the processor
        # accepts non-RGB modes (e.g. grayscale or RGBA inputs).
        image = Image.open(self.image_paths[idx])
        text = self.texts[idx] if self.texts is not None else ""

        if self.processor is None:
            return {"image": image, "text": text}
        return self.processor(images=image, text=text, return_tensors="pt")


def _stack_squeezed(batch: list[dict], key: str) -> torch.Tensor:
    """Stack per-item tensors stored under ``key``, dropping their leading batch dim of 1."""
    return torch.stack([item[key].squeeze(0) for item in batch])


def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Collate processed training samples.

    Args:
        batch: List of processed samples (each with ``input_ids``,
            ``pixel_values`` and ``labels`` tensors of shape (1, ...))

    Returns:
        Tuple of (input_ids, pixel_values, labels) stacked along the batch dim
    """
    return (
        _stack_squeezed(batch, "input_ids"),
        _stack_squeezed(batch, "pixel_values"),
        _stack_squeezed(batch, "labels"),
    )


def evaluation_collate_fn(
    batch: list[dict],
) -> tuple[torch.Tensor, torch.Tensor, list, list[str], list[str]]:
    """
    Collate evaluation samples.

    NOTE(review): items must carry raw ``"image"``/``"text"`` keys in addition to
    the processed tensors; processor output alone (as produced when
    ``SmolVLM2Dataset`` is given a processor) does not include them — confirm the
    upstream pipeline attaches these fields.

    Args:
        batch: List of processed samples

    Returns:
        Tuple of (input_ids, pixel_values, images, prompts, targets)
    """
    input_ids = _stack_squeezed(batch, "input_ids")
    pixel_values = _stack_squeezed(batch, "pixel_values")
    images = [item["image"] for item in batch]
    prompts = [item["text"] for item in batch]
    # There is no separate ground-truth field yet: the target mirrors the prompt.
    targets = list(prompts)

    return input_ids, pixel_values, images, prompts, targets


def create_dataloader(
    dataset: Dataset, batch_size: int = 8, num_workers: int = 4, shuffle: bool = True, collate_fn=None
) -> DataLoader:
    """
    Create a DataLoader for the dataset.

    Args:
        dataset: Dataset to create loader for
        batch_size: Batch size
        num_workers: Number of worker processes
        shuffle: Whether to shuffle the data
        collate_fn: Optional collate function

    Returns:
        DataLoader instance
    """
    return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn)
["T201","TRY003","NPY201"] - -# Allow autofix for all enabled rules (when `--fix`) is provided. +select = ["E", "F", "I", "A", "Q", "W", "N", "T", "TRY", "UP", "C90", "RUF", "NPY"] +ignore = ["T201", "TRY003", "NPY201"] fixable = [ "A", "B", @@ -197,12 +201,15 @@ fixable = [ "TID", "TRY", "UP", - "YTT", + "YTT" ] unfixable = [] + # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -pylint.max-args = 20 + +[tool.ruff.lint.pylint] +max-args = 20 [tool.ruff.lint.flake8-quotes] inline-quotes = "double"