diff --git a/3rdparty/Automodel-workspace/Automodel b/3rdparty/Automodel-workspace/Automodel index a2db048383..2d20e33a19 160000 --- a/3rdparty/Automodel-workspace/Automodel +++ b/3rdparty/Automodel-workspace/Automodel @@ -1 +1 @@ -Subproject commit a2db048383cd54b3fafc928df4c30bf7bbf7c430 +Subproject commit 2d20e33a19d5e53a271b1403b507475e68ad14dc diff --git a/nemo_rl/models/automodel/__init__.py b/nemo_rl/models/automodel/__init__.py new file mode 100644 index 0000000000..341a77c5bc --- /dev/null +++ b/nemo_rl/models/automodel/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_rl/models/automodel/setup.py b/nemo_rl/models/automodel/setup.py new file mode 100644 index 0000000000..4724b3edca --- /dev/null +++ b/nemo_rl/models/automodel/setup.py @@ -0,0 +1,455 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from dataclasses import dataclass +from typing import Any, Optional + +import torch +from accelerate import init_empty_weights +from nemo_automodel._transformers.registry import ModelRegistry +from nemo_automodel._transformers.utils import sliding_window_overwrite +from nemo_automodel.components.config.loader import _resolve_target +from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager +from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy +from transformers import AutoConfig, AutoProcessor, AutoTokenizer, PreTrainedModel +from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM +from transformers.utils import TRANSFORMERS_CACHE + +from nemo_rl.models.automodel.types import RuntimeConfig +from nemo_rl.models.policy import PolicyConfig +from nemo_rl.models.policy.utils import configure_dynamo_cache, resolve_model_class + + +@dataclass +class ModelAndOptimizerState: + model: torch.nn.Module + model_state_dict_keys: list[str] + optimizer: Optional[torch.optim.Optimizer] + scheduler: Optional[Any] + reference_model_state_dict: Optional[dict[str, torch.Tensor]] + is_hf_model: bool + is_moe_model: bool + is_reward_model: bool + is_vlm: bool + model_class: type + model_config: Any + + +def validate_and_set_config( + config: PolicyConfig, + processor: Optional[AutoProcessor], + rank: int, +) -> RuntimeConfig: + # Set basic configuration + is_vlm = processor is not None + is_generation_colocated = None + if "generation" in config and config["generation"] is not None: + is_generation_colocated = config["generation"]["colocated"]["enabled"] + + # Set NCCL environment variable + if not is_generation_colocated: + os.environ["NCCL_CUMEM_ENABLE"] = "1" + + # Configure dynamo cache + configure_dynamo_cache() + + # Parse precision + precision_map = { + "float32": torch.float32, + "bfloat16": torch.bfloat16, + "float16": torch.float16, + } + precision = config["precision"] + if precision not in precision_map: + raise ValueError(f"Unknown precision: {precision}") + dtype = precision_map[precision] + + # Get other configuration values + cpu_offload = config["dtensor_cfg"]["cpu_offload"] + offload_optimizer_for_logprob = config.get("offload_optimizer_for_logprob", False) + max_grad_norm = config["max_grad_norm"] + enable_seq_packing = config["sequence_packing"]["enabled"] + model_name = config["model_name"] + + # Validate sequence packing + if enable_seq_packing: + if is_vlm: + raise ValueError( + "Sequence packing is not supported for VLM models. " + "Please set policy.sequence_packing.enabled = False to train VLM models." 
+ ) + print(f"[Rank {rank}] Sequence packing is enabled for model {model_name}") + print(f"[Rank {rank}] Using FlashAttention2 for sequence packing") + + # Get HF config overrides + hf_config_overrides = config.get("hf_config_overrides", {}) or {} + + # Determine attention implementation + cp_size_cfg = config["dtensor_cfg"]["context_parallel_size"] + attn_impl = ( + "flash_attention_2" + if (enable_seq_packing and cp_size_cfg == 1) + else ("sdpa" if cp_size_cfg > 1 else None) + ) + + # Load model config + model_config = AutoConfig.from_pretrained( + model_name, + torch_dtype=torch.float32, # Always load in float32 for master weights + trust_remote_code=True, + **sliding_window_overwrite(model_name), + attn_implementation=attn_impl, + **hf_config_overrides, + ) + + # Check if model supports flash attention args + allow_flash_attn_args = True + if ( + model_config.architectures[0] == "DeciLMForCausalLM" + and model_config.model_type == "nemotron-nas" + ): + allow_flash_attn_args = False + + # Determine if reward model + is_reward_model = ( + "reward_model_cfg" in config and config["reward_model_cfg"]["enabled"] + ) + + if is_reward_model: + from nemo_automodel import NeMoAutoModelForSequenceClassification + + # Validate reward model configuration + if enable_seq_packing: + raise NotImplementedError( + "Sequence packing is not supported for reward models" + ) + + rm_type = config["reward_model_cfg"]["reward_model_type"] + if rm_type == "bradley_terry": + model_class = NeMoAutoModelForSequenceClassification + if model_config.num_labels != 1: + print( + "model_config.num_labels is not 1. Setting it to 1 since this value is used as the out_features " + "for the linear head of Bradley-Terry reward models." + ) + model_config.num_labels = 1 + else: + raise ValueError(f"Unknown reward model type: {rm_type}") + else: + model_class = resolve_model_class(model_config.model_type) + + # Get parallelization sizes + tp_size = config["dtensor_cfg"].get("tensor_parallel_size", 1) + cp_size = config["dtensor_cfg"].get("context_parallel_size", 1) + ep_size = config["dtensor_cfg"].get("expert_parallel_size", 1) + dp_size = config["dtensor_cfg"].get("data_parallel_size", None) + sequence_parallel_enabled = config["dtensor_cfg"]["sequence_parallel"] + + # Validate parallelization configuration + if cp_size > 1 and enable_seq_packing: + raise ValueError( + "Context parallel is not supported for sequence packing. " + "Refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." + ) + + if sequence_parallel_enabled and tp_size == 1: + print( + "[WARNING]: sequence_parallel=True, but tp_size=1 which has no effect. " + "Enable tp_size > 1 to use sequence parallelism." + ) + elif sequence_parallel_enabled and tp_size > 1: + raise RuntimeError( + "Sequence parallel + tp_size >1 is currently broken in torch==2.8.0. " + "See https://github.com/NVIDIA-NeMo/Automodel/issues/652 for more details." 
+ ) + + return RuntimeConfig( + model_class=model_class, + model_config=model_config, + hf_config_overrides=hf_config_overrides, + allow_flash_attn_args=allow_flash_attn_args, + attn_impl=attn_impl, + dtype=dtype, + enable_seq_packing=enable_seq_packing, + max_grad_norm=max_grad_norm, + cpu_offload=cpu_offload, + offload_optimizer_for_logprob=offload_optimizer_for_logprob, + is_generation_colocated=is_generation_colocated, + ) + + +def setup_distributed( + config: PolicyConfig, + runtime_config: RuntimeConfig, +) -> FSDP2Manager: + # Initialize process group + torch.distributed.init_process_group(backend="nccl") + world_size = torch.distributed.get_world_size() + + # Extract runtime config values + dtype = runtime_config.dtype + cpu_offload = runtime_config.cpu_offload + + # Extract parallelization config from config (not runtime_config) + tp_size = config["dtensor_cfg"].get("tensor_parallel_size", 1) + cp_size = config["dtensor_cfg"].get("context_parallel_size", 1) + ep_size = config["dtensor_cfg"].get("expert_parallel_size", 1) + dp_size = config["dtensor_cfg"].get("data_parallel_size", None) + sequence_parallel_enabled = config["dtensor_cfg"]["sequence_parallel"] + + # Create FSDP2 manager + manager = FSDP2Manager( + dp_size=dp_size, + dp_replicate_size=1, + tp_size=tp_size, + cp_size=cp_size, + ep_size=ep_size, + pp_size=1, + sequence_parallel=sequence_parallel_enabled, + use_hf_tp_plan=config["dtensor_cfg"].get("use_hf_tp_plan", False), + mp_policy=MixedPrecisionPolicy( + param_dtype=dtype, + reduce_dtype=torch.float32, + output_dtype=torch.float32, + ), + offload_policy=CPUOffloadPolicy(pin_memory=False) if cpu_offload else None, + backend="nccl", + world_size=world_size, + activation_checkpointing=config["dtensor_cfg"]["activation_checkpointing"], + ) + + return manager + + +def setup_model_and_optimizer( + config: PolicyConfig, + tokenizer: AutoTokenizer, + runtime_config: RuntimeConfig, + distributed_manager: FSDP2Manager, + worker_instance: Any, + is_vlm: bool = False, + init_optimizer: bool = True, + init_reference_model: bool = True, +) -> ModelAndOptimizerState: + from typing import cast + + from nemo_automodel.components.distributed.tensor_utils import get_cpu_state_dict + from nemo_automodel.components.moe.parallelizer import ( + parallelize_model as moe_parallelize_model, + ) + + from nemo_rl.models.policy.utils import import_class_from_path + + # Extract configuration values from runtime_config + model_config = runtime_config.model_config + model_class = runtime_config.model_class + attn_impl = runtime_config.attn_impl + hf_config_overrides = runtime_config.hf_config_overrides + cpu_offload = runtime_config.cpu_offload + + # Determine is_reward_model from config + is_reward_model = ( + "reward_model_cfg" in config and config["reward_model_cfg"]["enabled"] + ) + + # Extract distributed configuration from manager + rank = torch.distributed.get_rank() + device_mesh = distributed_manager.device_mesh + moe_mesh = distributed_manager.moe_mesh + tp_size = distributed_manager.tp_size + cp_size = distributed_manager.cp_size + sequence_parallel_enabled = distributed_manager.sequence_parallel + + model_name = config["model_name"] + + print(f"[Rank {rank}] Initializing empty model for FSDP...") + + # Prepare automodel kwargs + automodel_model_kwargs = config.get("automodel_model_kwargs", {}) + if automodel_model_kwargs.get("backend", None) is not None: + backend_class = _resolve_target( + automodel_model_kwargs.get("backend", None)["_target_"] + ) + backend_kwargs = 
automodel_model_kwargs.get("backend") + backend_kwargs.pop("_target_") + backend = backend_class(**backend_kwargs) + automodel_model_kwargs["backend"] = backend + + # Initialize empty model + with init_empty_weights(): + model = model_class.from_config( + model_config, + attn_implementation=attn_impl, + torch_dtype=str(model_config.torch_dtype), + **automodel_model_kwargs, + ) + + # Store original state dict keys + model_state_dict_keys = list(model.state_dict().keys()) + + # Set pad token ID if needed + if model.config.pad_token_id is None: + model.config.pad_token_id = tokenizer.pad_token_id + + # Validate CP configuration with model type + if cp_size > 1: + if isinstance(model, Gemma3ForCausalLM): + raise ValueError( + "Context parallel is not supported for Gemma3ForCausalLM. " + "Torch context parallel has many limitations. " + "Please refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." + ) + + if tp_size > 1 and sequence_parallel_enabled: + raise ValueError( + "It's a known issue that context parallel can't be used together with sequence parallel in DTensor worker. " + "Please either set cp_size = 1 or disable sequence parallel. " + "See https://github.com/NVIDIA-NeMo/RL/issues/659 for more details." + ) + + if is_vlm: + raise ValueError( + "Context parallel is yet not supported for VLM models. Please set cp_size = 1 to train VLM models." + ) + + # Parallelize model + is_hf_model = ( + model_config.architectures[0] not in ModelRegistry.model_arch_name_to_cls + ) + is_moe_model = any(["expert" in key for key in model_state_dict_keys]) + if not isinstance(model, PreTrainedModel) and is_moe_model and not is_hf_model: + moe_parallelize_model( + model=model, + world_mesh=device_mesh, + moe_mesh=moe_mesh, + pp_enabled=False, + dp_axis_names=( + ("dp_replicate", "dp_shard_cp") + if "dp_replicate" in device_mesh.mesh_dim_names + and "dp_shard_cp" in device_mesh.mesh_dim_names + else ("dp_shard_cp",) + ), + cp_axis_name="cp", + tp_axis_name="tp", + ep_axis_name="ep", + ep_shard_axis_names=("ep_shard",), + activation_checkpointing=config["dtensor_cfg"]["activation_checkpointing"], + ) + else: + model = distributed_manager.parallelize(model) + + print(model) + + # Ensure checkpointer exists + worker_instance._ensure_checkpointer( + config_updates={ + "model_repo_id": model_name, + "dequantize_base_checkpoint": config.get( + "dequantize_base_checkpoint", False + ), + }, + checkpoint_root=None, + ) + worker_instance.checkpointer.config.model_state_dict_keys = model_state_dict_keys + + # Load base HF weights + worker_instance.checkpointer.load_base_model( + model, + device=torch.cuda.current_device(), + root_dir=hf_config_overrides.get("cache_dir", TRANSFORMERS_CACHE), + model_name=model_name, + peft_init_method=None, + load_base_model=True, + ) + + # Handle tied word embeddings + is_tied_lm_head = hasattr(model, "lm_head") and getattr( + getattr(model, "config", {}), "tie_word_embeddings", False + ) + if is_tied_lm_head: + embed_tokens_weight = None + for name, param in model.named_parameters(): + if "embed_tokens" in name and name.endswith(".weight"): + embed_tokens_weight = param + break + + if embed_tokens_weight is not None: + model.lm_head.weight = embed_tokens_weight + + # CPU offload if needed + if cpu_offload: + model = worker_instance.move_to_device(model, "cpu") + + # Initialize reference model + reference_model_state_dict = None + if init_reference_model: + reference_model_state_dict = get_cpu_state_dict( + 
model.state_dict().items(), pin_memory=True + ) + + # Initialize optimizer + optimizer = None + if init_optimizer: + optimizer_cls = import_class_from_path(config["optimizer"]["name"]) + optimizer = optimizer_cls(model.parameters(), **config["optimizer"]["kwargs"]) + + # Initialize scheduler + scheduler = None + if "scheduler" in config and optimizer is not None: + if isinstance(config["scheduler"], dict): + scheduler_cls = import_class_from_path( + cast(str, config["scheduler"]["name"]) + ) + scheduler = scheduler_cls(optimizer, **config["scheduler"]["kwargs"]) + else: + schedulers = [] + for scheduler_cfg in config["scheduler"]: + if "name" in scheduler_cfg: + schedulers.append( + import_class_from_path(scheduler_cfg["name"])( + optimizer, **scheduler_cfg["kwargs"] + ) + ) + else: + assert "milestones" in scheduler_cfg, ( + "unknown scheduler config: ", + scheduler_cfg, + ) + milestones: list[int] = scheduler_cfg["milestones"] + + scheduler = torch.optim.lr_scheduler.SequentialLR( + optimizer, schedulers, milestones + ) + elif optimizer is not None: + # Default to passthrough LR schedule + scheduler = torch.optim.lr_scheduler.LambdaLR( + optimizer, lr_lambda=lambda epoch: 1 + ) + + model_and_optimizer_state = ModelAndOptimizerState( + model=model, + model_state_dict_keys=model_state_dict_keys, + optimizer=optimizer, + scheduler=scheduler, + reference_model_state_dict=reference_model_state_dict, + is_hf_model=is_hf_model, + is_moe_model=is_moe_model, + is_reward_model=is_reward_model, + is_vlm=is_vlm, + model_class=type(model), + model_config=model.config, + ) + + return model_and_optimizer_state diff --git a/nemo_rl/models/automodel/types.py b/nemo_rl/models/automodel/types.py new file mode 100644 index 0000000000..76090b052a --- /dev/null +++ b/nemo_rl/models/automodel/types.py @@ -0,0 +1,111 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Type definitions for automodel training framework.""" + +from dataclasses import dataclass, field +from typing import Any, Callable, Optional + +import torch +from transformers import AutoConfig + +from nemo_rl.algorithms.interfaces import LossFunction +from nemo_rl.distributed.batched_data_dict import BatchedDataDict + + +@dataclass(frozen=True) +class RuntimeConfig: + """Unified runtime configuration for model training and inference. + + This replaces ValidatedState with a cleaner, more intuitive structure + that groups all runtime settings in one place. 
+ """ + + # Model loading configuration + model_class: type + model_config: AutoConfig + hf_config_overrides: dict[str, Any] + + # Attention configuration + allow_flash_attn_args: bool + attn_impl: Optional[str] + + # Training/inference settings + dtype: torch.dtype + enable_seq_packing: bool + max_grad_norm: float + + # Memory management + cpu_offload: bool = False + offload_optimizer_for_logprob: bool = False + + # Generation configuration + is_generation_colocated: Optional[bool] = None + + +@dataclass +class ProcessedInputs: + """Processed microbatch inputs ready for model forward pass. + + This structure contains all necessary tensors and metadata for a forward pass, + including context parallel buffers and flash attention configuration. + """ + + # Core inputs (always present) + input_ids: torch.Tensor + seq_len: int + + # Optional tensors (None when not applicable) + attention_mask: Optional[torch.Tensor] = None + position_ids: Optional[torch.Tensor] = None + + # Flash attention configuration + flash_attn_kwargs: dict[str, Any] = field(default_factory=dict) + + # Multimodal (VLM) inputs + vlm_kwargs: dict[str, Any] = field(default_factory=dict) + + # Context parallel support (cp_size > 1) + cp_buffers: list[torch.Tensor] = field(default_factory=list) + seq_index: Optional[torch.Tensor] = None + + @property + def has_context_parallel(self) -> bool: + """Check if context parallel is enabled.""" + return len(self.cp_buffers) > 0 + + @property + def has_flash_attention(self) -> bool: + """Check if flash attention is configured.""" + return len(self.flash_attn_kwargs) > 0 + + @property + def is_multimodal(self) -> bool: + """Check if this is a multimodal input.""" + return len(self.vlm_kwargs) > 0 + + +@dataclass +class LossInputs: + """Everything needed to compute loss. + + Groups together microbatch data, loss function, normalization factors, + and temperature processing function. 
+ """ + + microbatch: BatchedDataDict[Any] + loss_fn: LossFunction + global_valid_seqs: torch.Tensor + global_valid_toks: torch.Tensor + apply_temperature_fn: Callable[[torch.Tensor], torch.Tensor] diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 47a0c60da4..c5490c3189 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -18,14 +18,19 @@ import warnings from collections import defaultdict from contextlib import AbstractContextManager, contextmanager, nullcontext -from typing import Any, Generator, Optional, cast +from typing import Any, Generator, Optional import ray import torch import zmq -from accelerate import init_empty_weights -from nemo_automodel import ( - NeMoAutoModelForSequenceClassification, +from nemo_automodel.components.checkpoint._backports.filesystem import ( + SerializationFormat, +) +from nemo_automodel.components.checkpoint.checkpointing import ( + Checkpointer, +) +from nemo_automodel.components.checkpoint.checkpointing import ( + CheckpointingConfig as AutomodelCheckpointingConfig, ) from nemo_automodel.components.distributed.cp_utils import ( create_context_parallel_ctx, @@ -35,30 +40,16 @@ clip_grad_by_total_norm_, get_grad_norm, ) -from nemo_automodel.components.distributed.parallelizer import ( - fsdp2_strategy_parallelize, -) from nemo_automodel.components.distributed.tensor_utils import ( get_cpu_state_dict, to_local_if_dtensor, ) from torch import nn -from torch.distributed.checkpoint.state_dict import ( - StateDictOptions, - set_model_state_dict, -) -from torch.distributed.fsdp import ( - CPUOffloadPolicy, - MixedPrecisionPolicy, - OffloadPolicy, -) from torch.distributed.tensor import DTensor, Shard from transformers import ( - AutoConfig, AutoProcessor, AutoTokenizer, ) -from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM from nemo_rl.algorithms.interfaces import LossFunction, LossType from nemo_rl.algorithms.loss_functions import SequencePackingLossWrapper @@ -68,6 +59,11 @@ distributed_vocab_topk, get_logprobs_from_vocab_parallel_logits, ) +from nemo_rl.models.automodel.setup import ( + setup_distributed, + setup_model_and_optimizer, + validate_and_set_config, +) from nemo_rl.models.huggingface.common import ( get_flash_attention_kwargs, pack_sequences, @@ -79,15 +75,8 @@ ScoreOutputSpec, ) from nemo_rl.models.policy.utils import ( - configure_dynamo_cache, get_gpu_info, get_runtime_env_for_policy_worker, - import_class_from_path, - resolve_model_class, -) -from nemo_rl.utils.automodel_checkpoint import ( - load_checkpoint, - save_checkpoint, ) from nemo_rl.utils.checkpoint import CheckpointingConfig from nemo_rl.utils.nsys import wrap_with_nvtx_name @@ -120,342 +109,92 @@ def __init__( **kwargs: Any, ): """Initialize the DTensorPolicyWorkerV2.""" + # Store tokenizer and processor self.tokenizer = tokenizer self.processor = processor - self.is_vlm = processor is not None - - print(f"Initializing DTensorPolicyWorkerV2 with is_vlm={self.is_vlm}") - - self.is_generation_colocated = None - if "generation" in config and config["generation"] is not None: - self.is_generation_colocated = config["generation"]["colocated"]["enabled"] - - # Explicitly set NCCL_CUMEM_ENABLE to 1 to avoid the P2P initialization error for PyNCCLCommunicator. - # See https://github.com/NVIDIA-NeMo/RL/issues/564 for more details. 
- if not self.is_generation_colocated: - os.environ["NCCL_CUMEM_ENABLE"] = "1" - - # Disable dynamo autotune_local_cache to avoid crash when there's already a cache - # with different order of node_bundles - configure_dynamo_cache() + is_vlm = processor is not None + # Store configuration self.cfg = config - # torch distributed init. Envars for rank, world_size, and master_addr and master_port are set from the ray remote call - torch.distributed.init_process_group(backend="nccl") - self.rank = torch.distributed.get_rank() - world_size = torch.distributed.get_world_size() - model_name = self.cfg["model_name"] - - self.cpu_offload = self.cfg["dtensor_cfg"]["cpu_offload"] - self.offload_optimizer_for_logprob = self.cfg["offload_optimizer_for_logprob"] - self.max_grad_norm = self.cfg["max_grad_norm"] - - if self.cfg["precision"] == "float32": - self.dtype = torch.float32 - elif self.cfg["precision"] == "bfloat16": - self.dtype = torch.bfloat16 - elif self.cfg["precision"] == "float16": - self.dtype = torch.float16 - else: - raise ValueError(f"Unknown precision: {self.cfg['precision']}") - - print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") - self.enable_seq_packing = self.cfg["sequence_packing"]["enabled"] - if self.enable_seq_packing: - assert not self.is_vlm, ( - "Sequence packing is not supported for VLM models. Please set policy.sequence_packing.enabled = False to train VLM models." - ) - print( - f"[Rank {self.rank}] Sequence packing is enabled for model {model_name}" - ) - print(f"[Rank {self.rank}] Using FlashAttention2 for sequence packing") - - hf_config_overrides = self.cfg.get("hf_config_overrides", {}) or {} - - model_config = AutoConfig.from_pretrained( - model_name, - # Always load the model in float32 to keep master weights in float32. - # Keeping the master weights in lower precision has shown to cause issues with convergence. - torch_dtype=torch.float32, - trust_remote_code=True, - attn_implementation="flash_attention_2" - if self.enable_seq_packing - else None, - **hf_config_overrides, - ) - - self.allow_flash_attn_args = self.check_model_allow_flash_attn_args( - model_config - ) - - self._is_reward_model = ( - "reward_model_cfg" in self.cfg and self.cfg["reward_model_cfg"]["enabled"] - ) - if self._is_reward_model: - # Ensure sequence packing is disabled. - if self.enable_seq_packing: - raise NotImplementedError( - "Sequence packing is not supported for reward models" - ) - # Load model as a Reward Model. - rm_type = self.cfg["reward_model_cfg"]["reward_model_type"] - if rm_type == "bradley_terry": - model_class = NeMoAutoModelForSequenceClassification - if model_config.num_labels != 1: - # For Bradley-Terry reward models, the linear head has a single output. - # In the transformers library, the default setting for model_config.num_labels is 2 - # (https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/configuration_utils.py#L259). - # Since num_labels is used as the out_features for the linear head - # (https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/llama/modeling_llama.py#L738) - # if num_labels is not 1, we set it to 1. This change may trigger a warning that some weights are not initialized - # from the model checkpoint and are instead initialized using model_config.initializer_range - # (https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/llama/configuration_llama.py#L62). - print( - "model_config.num_labels is not 1. 
Setting it to 1 since this value is used as the out_features " - "for the linear head of Bradley-Terry reward models." - ) - model_config.num_labels = 1 - else: - raise ValueError(f"Unknown reward model type: {rm_type}") - else: - # DO NOT assume AutoModelForCausalLM, multimodal models can inherit from AutoModelForImageTextToText, AutoModelForTextToWaveform, etc. - model_class = resolve_model_class(model_config.model_type) - - full_state_dict = None - model_state_dict_keys = None - if self.rank == 0: - print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") - model = model_class.from_pretrained( - model_name, - device_map="cpu", # load weights onto CPU initially - trust_remote_code=True, - config=model_config, - use_liger_kernel=False, - torch_dtype=str(model_config.torch_dtype), - ) - full_state_dict = model.state_dict() - # Store the original model state dict keys before any parallelization - model_state_dict_keys = list(full_state_dict.keys()) - del model - - print(f"[Rank {self.rank}] Initializing empty model for FSDP...") - # All ranks initialize model on meta device, so FSDP can shard it. - # The actual weights will be broadcast from rank 0. - - with init_empty_weights(): - # NeMoAutoModelForCausalLM uses flash_attention_2 by default - # so we need to set it to None if sequence packing is disabled - # https://github.com/NVIDIA-NeMo/Automodel/blob/7e748be260651349307862426c0c168cebdeeec3/nemo_automodel/components/_transformers/auto_model.py#L180 - self.model = model_class.from_config( - model_config, - attn_implementation="flash_attention_2" - if self.enable_seq_packing - else None, - use_liger_kernel=False, - trust_remote_code=True, - torch_dtype=str(model_config.torch_dtype), - ) - - if self.model.config.pad_token_id is None: - self.model.config.pad_token_id = tokenizer.pad_token_id + # Initialize checkpointer references + self.checkpointer = None + self.checkpoint_config = None - tp_size = self.cfg["dtensor_cfg"]["tensor_parallel_size"] - cp_size = self.cfg["dtensor_cfg"]["context_parallel_size"] - if cp_size > 1 and self.enable_seq_packing: - raise ValueError( - "Context parallel is not supported for sequence packing. Refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." - ) - dp_size = world_size // tp_size // cp_size - sequence_parallel_enabled = self.cfg["dtensor_cfg"]["sequence_parallel"] - assert world_size == dp_size * tp_size * cp_size, ( - f"World size({world_size}) must equal to dp_size({dp_size}) * tp_size({tp_size}) * cp_size({cp_size}) to use DTensor" + # Validate configuration and set derived values + self.runtime_config = validate_and_set_config( + config=config, + processor=processor, + rank=0, # Temporary, will be updated after distributed init ) - if sequence_parallel_enabled and tp_size == 1: - print( - "[WARNING]: sequence_parallel=True, but tp_size=1 which has no effect. Enable tp_size > 1 to use sequence parallelism." - ) - elif sequence_parallel_enabled and tp_size > 1: - raise RuntimeError( - "Sequence parallel + tp_size >1 is currently broken in torch==2.8.0. See https://github.com/NVIDIA-NeMo/Automodel/issues/652 for more details." - ) - - if cp_size > 1: - assert not isinstance(self.model, Gemma3ForCausalLM), ( - "Context parallel is not supported for Gemma3ForCausalLM. Torch context parallel has many limitations. " - "Please refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." 
- ) - - assert not (tp_size > 1 and sequence_parallel_enabled), ( - "It's a known issue that context parallel can't be used together with sequence parallel in DTensor worker. " - "Please either set cp_size = 1 or disable sequence parallel. " - "See https://github.com/NVIDIA-NeMo/RL/issues/659 for more details." - ) + print(f"Initializing DTensorPolicyWorkerV2 with is_vlm={is_vlm}") - assert not self.is_vlm, ( - "Context parallel is yet not supported for VLM models. Please set cp_size = 1 to train VLM models." - ) - - # For FSDP2 compatibility, we need to support HSDP structure - # For now, we use dp_replicate_size = 1 (no hybrid sharding) - dp_replicate_size = 1 - dp_shard_size = dp_size - - # torch==2.8 uses LOCAL_RANK to set the device here (https://github.com/pytorch/pytorch/blob/ba56102387ef21a3b04b357e5b183d48f0afefc7/torch/distributed/device_mesh.py#L500), - # but CUDA_VISIBLE_DEVICES is set to only 1 gpu, so we need to temporarily set LOCAL_RANK to 0. - # TODO: consider changing the default LOCAL_RANK set in worker_groups.py - prev_local_rank = os.environ["LOCAL_RANK"] - os.environ["LOCAL_RANK"] = "0" - - # Create device mesh with HSDP structure for FSDP2 compatibility - device_mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", - (dp_replicate_size, dp_shard_size, cp_size, tp_size), - mesh_dim_names=("dp_replicate", "dp_shard", "cp", "tp"), - ) - os.environ["LOCAL_RANK"] = prev_local_rank - - # Create flattened submeshes for different use cases - # Flatten dp_replicate + dp_shard for the "dp" dimension (backward compatibility) - device_mesh[("dp_replicate", "dp_shard")]._flatten(mesh_dim_name="dp") - - # Flatten dp_shard + cp for FSDP2 sharding - device_mesh[("dp_shard", "cp")]._flatten(mesh_dim_name="dp_shard_cp") - - # Flatten dp_replicate + dp_shard + cp for gradient operations - device_mesh[("dp_replicate", "dp_shard", "cp")]._flatten(mesh_dim_name="dp_cp") - - # Store mesh references for backward compatibility - self.dp_cp_mesh = device_mesh["dp_cp"] - self.dp_mesh = device_mesh["dp"] - self.tp_mesh = device_mesh["tp"] - self.cp_mesh = device_mesh["cp"] - - self.dp_size = dp_size - self.tp_size = tp_size - self.cp_size = cp_size - self.device_mesh = device_mesh - - # ------------------------------------------------ - # 3) Move to GPU + Composable FSDP - # (Initialize device mesh, shard submodules, then shard entire model) - # ------------------------------------------------ - self.model = fsdp2_strategy_parallelize( - self.model, - device_mesh=self.device_mesh, - mp_policy=MixedPrecisionPolicy( - param_dtype=self.dtype, - reduce_dtype=torch.float32, - output_dtype=torch.float32, - ), - offload_policy=CPUOffloadPolicy(pin_memory=False) - if self.cpu_offload - else OffloadPolicy(), - sequence_parallel=sequence_parallel_enabled, - activation_checkpointing=self.cfg["dtensor_cfg"][ - "activation_checkpointing" - ], - tp_shard_plan=self.cfg["dtensor_cfg"]["custom_parallel_plan"], - dp_replicate_mesh_name="dp_replicate", - dp_shard_cp_mesh_name="dp_shard_cp", - tp_mesh_name="tp", + # Set up distributed environment (returns FSDP2Manager) + self.distributed_manager = setup_distributed( + config=config, + runtime_config=self.runtime_config, ) - print(f"[Rank {self.rank}] Loading state dict from rank 0...") - # This will broadcast the state dict from rank 0 to all other ranks - # and load it into the FSDP model. 
- set_model_state_dict( - self.model, - model_state_dict=full_state_dict, - options=StateDictOptions( - full_state_dict=True, - broadcast_from_rank0=True, - ), + # Set up model and optimizer + self.model_and_optimizer_state = setup_model_and_optimizer( + config=config, + tokenizer=tokenizer, + runtime_config=self.runtime_config, + distributed_manager=self.distributed_manager, + worker_instance=self, + is_vlm=is_vlm, + init_optimizer=init_optimizer, + init_reference_model=init_reference_model, ) - # Broadcast model state dict keys to all ranks and store as instance variable - keys_to_broadcast = [model_state_dict_keys] - torch.distributed.broadcast_object_list(keys_to_broadcast, src=0) - self.model_state_dict_keys = keys_to_broadcast[0] - - # Handle tied word embeddings after loading the state dict - # We need to actually tie the parameters at the model level - is_tied_lm_head = hasattr(self.model, "lm_head") and getattr( - getattr(self.model, "config", {}), "tie_word_embeddings", False - ) - if is_tied_lm_head: - embed_tokens_weight = None - for name, param in self.model.named_parameters(): - if "embed_tokens" in name and name.endswith(".weight"): - embed_tokens_weight = param - break - - if embed_tokens_weight is not None: - self.model.lm_head.weight = embed_tokens_weight - - # Manually broadcast buffers - for _, buf in self.model.named_buffers(): - torch.distributed.broadcast(to_local_if_dtensor(buf), src=0) - - if self.cpu_offload: - self.model = self.move_to_device(self.model, "cpu") - - if init_reference_model: - self.reference_model_state_dict = get_cpu_state_dict( - self.model.state_dict().items(), pin_memory=True - ) - - if init_optimizer: - optimizer_cls = import_class_from_path(self.cfg["optimizer"]["name"]) - self.optimizer = optimizer_cls( - self.model.parameters(), **self.cfg["optimizer"]["kwargs"] - ) - else: - self.optimizer = None - - if "scheduler" in self.cfg and self.optimizer is not None: - if isinstance(self.cfg["scheduler"], dict): - scheduler_cls = import_class_from_path( - cast(str, self.cfg["scheduler"]["name"]) - ) - self.scheduler = scheduler_cls( - self.optimizer, **self.cfg["scheduler"]["kwargs"] - ) - else: - schedulers = [] - for scheduler_cfg in self.cfg["scheduler"]: - if "name" in scheduler_cfg: - schedulers.append( - import_class_from_path(scheduler_cfg["name"])( - self.optimizer, **scheduler_cfg["kwargs"] - ) - ) - else: - assert "milestones" in scheduler_cfg, ( - "unknown scheduler config: ", - scheduler_cfg, - ) - milestones: list[int] = scheduler_cfg["milestones"] - - self.scheduler = torch.optim.lr_scheduler.SequentialLR( - self.optimizer, schedulers, milestones - ) - - elif self.optimizer is not None: - ## default to a passthrough LR schedule - self.scheduler = torch.optim.lr_scheduler.LambdaLR( - self.optimizer, lr_lambda=lambda epoch: 1 - ) + # Set up backward compatibility aliases + self._set_attributes() - # restore + # Load checkpoint if provided if weights_path: self.load_checkpoint(weights_path, optimizer_path) else: print( - "No weights path provided. Starting from scratch (default policy init)" + "No weights path provided. 
Loaded base HF weights via Checkpointer (default policy init)" ) + def _set_attributes(self) -> None: + # Aliases from model_and_optimizer_state + self.model = self.model_and_optimizer_state.model + self.optimizer = self.model_and_optimizer_state.optimizer + self.scheduler = self.model_and_optimizer_state.scheduler + self.reference_model_state_dict = ( + self.model_and_optimizer_state.reference_model_state_dict + ) + self.model_state_dict_keys = ( + self.model_and_optimizer_state.model_state_dict_keys + ) + self.is_vlm = self.model_and_optimizer_state.is_vlm + self._is_reward_model = self.model_and_optimizer_state.is_reward_model + + # Aliases from manager (FSDP2Manager) + self.rank = torch.distributed.get_rank() + self.device_mesh = self.distributed_manager.device_mesh + self.dp_cp_mesh = self.device_mesh["dp_cp"] + self.dp_mesh = self.device_mesh["dp"] + self.tp_mesh = self.device_mesh["tp"] + self.cp_mesh = self.device_mesh["cp"] + self.dp_size = self.distributed_manager.dp_size + self.tp_size = self.distributed_manager.tp_size + self.cp_size = self.distributed_manager.cp_size + + # Aliases from runtime_config + self.dtype = self.runtime_config.dtype + self.cpu_offload = self.runtime_config.cpu_offload + self.offload_optimizer_for_logprob = ( + self.runtime_config.offload_optimizer_for_logprob + ) + self.max_grad_norm = self.runtime_config.max_grad_norm + self.enable_seq_packing = self.runtime_config.enable_seq_packing + self.allow_flash_attn_args = self.runtime_config.allow_flash_attn_args + self.is_generation_colocated = self.runtime_config.is_generation_colocated + def _apply_temperature_scaling(self, logits: torch.Tensor) -> torch.Tensor: if "generation" in self.cfg and self.cfg["generation"] is not None: logits.div_(self.cfg["generation"]["temperature"]) @@ -1892,41 +1631,152 @@ def save_checkpoint( "save_consolidated", "is_peft", "peft_config", + "model_cache_dir", + "model_repo_id", + "is_async", + "dequantize_base_checkpoint", } } - save_checkpoint( + checkpoint_root = _infer_checkpoint_root(weights_path) + + # Ensure a persistent Checkpointer exists and is configured + self._ensure_checkpointer( + config_updates=checkpoint_kwargs, checkpoint_root=checkpoint_root + ) + + self.checkpointer.save_model( model=self.model, weights_path=weights_path, - optimizer=self.optimizer if optimizer_path else None, - scheduler=self.scheduler if optimizer_path else None, - optimizer_path=optimizer_path, - tokenizer=self.tokenizer if tokenizer_path else None, - tokenizer_path=tokenizer_path, - model_state_dict_keys=self.model_state_dict_keys, - **checkpoint_kwargs, + peft_config=checkpoint_kwargs.get("peft_config"), + tokenizer=self.tokenizer if tokenizer_path is None else None, ) + if optimizer_path and self.optimizer is not None: + self.checkpointer.save_optimizer( + optimizer=self.optimizer, + model=self.model, + weights_path=optimizer_path, + scheduler=self.model_and_optimizer_state.scheduler, + ) + + if tokenizer_path and self.tokenizer is not None: + print(f"Saving tokenizer (or processor) to {tokenizer_path}") + self.tokenizer.save_pretrained(tokenizer_path) + def load_checkpoint( self, weights_path: str, optimizer_path: Optional[str] = None, ) -> None: - """Load a checkpoint into the model.""" - load_checkpoint( + """Load a checkpoint into the model using Automodel Checkpointer.""" + print(f"Loading weights from {weights_path}") + + model_save_format, is_peft = detect_checkpoint_format(weights_path) + + weights_dir = os.path.dirname(weights_path) + checkpoint_root = ( + 
os.path.dirname(weights_dir) + if weights_dir.endswith("weights") + else weights_dir + ) + + # Ensure a persistent Checkpointer exists and is configured + self._ensure_checkpointer( + config_updates={ + "model_save_format": model_save_format, + "is_peft": is_peft, + }, + checkpoint_root=checkpoint_root, + ) + + model_dir = ( + weights_path + if weights_path.endswith("/model") + else os.path.join(weights_path, "model") + ) + + self.checkpointer.load_model( model=self.model, - weights_path=weights_path, - optimizer=self.optimizer if optimizer_path else None, - scheduler=self.scheduler if optimizer_path else None, - optimizer_path=optimizer_path, + model_path=model_dir, ) + if optimizer_path and self.optimizer is not None: + self.checkpointer.load_optimizer( + optimizer=self.optimizer, + model=self.model, + weights_path=optimizer_path, + scheduler=self.model_and_optimizer_state.scheduler, + ) + + def _ensure_checkpointer( + self, config_updates=None, checkpoint_root: Optional[str] = None + ) -> None: + """Create or update a persistent Automodel Checkpointer bound to this worker's ranks. + + Args: + config_updates: Dict of CheckpointingConfig fields to update. + checkpoint_root: Optional root directory for checkpoints. + """ + if config_updates is None: + config_updates = {} + + # Compute dp/tp ranks from the FSDP2 manager's device mesh + dp_rank = torch.distributed.get_rank(self.distributed_manager.device_mesh["dp"].get_group()) + tp_rank = torch.distributed.get_rank(self.distributed_manager.device_mesh["tp"].get_group()) + pp_rank = 0 + + if self.checkpointer is None: + # Initialize a base config with sensible defaults + base_cfg = AutomodelCheckpointingConfig( + enabled=True, + checkpoint_dir=checkpoint_root or "", + model_save_format=config_updates.get( + "model_save_format", "safetensors" + ), + model_cache_dir=config_updates.get("model_cache_dir", ""), + model_repo_id=config_updates.get("model_repo_id", ""), + save_consolidated=config_updates.get("save_consolidated", False), + is_peft=config_updates.get("is_peft", False), + model_state_dict_keys=getattr(self, "model_state_dict_keys", None), + is_async=config_updates.get("is_async", False), + dequantize_base_checkpoint=config_updates.get( + "dequantize_base_checkpoint", False + ), + ) + self.checkpoint_config = base_cfg + self.checkpointer = Checkpointer( + config=base_cfg, + dp_rank=dp_rank, + tp_rank=tp_rank, + pp_rank=pp_rank, + moe_mesh=self.distributed_manager.moe_mesh, + ) + else: + # Update mutable config fields on the existing instance + cfg = self.checkpointer.config + if checkpoint_root is not None: + cfg.checkpoint_dir = checkpoint_root + for k, v in config_updates.items(): + if k == "model_save_format": + # Ensure enum type + v = SerializationFormat[v.upper()] if isinstance(v, str) else v + setattr(cfg, k, v) + # Ensure model_state_dict_keys is current + if getattr(self, "model_state_dict_keys", None) is not None: + cfg.model_state_dict_keys = ( + self.model_and_optimizer_state.model_state_dict_keys + ) + def shutdown(self) -> None: """Shutdown the policy.""" # Clean up extension resources like ZMQ sockets if hasattr(self, "zmq_socket"): self.zmq_socket.close() self.zmq_context.term() + # Close checkpointer resources + if hasattr(self, "checkpointer") and self.checkpointer is not None: + self.checkpointer.close() def start_gpu_profiling(self) -> None: """Start GPU profiling.""" @@ -1941,3 +1791,56 @@ def report_node_ip_and_gpu_id(self) -> list[tuple[str, int]]: ip = ray._private.services.get_node_ip_address() gpu_id = ray.get_gpu_ids()[0] return (ip, gpu_id) + + +def 
detect_checkpoint_format(weights_path: str) -> tuple[str, bool]: + """Detect model save format and PEFT status from checkpoint directory. + + Args: + weights_path: Path to the checkpoint directory (e.g., weights/model) + + Returns: + tuple: (model_save_format, is_peft) where: + model_save_format is "torch_save" for DCP or "safetensors" for safetensors + is_peft is True if PEFT/adapter patterns are detected + """ + is_peft = False + model_save_format = "safetensors" + try: + # Iterate through all subdirectories and files recursively + all_files = [] + for root, dirs, files in os.walk(weights_path): + all_files.extend(files) + + if any(f.endswith(".distcp") for f in all_files): + model_save_format = "torch_save" + elif any(f.endswith(".safetensors") for f in all_files): + model_save_format = "safetensors" + elif any(f.endswith((".bin", ".pt", ".pth")) for f in all_files): + model_save_format = "torch_save" + + if not is_peft: + is_peft = any("adapter" in f.lower() for f in all_files) + + except (OSError, PermissionError): + pass + + return model_save_format, is_peft + + +def _infer_checkpoint_root(weights_path: str) -> str: + """Infer checkpoint root directory from weights path. + + When weights_path ends with "…/weights/model", we need the parent of + the weights directory (the checkpoint root), not the weights directory itself. + + Args: + weights_path: Path to model weights (e.g., "/path/to/policy/weights/model") + + Returns: + str: Checkpoint root directory (e.g., "/path/to/policy") + """ + weights_dir = os.path.dirname(weights_path) + if weights_dir.endswith("weights"): + return os.path.dirname(weights_dir) + return weights_dir diff --git a/pyrefly.toml b/pyrefly.toml index a1d64ad6fa..0419d89591 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -89,6 +89,7 @@ project-includes = [ "nemo_rl/experience/__init__.py", "nemo_rl/experience/rollouts.py", "nemo_rl/models/__init__.py", + "nemo_rl/models/automodel/__init__.py", "nemo_rl/models/dtensor/__init__.py", "nemo_rl/models/dtensor/parallelize.py", "nemo_rl/models/generation/__init__.py", diff --git a/tests/unit/models/automodel/__init__.py b/tests/unit/models/automodel/__init__.py new file mode 100644 index 0000000000..341a77c5bc --- /dev/null +++ b/tests/unit/models/automodel/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/models/automodel/test_automodel_setup.py b/tests/unit/models/automodel/test_automodel_setup.py new file mode 100644 index 0000000000..fc856e008c --- /dev/null +++ b/tests/unit/models/automodel/test_automodel_setup.py @@ -0,0 +1,1033 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from unittest.mock import MagicMock, Mock, patch + +import pytest + +pytest_plugins = [] +try: + import nemo_automodel # noqa: F401 +except ImportError: + pytest.skip("nemo_automodel not available", allow_module_level=True) + +import torch +from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager + +from nemo_rl.models.automodel.setup import ( + ModelAndOptimizerState, + setup_distributed, + setup_model_and_optimizer, + validate_and_set_config, +) +from nemo_rl.models.automodel.types import RuntimeConfig + + +@pytest.fixture +def mock_config(): + return { + "model_name": "gpt2", + "precision": "bfloat16", + "max_grad_norm": 1.0, + "offload_optimizer_for_logprob": False, + "sequence_packing": {"enabled": False}, + "dtensor_cfg": { + "cpu_offload": False, + "context_parallel_size": 1, + "tensor_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": None, + "sequence_parallel": False, + "use_hf_tp_plan": False, + "activation_checkpointing": False, + }, + "generation": None, + "hf_config_overrides": {}, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": {"lr": 1e-4}, + }, + } + + +@pytest.fixture +def mock_autoconfig(): + config = MagicMock() + config.architectures = ["GPT2LMHeadModel"] + config.model_type = "gpt2" + config.num_labels = 2 + config.torch_dtype = "float32" + return config + + +@pytest.mark.automodel +class TestValidateAndSetConfig: + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_basic_validation( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + mock_resolve_class.return_value = Mock + + result = validate_and_set_config( + config=mock_config, + processor=None, + rank=0, + ) + + # Verify result is a RuntimeConfig dataclass + assert isinstance(result, RuntimeConfig) + assert result.dtype == torch.bfloat16 + assert result.cpu_offload is False + assert result.offload_optimizer_for_logprob is False + assert result.max_grad_norm == 1.0 + assert result.enable_seq_packing is False + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_precision_validation_invalid( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + ): + mock_config["precision"] = "invalid_precision" + + with pytest.raises(ValueError, match="Unknown precision"): + validate_and_set_config( + config=mock_config, + processor=None, + rank=0, + ) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + 
@patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_sequence_packing_with_vlm_raises_error( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + ): + mock_config["sequence_packing"]["enabled"] = True + processor = MagicMock() + + with pytest.raises( + ValueError, match="Sequence packing is not supported for VLM" + ): + validate_and_set_config( + config=mock_config, + processor=processor, + rank=0, + ) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + @patch("nemo_automodel.NeMoAutoModelForSequenceClassification") + def test_reward_model_bradley_terry( + self, + mock_rm_class, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + + mock_config["reward_model_cfg"] = { + "enabled": True, + "reward_model_type": "bradley_terry", + } + + result = validate_and_set_config( + config=mock_config, + processor=None, + rank=0, + ) + + # Verify num_labels was set to 1 for bradley_terry reward model + assert mock_autoconfig.num_labels == 1 + # Result should be valid RuntimeConfig + assert isinstance(result, RuntimeConfig) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_context_parallel_with_sequence_packing_raises_error( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + ): + mock_config["sequence_packing"]["enabled"] = True + mock_config["dtensor_cfg"]["context_parallel_size"] = 2 + + with pytest.raises( + ValueError, match="Context parallel is not supported for sequence packing" + ): + validate_and_set_config( + config=mock_config, + processor=None, + rank=0, + ) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_sequence_parallel_with_large_tp_raises_error( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + ): + mock_config["dtensor_cfg"]["sequence_parallel"] = True + mock_config["dtensor_cfg"]["tensor_parallel_size"] = 2 + + with pytest.raises(RuntimeError, match="Sequence parallel \\+ tp_size >1"): + validate_and_set_config( + config=mock_config, + processor=None, + rank=0, + ) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_attention_implementation_selection( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + 
mock_resolve_class.return_value = Mock + + # Test FA2 for sequence packing with cp=1 + mock_config["sequence_packing"]["enabled"] = True + mock_config["dtensor_cfg"]["context_parallel_size"] = 1 + result = validate_and_set_config(mock_config, None, 0) + assert result.attn_impl == "flash_attention_2" + + # Test SDPA for cp > 1 + mock_config["sequence_packing"]["enabled"] = False + mock_config["dtensor_cfg"]["context_parallel_size"] = 2 + result = validate_and_set_config(mock_config, None, 0) + assert result.attn_impl == "sdpa" + + # Test None for cp=1 without sequence packing + mock_config["dtensor_cfg"]["context_parallel_size"] = 1 + result = validate_and_set_config(mock_config, None, 0) + assert result.attn_impl is None + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_precision_types( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + mock_resolve_class.return_value = Mock + + # Test float32 + mock_config["precision"] = "float32" + result = validate_and_set_config(mock_config, None, 0) + assert result.dtype == torch.float32 + + # Test float16 + mock_config["precision"] = "float16" + result = validate_and_set_config(mock_config, None, 0) + assert result.dtype == torch.float16 + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + @patch("os.environ", {}) + def test_generation_colocated( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + mock_resolve_class.return_value = Mock + + # Test generation colocated = True (should not set NCCL env var) + mock_config["generation"] = {"colocated": {"enabled": True}} + result = validate_and_set_config(mock_config, None, 0) + assert result.is_generation_colocated is True + assert "NCCL_CUMEM_ENABLE" not in os.environ + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_sequence_packing_enabled_print( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + capsys, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + mock_resolve_class.return_value = Mock + + mock_config["sequence_packing"]["enabled"] = True + mock_config["dtensor_cfg"]["context_parallel_size"] = 1 + result = validate_and_set_config(mock_config, None, 0) + + captured = capsys.readouterr() + assert "Sequence packing is enabled" in captured.out + assert "Using FlashAttention2" in captured.out + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + 
@patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_hf_config_overrides_none( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + mock_resolve_class.return_value = Mock + + mock_config["hf_config_overrides"] = None + result = validate_and_set_config(mock_config, None, 0) + assert result.hf_config_overrides == {} + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + @patch("nemo_automodel.NeMoAutoModelForSequenceClassification") + def test_reward_model_with_num_labels_equals_one( + self, + mock_rm_class, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig.num_labels = 1 # Already 1 + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + + mock_config["reward_model_cfg"] = { + "enabled": True, + "reward_model_type": "bradley_terry", + } + + result = validate_and_set_config(mock_config, None, 0) + # num_labels should remain 1 + assert mock_autoconfig.num_labels == 1 + # Result should be valid RuntimeConfig + assert isinstance(result, RuntimeConfig) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_reward_model_with_sequence_packing_error( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + + mock_config["sequence_packing"]["enabled"] = True + mock_config["dtensor_cfg"]["context_parallel_size"] = 1 + mock_config["reward_model_cfg"] = { + "enabled": True, + "reward_model_type": "bradley_terry", + } + + with pytest.raises( + NotImplementedError, + match="Sequence packing is not supported for reward models", + ): + validate_and_set_config(mock_config, None, 0) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def test_reward_model_with_unknown_type( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + + mock_config["reward_model_cfg"] = { + "enabled": True, + "reward_model_type": "unknown_type", + } + + with pytest.raises(ValueError, match="Unknown reward model type"): + validate_and_set_config(mock_config, None, 0) + + @patch("nemo_rl.models.automodel.setup.AutoConfig") + @patch("nemo_rl.models.automodel.setup.resolve_model_class") + @patch("nemo_rl.models.automodel.setup.configure_dynamo_cache") + @patch("nemo_rl.models.automodel.setup.sliding_window_overwrite") + def 
test_sequence_parallel_with_tp_size_one_warning( + self, + mock_sliding_window, + mock_dynamo, + mock_resolve_class, + mock_autoconfig_class, + mock_config, + mock_autoconfig, + capsys, + ): + mock_sliding_window.return_value = {} + mock_autoconfig_class.from_pretrained.return_value = mock_autoconfig + mock_resolve_class.return_value = Mock + + mock_config["dtensor_cfg"]["sequence_parallel"] = True + mock_config["dtensor_cfg"]["tensor_parallel_size"] = 1 + + result = validate_and_set_config(mock_config, None, 0) + + captured = capsys.readouterr() + assert "[WARNING]" in captured.out + assert "sequence_parallel=True, but tp_size=1" in captured.out + + +@pytest.mark.automodel +class TestSetupDistributed: + @patch("nemo_rl.models.automodel.setup.torch.distributed.init_process_group") + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_world_size") + @patch("nemo_rl.models.automodel.setup.FSDP2Manager") + def test_basic_distributed_setup( + self, + mock_manager_class, + mock_world_size, + mock_get_rank, + mock_init_pg, + mock_config, + ): + # Setup mocks + mock_get_rank.return_value = 0 + mock_world_size.return_value = 4 + + # Create mock manager with meshes that have size() methods + mock_manager = MagicMock() + mock_dp_mesh = MagicMock() + mock_dp_mesh.size.return_value = 2 + mock_tp_mesh = MagicMock() + mock_tp_mesh.size.return_value = 1 + mock_cp_mesh = MagicMock() + mock_cp_mesh.size.return_value = 1 + mock_device_mesh = { + "dp_cp": MagicMock(), + "dp": mock_dp_mesh, + "tp": mock_tp_mesh, + "cp": mock_cp_mesh, + } + mock_manager.device_mesh = mock_device_mesh + mock_manager.dp_size = 2 + mock_manager.tp_size = 1 + mock_manager.cp_size = 1 + mock_manager.moe_mesh = None + mock_manager_class.return_value = mock_manager + + # Create runtime config + runtime_config = RuntimeConfig( + model_class=Mock, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.bfloat16, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=False, + offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + result = setup_distributed( + config=mock_config, + runtime_config=runtime_config, + ) + + # Verify result is the FSDP2Manager instance + assert result == mock_manager + assert result.dp_size == 2 + assert result.tp_size == 1 + assert result.cp_size == 1 + assert result.device_mesh == mock_device_mesh + + # Verify init_process_group was called + mock_init_pg.assert_called_once_with(backend="nccl") + + @patch("nemo_rl.models.automodel.setup.torch.distributed.init_process_group") + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_world_size") + @patch("nemo_rl.models.automodel.setup.FSDP2Manager") + def test_cpu_offload_enabled( + self, + mock_manager_class, + mock_world_size, + mock_get_rank, + mock_init_pg, + mock_config, + ): + mock_get_rank.return_value = 0 + mock_world_size.return_value = 2 + + # Create mock manager with meshes that have size() methods + mock_manager = MagicMock() + mock_dp_mesh = MagicMock() + mock_dp_mesh.size.return_value = 2 + mock_tp_mesh = MagicMock() + mock_tp_mesh.size.return_value = 1 + mock_cp_mesh = MagicMock() + mock_cp_mesh.size.return_value = 1 + mock_manager.device_mesh = { + "dp_cp": MagicMock(), + "dp": mock_dp_mesh, + "tp": mock_tp_mesh, + "cp": mock_cp_mesh, + } + mock_manager.dp_size = 2 + mock_manager.tp_size = 1 + 
mock_manager.cp_size = 1 + mock_manager_class.return_value = mock_manager + + runtime_config = RuntimeConfig( + model_class=Mock, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.float32, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=True, # Enable CPU offload + offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + result = setup_distributed(mock_config, runtime_config) + + # Verify FSDP2Manager was called with CPU offload policy + call_kwargs = mock_manager_class.call_args[1] + assert call_kwargs["offload_policy"] is not None + + +@pytest.mark.automodel +class TestSetupModelAndOptimizer: + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("torch.optim.lr_scheduler.LambdaLR") + @patch("nemo_rl.models.automodel.setup.init_empty_weights") + @patch("nemo_rl.models.policy.utils.import_class_from_path") + @patch("nemo_automodel.components.distributed.tensor_utils.get_cpu_state_dict") + def test_basic_model_setup( + self, + mock_get_cpu_state, + mock_import_class, + mock_init_empty, + mock_lambda_lr, + mock_get_rank, + mock_config, + ): + mock_get_rank.return_value = 0 + # Create mocks + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token_id = 50256 + + mock_model_class = MagicMock() + mock_model = MagicMock() + mock_model.state_dict.return_value = {"layer.weight": torch.zeros(10, 10)} + mock_model.config.pad_token_id = None + mock_model.named_parameters.return_value = [] + mock_model_class.from_config.return_value = mock_model + + mock_optimizer_class = MagicMock() + mock_optimizer = MagicMock() + mock_optimizer_class.return_value = mock_optimizer + mock_import_class.return_value = mock_optimizer_class + + mock_scheduler = MagicMock() + mock_lambda_lr.return_value = mock_scheduler + + mock_get_cpu_state.return_value = {"layer.weight": torch.zeros(10, 10)} + + # Create worker instance mock + mock_worker = MagicMock() + mock_worker._ensure_checkpointer = MagicMock() + mock_worker.checkpointer = MagicMock() + mock_worker.checkpointer.config = MagicMock() + mock_worker.checkpointer.load_base_model = MagicMock() + mock_worker.move_to_device = MagicMock(side_effect=lambda m, d: m) + + # Create runtime config + runtime_config = RuntimeConfig( + model_class=mock_model_class, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.bfloat16, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=False, + offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + # Create mock FSDP2Manager + mock_manager = MagicMock(spec=FSDP2Manager) + mock_device_mesh = MagicMock() + mock_device_mesh.mesh_dim_names = ["dp"] + mock_manager.device_mesh = mock_device_mesh + mock_manager.moe_mesh = None + mock_manager.dp_size = 4 + mock_manager.tp_size = 1 + mock_manager.cp_size = 1 + mock_manager.sequence_parallel = False + mock_manager.parallelize = MagicMock(return_value=mock_model) + + result = setup_model_and_optimizer( + config=mock_config, + tokenizer=mock_tokenizer, + runtime_config=runtime_config, + distributed_manager=mock_manager, + worker_instance=mock_worker, + init_optimizer=True, + init_reference_model=True, + ) + + # Verify result + assert isinstance(result, ModelAndOptimizerState) + assert result.model is not None + assert result.optimizer == mock_optimizer + assert result.scheduler == mock_scheduler + assert result.reference_model_state_dict is not None + assert 
len(result.model_state_dict_keys) > 0 + assert isinstance(result.is_hf_model, bool) + assert isinstance(result.is_moe_model, bool) + + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("nemo_rl.models.automodel.setup.init_empty_weights") + @patch("nemo_rl.models.policy.utils.import_class_from_path") + def test_model_setup_without_optimizer( + self, + mock_import_class, + mock_init_empty, + mock_get_rank, + mock_config, + ): + mock_get_rank.return_value = 0 + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token_id = 50256 + + mock_model_class = MagicMock() + mock_model = MagicMock() + mock_model.state_dict.return_value = {"layer.weight": torch.zeros(10, 10)} + mock_model.config.pad_token_id = None + mock_model.named_parameters.return_value = [] + mock_model_class.from_config.return_value = mock_model + + mock_worker = MagicMock() + mock_worker._ensure_checkpointer = MagicMock() + mock_worker.checkpointer = MagicMock() + mock_worker.checkpointer.config = MagicMock() + mock_worker.checkpointer.load_base_model = MagicMock() + mock_worker.move_to_device = MagicMock(side_effect=lambda m, d: m) + + runtime_config = RuntimeConfig( + model_class=mock_model_class, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.bfloat16, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=False, + offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + # Create mock FSDP2Manager + mock_manager = MagicMock(spec=FSDP2Manager) + mock_device_mesh = MagicMock() + mock_device_mesh.mesh_dim_names = ["dp"] + mock_manager.device_mesh = mock_device_mesh + mock_manager.moe_mesh = None + mock_manager.dp_size = 4 + mock_manager.tp_size = 1 + mock_manager.cp_size = 1 + mock_manager.sequence_parallel = False + mock_manager.parallelize = MagicMock(return_value=mock_model) + + result = setup_model_and_optimizer( + config=mock_config, + tokenizer=mock_tokenizer, + runtime_config=runtime_config, + distributed_manager=mock_manager, + worker_instance=mock_worker, + init_optimizer=False, # Don't initialize optimizer + init_reference_model=False, + ) + + # Verify optimizer and reference model are None + assert result.optimizer is None + assert result.scheduler is None + assert result.reference_model_state_dict is None + assert isinstance(result.is_hf_model, bool) + assert isinstance(result.is_moe_model, bool) + + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("nemo_rl.models.automodel.setup.init_empty_weights") + def test_context_parallel_with_gemma3_raises_error( + self, + mock_init_empty, + mock_get_rank, + mock_config, + ): + mock_get_rank.return_value = 0 + # Import the real Gemma3ForCausalLM to use as the class type + from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM + + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token_id = 50256 + + # Create a Gemma3 model mock with the correct class + mock_model = MagicMock() + mock_model.__class__ = Gemma3ForCausalLM + mock_model.state_dict.return_value = {"layer.weight": torch.zeros(10, 10)} + mock_model.config.pad_token_id = None + + mock_model_class = MagicMock() + mock_model_class.from_config.return_value = mock_model + + mock_worker = MagicMock() + + runtime_config = RuntimeConfig( + model_class=mock_model_class, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.bfloat16, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=False, 
+ offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + # Create mock FSDP2Manager with cp_size > 1 to trigger the error + mock_manager = MagicMock(spec=FSDP2Manager) + mock_manager.device_mesh = MagicMock() + mock_manager.moe_mesh = None + mock_manager.dp_size = 4 + mock_manager.tp_size = 1 + mock_manager.cp_size = 2 # cp_size > 1 to trigger the error + mock_manager.sequence_parallel = False + + with pytest.raises( + ValueError, match="Context parallel is not supported for Gemma3ForCausalLM" + ): + result = setup_model_and_optimizer( + config=mock_config, + tokenizer=mock_tokenizer, + runtime_config=runtime_config, + distributed_manager=mock_manager, + worker_instance=mock_worker, + ) + + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("torch.optim.lr_scheduler.SequentialLR") + @patch("nemo_rl.models.automodel.setup.init_empty_weights") + @patch("nemo_rl.models.policy.utils.import_class_from_path") + def test_scheduler_as_list( + self, + mock_import_class, + mock_init_empty, + mock_sequential_lr, + mock_get_rank, + mock_config, + ): + mock_get_rank.return_value = 0 + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token_id = 50256 + + mock_model_class = MagicMock() + mock_model = MagicMock() + mock_model.state_dict.return_value = {"layer.weight": torch.zeros(10, 10)} + mock_model.config.pad_token_id = None + mock_model.named_parameters.return_value = [] + mock_model_class.from_config.return_value = mock_model + + mock_optimizer = MagicMock() + mock_scheduler_class = MagicMock() + mock_scheduler1 = MagicMock() + mock_scheduler2 = MagicMock() + mock_scheduler_class.side_effect = [mock_scheduler1, mock_scheduler2] + + mock_final_scheduler = MagicMock() + mock_sequential_lr.return_value = mock_final_scheduler + + def import_side_effect(path): + if "optimizer" in path.lower() or "AdamW" in path: + return lambda *args, **kwargs: mock_optimizer + else: + return mock_scheduler_class + + mock_import_class.side_effect = import_side_effect + + mock_worker = MagicMock() + mock_worker._ensure_checkpointer = MagicMock() + mock_worker.checkpointer = MagicMock() + mock_worker.checkpointer.config = MagicMock() + mock_worker.checkpointer.load_base_model = MagicMock() + mock_worker.move_to_device = MagicMock(side_effect=lambda m, d: m) + + runtime_config = RuntimeConfig( + model_class=mock_model_class, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.bfloat16, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=False, + offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + # Create mock FSDP2Manager + mock_manager = MagicMock(spec=FSDP2Manager) + mock_device_mesh = MagicMock() + mock_device_mesh.mesh_dim_names = ["dp"] + mock_manager.device_mesh = mock_device_mesh + mock_manager.moe_mesh = None + mock_manager.dp_size = 4 + mock_manager.tp_size = 1 + mock_manager.cp_size = 1 + mock_manager.sequence_parallel = False + mock_manager.parallelize = MagicMock(return_value=mock_model) + + # Configure with list scheduler + mock_config["scheduler"] = [ + {"name": "torch.optim.lr_scheduler.LinearLR", "kwargs": {}}, + { + "name": "torch.optim.lr_scheduler.CosineAnnealingLR", + "kwargs": {"T_max": 100}, + }, + {"milestones": [1000]}, + ] + + result = setup_model_and_optimizer( + config=mock_config, + tokenizer=mock_tokenizer, + runtime_config=runtime_config, + distributed_manager=mock_manager, + worker_instance=mock_worker, + init_optimizer=True, + 
init_reference_model=False, + ) + + assert result.scheduler == mock_final_scheduler + assert isinstance(result.is_hf_model, bool) + assert isinstance(result.is_moe_model, bool) + + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("nemo_rl.models.automodel.setup.init_empty_weights") + @patch("nemo_rl.models.policy.utils.import_class_from_path") + def test_context_parallel_with_tp_and_sp_error( + self, + mock_import_class, + mock_init_empty, + mock_get_rank, + mock_config, + ): + mock_get_rank.return_value = 0 + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token_id = 50256 + + mock_model_class = MagicMock() + mock_model = MagicMock() + mock_model.state_dict.return_value = {"layer.weight": torch.zeros(10, 10)} + mock_model.config.pad_token_id = None + mock_model_class.from_config.return_value = mock_model + + mock_worker = MagicMock() + + runtime_config = RuntimeConfig( + model_class=mock_model_class, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.bfloat16, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=False, + offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + # Create mock FSDP2Manager with tp_size > 1 and cp_size > 1 to trigger the error + mock_manager = MagicMock(spec=FSDP2Manager) + mock_manager.device_mesh = MagicMock() + mock_manager.moe_mesh = None + mock_manager.dp_size = 4 + mock_manager.tp_size = 2 # tp_size > 1 to trigger the error + mock_manager.cp_size = 2 # cp_size > 1 to enter the validation block + mock_manager.sequence_parallel = ( + True # Enable sequence parallel to trigger the error + ) + + with pytest.raises( + ValueError, + match="context parallel can't be used together with sequence parallel", + ): + result = setup_model_and_optimizer( + config=mock_config, + tokenizer=mock_tokenizer, + runtime_config=runtime_config, + distributed_manager=mock_manager, + worker_instance=mock_worker, + ) + + @patch("nemo_rl.models.automodel.setup.torch.distributed.get_rank") + @patch("nemo_rl.models.automodel.setup.init_empty_weights") + def test_context_parallel_with_vlm_error( + self, + mock_init_empty, + mock_get_rank, + mock_config, + ): + mock_get_rank.return_value = 0 + mock_tokenizer = MagicMock() + mock_tokenizer.pad_token_id = 50256 + + mock_model_class = MagicMock() + mock_model = MagicMock() + mock_model.state_dict.return_value = {"layer.weight": torch.zeros(10, 10)} + mock_model.config.pad_token_id = None + mock_model_class.from_config.return_value = mock_model + + mock_worker = MagicMock() + + runtime_config = RuntimeConfig( + model_class=mock_model_class, + model_config=MagicMock(), + hf_config_overrides={}, + allow_flash_attn_args=True, + attn_impl=None, + dtype=torch.bfloat16, + enable_seq_packing=False, + max_grad_norm=1.0, + cpu_offload=False, + offload_optimizer_for_logprob=False, + is_generation_colocated=None, + ) + + # Create mock FSDP2Manager with cp_size > 1 to trigger the VLM validation + mock_manager = MagicMock(spec=FSDP2Manager) + mock_manager.device_mesh = MagicMock() + mock_manager.moe_mesh = None + mock_manager.dp_size = 4 + mock_manager.tp_size = 1 + mock_manager.cp_size = 2 # cp_size > 1 to trigger the VLM validation + mock_manager.sequence_parallel = False + + with pytest.raises( + ValueError, match="Context parallel is yet not supported for VLM models" + ): + result = setup_model_and_optimizer( + config=mock_config, + tokenizer=mock_tokenizer, + runtime_config=runtime_config, + 
distributed_manager=mock_manager, + worker_instance=mock_worker, + is_vlm=True, # VLM model + ) diff --git a/uv.lock b/uv.lock index 03c163b5ec..b99ea655f3 100644 --- a/uv.lock +++ b/uv.lock @@ -350,6 +350,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f6/22/91616fe707a5c5510de2cac9b046a30defe7007ba8a0c04f9c08f27df312/audioop_lts-0.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:b492c3b040153e68b9fdaff5913305aaaba5bb433d8a7f73d5cf6a64ed3cc1dd", size = 25206, upload-time = "2025-08-05T16:43:16.444Z" }, ] +[[package]] +name = "audioread" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/4a/874ecf9b472f998130c2b5e145dcdb9f6131e84786111489103b66772143/audioread-3.1.0.tar.gz", hash = "sha256:1c4ab2f2972764c896a8ac61ac53e261c8d29f0c6ccd652f84e18f08a4cab190", size = 20082, upload-time = "2025-10-26T19:44:13.484Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/16/fbe8e1e185a45042f7cd3a282def5bb8d95bb69ab9e9ef6a5368aa17e426/audioread-3.1.0-py3-none-any.whl", hash = "sha256:b30d1df6c5d3de5dcef0fb0e256f6ea17bdcf5f979408df0297d8a408e2971b4", size = 23143, upload-time = "2025-10-26T19:44:12.016Z" }, +] + [[package]] name = "av" version = "15.0.0" @@ -407,19 +420,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload-time = "2025-04-15T17:05:12.221Z" }, ] -[[package]] -name = "bitsandbytes" -version = "0.45.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, - { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/07/b7/cb5ce4d1a382cf53c19ef06c5fc29e85f5e129b4da6527dd207d90a5b8ad/bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:a5453f30cc6aab6ccaac364e6bf51a7808d3da5f71763dffeb6d9694c59136e4", size = 76059261, upload-time = "2025-04-07T13:32:52.573Z" }, - { url = "https://files.pythonhosted.org/packages/a6/4c/77b535e025ce780d2ada8271c1e481fb7337c1df2588a52fe1c9bd87d2e8/bitsandbytes-0.45.5-py3-none-win_amd64.whl", hash = "sha256:ed1c61b91d989d6a33fd05737d6edbf5086d8ebc89235ee632c7a19144085da2", size = 75430204, upload-time = "2025-04-07T13:32:57.553Z" }, -] - [[package]] name = "blake3" version = "1.0.5" @@ -1147,12 +1147,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/57/ecc9ae29fa5b2d90107cd1d9bf8ed19aacb74b2264d986ae9d44fe9bdf87/debugpy-1.8.16-py2.py3-none-any.whl", hash = "sha256:19c9521962475b87da6f673514f7fd610328757ec993bf7ec0d8c96f9a325f9e", size = 5287700, upload-time = "2025-08-06T18:00:42.333Z" }, ] +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = 
"sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + [[package]] name = "decord" version = "0.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" }, @@ -1217,6 +1226,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/ae/afb1487556e2dc827a17097aac8158a25b433a345386f0e249f6d2694ccb/devtools-0.12.2-py3-none-any.whl", hash = "sha256:c366e3de1df4cdd635f1ad8cbcd3af01a384d7abda71900e68d43b04eb6aaca7", size = 19411, upload-time = "2023-09-03T16:56:59.049Z" }, ] +[[package]] +name = "diffusers" +version = "0.35.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "importlib-metadata" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/68/288ca23c7c05c73e87ffe5efffc282400ac9b017f7a9bb03883f4310ea15/diffusers-0.35.2.tar.gz", hash = "sha256:30ecd552303edfcfe1724573c3918a8462ee3ab4d529bdbd4c0045f763affded", size = 3366711, upload-time = "2025-10-15T04:05:17.213Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/2e/38d9824f8c6bb048c5ba21c6d4da54c29c162a46b58b3ef907a360a76d3e/diffusers-0.35.2-py3-none-any.whl", hash = "sha256:d50d5e74fdd6dcf55e5c1d304bc52cc7c2659abd1752740d736d7b54078b4db5", size = 4121649, upload-time = "2025-10-15T04:05:14.391Z" }, +] + [[package]] name = "dill" version = "0.3.8" @@ -1432,6 +1460,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, ] +[[package]] +name = "fla-core" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "einops" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/db/3d/79a9d5c8cd973c86f35403931031787dfc6cc97d838a42d4c62e8cbbb66f/fla_core-0.4.0.tar.gz", hash = "sha256:d975022b074e97bfd086dc6b767dccb35e27a9fe36f26f3b26b1c2b68b36a1c8", size = 316316, upload-time = "2025-10-27T08:18:51.673Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/0c/d52ab65e9c163631895052d70d4111f8530ca52f45beb0895378d1a2a8b5/fla_core-0.4.0-py3-none-any.whl", hash = 
"sha256:5396f36a9838c99f9e45c70e88e2e0b26688f719d07d2ddd61be16d29327f4ea", size = 438519, upload-time = "2025-10-27T08:18:49.561Z" }, +] + [[package]] name = "flash-attn" version = "2.8.1" @@ -1446,6 +1488,19 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/e8/6d/7066d160bdffa2f9da29a8c3957f266b17a03ca0b3bdc8fdae86d9881fe7/flash_attn-2.8.1.tar.gz", hash = "sha256:0ff003899fcb244f357905b04f622d5c9736887126dd6675f8f4bc52954e3923", size = 8166563, upload-time = "2025-07-10T05:16:39.729Z" } +[[package]] +name = "flash-linear-attention" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fla-core" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/9a/e546815da2bf149e0af58449ff1ca10074165af4384febead438ad46f74c/flash_linear_attention-0.4.0.tar.gz", hash = "sha256:c5d2bf6e1a766af3a4426f07f710b0b87809f7218de21eb313314be6ff1b0dba", size = 157646, upload-time = "2025-10-27T08:18:52.445Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/76/4f716c953608204c970de7cd4045db1af02643d7f19c94a49254834b7563/flash_linear_attention-0.4.0-py3-none-any.whl", hash = "sha256:50c97163f7cb64dc53585194ef36af44d2a6bc545227c4f73bb3ba9062630f1a", size = 290439, upload-time = "2025-10-27T08:18:50.589Z" }, +] + [[package]] name = "flask" version = "3.1.2" @@ -1617,6 +1672,18 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "ftfy" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a5/d3/8650919bc3c7c6e90ee3fa7fd618bf373cbbe55dff043bd67353dbb20cd8/ftfy-6.3.1.tar.gz", hash = "sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec", size = 308927, upload-time = "2024-10-26T00:50:35.149Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, +] + [[package]] name = "gguf" version = "0.17.1" @@ -1935,6 +2002,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "h2" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hpack" }, + { name = "hyperframe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, +] + [[package]] name = "hatchling" version = "1.27.0" @@ -1965,6 +2045,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/d3/0aaf279f4f3dea58e99401b92c31c0f752924ba0e6c7d7bb07b1dbd7f35e/hf_xet-1.1.8-cp37-abi3-win_amd64.whl", hash = 
"sha256:4171f31d87b13da4af1ed86c98cf763292e4720c088b4957cf9d564f92904ca9", size = 2801689, upload-time = "2025-08-18T22:01:04.81Z" }, ] +[[package]] +name = "hpack" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" }, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -2015,6 +2104,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +http2 = [ + { name = "h2" }, +] + [[package]] name = "huggingface-hub" version = "0.34.4" @@ -2048,6 +2142,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl", hash = "sha256:fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b", size = 154547, upload-time = "2023-02-23T18:33:40.801Z" }, ] +[[package]] +name = "hyperframe" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" }, +] + [[package]] name = "identify" version = "2.6.13" @@ -2066,6 +2169,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "imageio-ffmpeg" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/bd/c3343c721f2a1b0c9fc71c1aebf1966a3b7f08c2eea8ed5437a2865611d6/imageio_ffmpeg-0.6.0.tar.gz", hash = "sha256:e2556bed8e005564a9f925bb7afa4002d82770d6b08825078b7697ab88ba1755", size = 25210, upload-time = "2025-01-16T21:34:32.747Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/58/87ef68ac83f4c7690961bce288fd8e382bc5f1513860fc7f90a9c1c1c6bf/imageio_ffmpeg-0.6.0-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = "sha256:9d2baaf867088508d4a3458e61eeb30e945c4ad8016025545f66c4b5aaef0a61", size = 24932969, upload-time = "2025-01-16T21:34:20.464Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/5c/f3d8a657d362cc93b81aab8feda487317da5b5d31c0e1fdfd5e986e55d17/imageio_ffmpeg-0.6.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b1ae3173414b5fc5f538a726c4e48ea97edc0d2cdc11f103afee655c463fa742", size = 21113891, upload-time = "2025-01-16T21:34:00.277Z" }, + { url = "https://files.pythonhosted.org/packages/33/e7/1925bfbc563c39c1d2e82501d8372734a5c725e53ac3b31b4c2d081e895b/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1d47bebd83d2c5fc770720d211855f208af8a596c82d17730aa51e815cdee6dc", size = 25632706, upload-time = "2025-01-16T21:33:53.475Z" }, + { url = "https://files.pythonhosted.org/packages/a0/2d/43c8522a2038e9d0e7dbdf3a61195ecc31ca576fb1527a528c877e87d973/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:c7e46fcec401dd990405049d2e2f475e2b397779df2519b544b8aab515195282", size = 29498237, upload-time = "2025-01-16T21:34:13.726Z" }, + { url = "https://files.pythonhosted.org/packages/a0/13/59da54728351883c3c1d9fca1710ab8eee82c7beba585df8f25ca925f08f/imageio_ffmpeg-0.6.0-py3-none-win32.whl", hash = "sha256:196faa79366b4a82f95c0f4053191d2013f4714a715780f0ad2a68ff37483cc2", size = 19652251, upload-time = "2025-01-16T21:34:06.812Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c6/fa760e12a2483469e2bf5058c5faff664acf66cadb4df2ad6205b016a73d/imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02fa47c83703c37df6bfe4896aab339013f62bf02c5ebf2dce6da56af04ffc0a", size = 31246824, upload-time = "2025-01-16T21:34:28.6Z" }, +] + [[package]] name = "imagesize" version = "1.4.1" @@ -2327,6 +2444,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/60/dfbbf40e3a371388c0e03ff65b01319b7d4023e883df6d7261125772ffdc/latex2sympy2_extended-1.10.2-py3-none-any.whl", hash = "sha256:f910442c5b02a466c1046f47d05cc5285181068b882399281f30102715337fb7", size = 207855, upload-time = "2025-07-02T15:26:04.88Z" }, ] +[[package]] +name = "lazy-loader" +version = "0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6b/c875b30a1ba490860c93da4cabf479e03f584eba06fe5963f6f6644653d8/lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1", size = 15431, upload-time = "2024-04-05T13:03:12.261Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/60/d497a310bde3f01cb805196ac61b7ad6dc5dcf8dce66634dc34364b20b4f/lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc", size = 12097, upload-time = "2024-04-05T13:03:10.514Z" }, +] + +[[package]] +name = "librosa" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioread" }, + { name = "decorator" }, + { name = "joblib" }, + { name = "lazy-loader" }, + { name = "msgpack" }, + { name = "numba" }, + { name = "numpy" }, + { name = "pooch" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "soundfile" }, + { name = "soxr" }, + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/36/360b5aafa0238e29758729e9486c6ed92a6f37fa403b7875e06c115cdf4a/librosa-0.11.0.tar.gz", hash = "sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908", size = 327001, upload-time = 
"2025-03-11T15:09:54.884Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/ba/c63c5786dfee4c3417094c4b00966e61e4a63efecee22cb7b4c0387dda83/librosa-0.11.0-py3-none-any.whl", hash = "sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1", size = 260749, upload-time = "2025-03-11T15:09:52.982Z" }, +] + [[package]] name = "liger-kernel" version = "0.6.2" @@ -3083,10 +3238,14 @@ wheels = [ name = "nemo-automodel" source = { editable = "3rdparty/Automodel-workspace/Automodel" } dependencies = [ - { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "datasets" }, + { name = "diffusers" }, + { name = "ftfy" }, + { name = "imageio-ffmpeg" }, { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "megatron-fsdp" }, + { name = "mlflow" }, + { name = "opencv-python-headless" }, { name = "pybind11" }, { name = "pyyaml" }, { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, @@ -3098,6 +3257,25 @@ dependencies = [ ] [package.optional-dependencies] +all = [ + { name = "backoff" }, + { name = "flash-linear-attention" }, + { name = "mistral-common", extra = ["opencv"] }, + { name = "numba" }, + { name = "numpy" }, + { name = "perceptron" }, + { name = "pillow" }, + { name = "qwen-omni-utils" }, + { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "sentencepiece" }, + { name = "timm" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, +] +extra = [ + { name = "flash-linear-attention" }, + { name = "perceptron" }, + { name = "sentencepiece" }, +] fa = [ { name = "flash-attn" }, ] @@ -3110,10 +3288,10 @@ vlm = [ { name = "numba" }, { name = "numpy" }, { name = "pillow" }, - { name = "qwen-vl-utils", extra = ["decord"] }, + { name = "qwen-omni-utils" }, + { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "timm" }, - { name = "torchcodec" }, - { name = "transformers" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, ] [package.dev-dependencies] @@ -3147,36 +3325,45 @@ test = [ [package.metadata] requires-dist = [ { name = "backoff", marker = "extra == 'vlm'" }, - { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "==0.45.5" }, { name = "datasets", specifier = ">=4.0.0" }, + { name = "diffusers" }, { name = "flash-attn", marker = "extra == 'fa'", specifier = "<=2.8.3" }, + { name = "flash-linear-attention", marker = "extra == 'extra'" }, + { name = "ftfy" }, + { name = "imageio-ffmpeg" }, { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" }, { name = "megatron-fsdp" }, { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" }, + { name = "mlflow" }, + { name = "nemo-automodel", extras = ["extra"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, + { name = "nemo-automodel", extras = ["vlm"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, { name = "numba", marker = "extra == 'vlm'" }, { name = "numpy", marker = "extra == 'vlm'" }, + { name = "opencv-python-headless", specifier = "==4.10.0.84" }, + { name = "perceptron", marker = "extra == 'extra'" 
}, { name = "pillow", marker = "extra == 'vlm'" }, { name = "pybind11" }, { name = "pyyaml" }, - { name = "qwen-vl-utils", extras = ["decord"], marker = "extra == 'vlm'" }, - { name = "timm", marker = "extra == 'vlm'", specifier = "==1.0.16" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.8.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.8.0", index = "https://pypi.org/simple" }, + { name = "qwen-omni-utils", marker = "extra == 'vlm'" }, + { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, + { name = "sentencepiece", marker = "extra == 'extra'" }, + { name = "timm", marker = "extra == 'vlm'", specifier = "<=1.0.22" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, { name = "torchao" }, - { name = "torchcodec", marker = "extra == 'vlm'" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, { name = "torchdata" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'moe'", specifier = "==2.8.0" }, - { name = "transformers", specifier = "<=4.55.4" }, - { name = "transformers", marker = "extra == 'vlm'", specifier = "<=4.55.4" }, + { name = "transformers", specifier = "<=4.57.1" }, { name = "wandb" }, ] -provides-extras = ["vlm", "fa", "moe"] +provides-extras = ["vlm", "fa", "moe", "extra", "all"] [package.metadata.requires-dev] build = [ { name = "setuptools" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.8.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.8.0", index = "https://pypi.org/simple" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, ] dev = [{ name = "cut-cross-entropy", git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab" }] docs = [ @@ -4186,6 +4373,24 @@ requires-dist = [ { name = "yappi" }, ] +[[package]] +name = "perceptron" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, + { name = "httpx", extra = ["http2"] }, + { name = "numpy" }, + { name = "pillow" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/30/60/85db2243d8b550823603d8f9c5845b0dd0f01074e9aabf0b2af0c4f52565/perceptron-0.1.4.tar.gz", hash = "sha256:62fd190efb74925e2cc33c0cd38761e19959be3bdb7b24fbf9e3386d6961f690", size = 78116, upload-time = "2025-11-12T20:00:28.024Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/17/b7cb1a10ebb0a9a4c9fbcd96a28b43d44e08a90f620bab07e644a658d2f1/perceptron-0.1.4-py3-none-any.whl", hash = "sha256:f490a6df6c15167e91e1a528601cae98ce99a30991cf792f9ef83ebc15d335c4", size = 57421, upload-time = "2025-11-12T20:00:26.395Z" }, +] + [[package]] name = "pillow" version = "11.3.0" @@ -4283,6 +4488,20 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "pooch" +version = "1.8.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "platformdirs" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/77/b3d3e00c696c16cf99af81ef7b1f5fe73bd2a307abca41bd7605429fe6e5/pooch-1.8.2.tar.gz", hash = "sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10", size = 59353, upload-time = "2024-06-06T16:53:46.224Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/87/77cc11c7a9ea9fd05503def69e3d18605852cd0d4b0d3b8f15bbeb3ef1d1/pooch-1.8.2-py3-none-any.whl", hash = "sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47", size = 64574, upload-time = "2024-06-06T16:53:44.343Z" }, +] + [[package]] name = "pre-commit" version = "4.3.0" @@ -5079,6 +5298,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/89/32/3836ed85947b06f1d67c07ce16c00b0cf8c053ab0b249d234f9f81ff95ff/pyzmq-27.0.1-cp314-cp314t-win_arm64.whl", hash = "sha256:0fc24bf45e4a454e55ef99d7f5c8b8712539200ce98533af25a5bfa954b6b390", size = 575098, upload-time = "2025-08-03T05:04:27.974Z" }, ] +[[package]] +name = "qwen-omni-utils" +version = "0.0.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "av" }, + { name = "librosa" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b8/b1/cc58b03b5eadddc0812cef884d013ed6cc66b09f9b0f5b45123f89dcd056/qwen_omni_utils-0.0.8.tar.gz", hash = "sha256:b5808673e1455f4115cb784a62cdc8e8616576221a01fc738610b0f9268cb33c", size = 8145, upload-time = "2025-06-12T11:02:05.411Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/b1/dcdd69246a3c3c3bd6f6ced58e2307b3afbd894c4412c29fd49dd897e562/qwen_omni_utils-0.0.8-py3-none-any.whl", hash = "sha256:c42bcc633fbfd84d565ff0de9d45fae68a6b57a9b7b97a4b77eda71a0d3ee73a", size = 9218, upload-time = "2025-06-12T11:02:03.981Z" }, +] + [[package]] name = "qwen-vl-utils" version = "0.0.11" @@ -5096,7 +5331,7 @@ wheels = [ [package.optional-dependencies] decord = [ - { name = "decord" }, + { name = "decord", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, ] [[package]] @@ -6016,6 +6251,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 44415, upload-time = "2024-12-10T12:05:27.824Z" }, ] +[[package]] +name = "standard-aifc" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, + { name = "standard-chunk", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/53/6050dc3dde1671eb3db592c13b55a8005e5040131f7509cef0215212cb84/standard_aifc-3.13.0.tar.gz", hash = "sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43", size = 15240, upload-time = "2024-10-30T16:01:31.772Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c3/52/5fbb203394cc852334d1575cc020f6bcec768d2265355984dfd361968f36/standard_aifc-3.13.0-py3-none-any.whl", hash = "sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66", size = 10492, upload-time = "2024-10-30T16:01:07.071Z" }, +] + +[[package]] +name = "standard-chunk" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/06/ce1bb165c1f111c7d23a1ad17204d67224baa69725bb6857a264db61beaf/standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654", size = 4672, upload-time = "2024-10-30T16:18:28.326Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/90/a5c1084d87767d787a6caba615aa50dc587229646308d9420c960cb5e4c0/standard_chunk-3.13.0-py3-none-any.whl", hash = "sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c", size = 4944, upload-time = "2024-10-30T16:18:26.694Z" }, +] + +[[package]] +name = "standard-sunau" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/e3/ce8d38cb2d70e05ffeddc28bb09bad77cfef979eb0a299c9117f7ed4e6a9/standard_sunau-3.13.0.tar.gz", hash = "sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908", size = 9368, upload-time = "2024-10-30T16:01:41.626Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/34/ae/e3707f6c1bc6f7aa0df600ba8075bfb8a19252140cd595335be60e25f9ee/standard_sunau-3.13.0-py3-none-any.whl", hash = "sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622", size = 7364, upload-time = "2024-10-30T16:01:28.003Z" }, +] + [[package]] name = "starlette" version = "0.47.2" @@ -6422,9 +6691,7 @@ name = "torchcodec" version = "0.6.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/b3/11326a0e7a3c803a95975cfce4ac88fa4ea1a0d432bb876081046c5a5554/torchcodec-0.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fba260145a239b5afe13336e3a5bc1b089c9c31a073e9a7c2026d4cbd853fdd9", size = 3482584, upload-time = "2025-08-07T08:51:32.535Z" }, { url = "https://files.pythonhosted.org/packages/a7/d1/3f90561df013f6a015ef19de22726b64073fee405f53d3c4b8255ab05a67/torchcodec-0.6.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:fdef91a17fb1f1a159ce23710324a9a4e6d6a885275de73700f94a9ad562c6b2", size = 1370954, upload-time = "2025-08-07T08:51:15.021Z" }, - { url = "https://files.pythonhosted.org/packages/87/d0/0b5dd42652e4527d578e1d6239dbb907bf83e502115e517b83a55d8b7f8b/torchcodec-0.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:de20cab5df7fa7cdd74ec1dc0d508324685573f86de6789f0ebb860b7ea20b33", size = 3446017, upload-time = "2025-08-07T08:51:34.484Z" }, { url = "https://files.pythonhosted.org/packages/97/62/a938334e39101d4304619b90847d8aef7d1c607c6bcf33638f72931ae990/torchcodec-0.6.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:46dab701a2d809e975a8b07d7ee47ed34f1d903511e374c74cfc1de6a5ab0e3f", size = 1374794, upload-time = "2025-08-07T08:51:17.355Z" }, ]