From 17395eca58cde9841fe61bb32d1933deb58d11bf Mon Sep 17 00:00:00 2001 From: meichangsu1 <1484603386@qq.com> Date: Thu, 22 Jan 2026 10:17:20 +0800 Subject: [PATCH 1/7] feat: add activation CPU offload callback support for FSDP training - Add ActivationCpuOffloadCallBack import and registration in callbacks mapping - Automatically append activation_cpu_offload callback when FSDP config has activation_cpu_offload enabled - Enables memory-efficient training by offloading activations to CPU during FSDP forward pass --- swift/callbacks/mapping.py | 2 ++ swift/trainers/arguments.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/swift/callbacks/mapping.py b/swift/callbacks/mapping.py index c4684f75c8..6dd83aaaf6 100644 --- a/swift/callbacks/mapping.py +++ b/swift/callbacks/mapping.py @@ -1,10 +1,12 @@ # Copyright (c) ModelScope Contributors. All rights reserved. +from swift.activation_cpu_offload import ActivationCpuOffloadCallBack from .adalora import AdaloraCallback from .early_stop import EarlyStopCallback from .lisa import LISACallback from .perf_log import PerfMetricsLogCallback callbacks_map = { + 'activation_cpu_offload': ActivationCpuOffloadCallBack, 'adalora': AdaloraCallback, 'early_stop': EarlyStopCallback, 'lisa': LISACallback, diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index de4777fb25..80f0dfeb66 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -228,6 +228,9 @@ def _init_callbacks(self): self.callbacks.append('adalora') if self.early_stop_interval is not None and self.early_stop_interval > 0: self.callbacks.append('early_stop') + fsdp_config = getattr(self, 'fsdp_config', {}) + if isinstance(fsdp_config, dict) and fsdp_config.get('activation_cpu_offload', False): + self.callbacks.append('activation_cpu_offload') def __post_init__(self): if hasattr(self, 'output_dir'): From 20680af33bba75b210c447f4b8ab7db6ecd46d23 Mon Sep 17 00:00:00 2001 From: meichangsu1 <1484603386@qq.com> Date: Thu, 22 Jan 
2026 11:28:06 +0800 Subject: [PATCH 2/7] feat: add FSDP2 configuration and training script for activation CPU offload - Add `fsdp2.json` configuration file for PyTorch native FSDP v2 with activation CPU offloading - Include detailed parameter documentation and usage notes for FSDP2 - Provide example training script (`train.sh`) demonstrating multi-GPU training with LoRA - Disable gradient checkpointing in favor of FSDP's native activation checkpointing - Enable CPU RAM efficient loading and sharded state dicts for memory optimization --- .../train/activation_cpu_offload/fsdp2.json | 26 + .../train/activation_cpu_offload/train.sh | 54 ++ swift/callbacks/activation_cpu_offload.py | 611 ++++++++++++++++++ swift/callbacks/mapping.py | 2 +- 4 files changed, 692 insertions(+), 1 deletion(-) create mode 100644 examples/train/activation_cpu_offload/fsdp2.json create mode 100644 examples/train/activation_cpu_offload/train.sh create mode 100644 swift/callbacks/activation_cpu_offload.py diff --git a/examples/train/activation_cpu_offload/fsdp2.json b/examples/train/activation_cpu_offload/fsdp2.json new file mode 100644 index 0000000000..73d856389a --- /dev/null +++ b/examples/train/activation_cpu_offload/fsdp2.json @@ -0,0 +1,26 @@ +{ + "_description": "FSDP2 configuration for distributed training (PyTorch native FSDP v2)", + "_requires": "torch>=2.4.0", + "_note": "This configuration is for multi-GPU training with activation CPU offloading (activation_cpu_offload=true) and without FSDP parameter CPU offloading. NOTE: When using FSDP2, do NOT use --gradient_checkpointing, use activation_checkpointing in fsdp_config instead.", + + "_param_docs": { + "fsdp": "FSDP strategy string. Options: 'full_shard' (ZeRO-3 style, shards params+grads+optimizer), 'shard_grad_op' (ZeRO-2 style, shards grads+optimizer only). Add 'auto_wrap' to enable automatic layer wrapping. Add 'offload' to enable CPU offloading.", + "fsdp_version": "FSDP version. Use 2 for PyTorch native FSDP2 (recommended). 
FSDP2 uses DTensor for per-parameter sharding, supports LoRA/QLoRA natively.", + "auto_wrap_policy": "How to wrap model layers. 'TRANSFORMER_BASED_WRAP' wraps transformer decoder layers (from model._no_split_modules). 'SIZE_BASED_WRAP' wraps modules exceeding min_num_params.", + "cpu_ram_efficient_loading": "If true, only rank 0 loads full model weights, then broadcasts to other ranks. Reduces CPU RAM usage during initialization.", + "state_dict_type": "'SHARDED_STATE_DICT' (recommended): each rank saves its own shard without extra communication. 'FULL_STATE_DICT': gathers full model on rank 0 (higher memory, slower).", + "reshard_after_forward": "true = FULL_SHARD (ZeRO-3), reshards params after forward pass. false = SHARD_GRAD_OP (ZeRO-2), keeps params gathered during forward/backward.", + "activation_checkpointing": "Use FSDP's native activation checkpointing instead of gradient_checkpointing. This is the correct way to save memory with FSDP.", + "activation_cpu_offload": "true = offload activations to CPU. false = keep activations on GPU. Can be enabled together with activation_checkpointing." 
+ }, + "fsdp": "full_shard auto_wrap", + "fsdp_config": { + "fsdp_version": 2, + "reshard_after_forward": true, + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "cpu_ram_efficient_loading": true, + "state_dict_type": "SHARDED_STATE_DICT", + "activation_checkpointing": false, + "activation_cpu_offload": true + } +} diff --git a/examples/train/activation_cpu_offload/train.sh b/examples/train/activation_cpu_offload/train.sh new file mode 100644 index 0000000000..b9b206748c --- /dev/null +++ b/examples/train/activation_cpu_offload/train.sh @@ -0,0 +1,54 @@ +#!/bin/bash +CUDA_VISIBLE_DEVICES=0,1 \ +NPROC_PER_NODE=2 \ +swift sft \ + --model 'Qwen/Qwen3-8B' \ + --train_type lora \ + --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ + --torch_dtype bfloat16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --learning_rate 1e-4 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --gradient_checkpointing false \ + --weight_decay 0.1 \ + --target_modules all-linear \ + --gradient_accumulation_steps 16 \ + --eval_steps 100 \ + --save_steps 5 \ + --save_total_limit 2 \ + --logging_steps 5 \ + --max_length 2048 \ + --output_dir output \ + --system You\ are\ a\ helpful\ assistant. 
\ + --warmup_ratio 0.05 \ + --dataloader_num_workers 4 \ + --fsdp './examples/train/activation_cpu_offload/fsdp2.json' + + +# --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' +# activation_cpu_offload=true + +# {'loss': 2.1327579, 'grad_norm': 1.72890568, 'learning_rate': 8.346e-05, 'token_acc': 0.58396158, 'epoch': 0.32, 'global_step/max_steps': '5/16', 'percentage': '31.25%', 'elapsed_time': '5m 28s', 'remaining_time': '12m 2s', 'memory(GiB)': 24.8, 'train_speed(iter/s)': 0.015218} +# Train: 31%|██████████████████████████████████████▍ | 5/16 [05:28<11:41, 63.77s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v45-20251231-160511/checkpoint-5 +# {'loss': 1.51323957, 'grad_norm': 0.39210615, 'learning_rate': 3.455e-05, 'token_acc': 0.62368014, 'epoch': 0.64, 'global_step/max_steps': '10/16', 'percentage': '62.50%', 'elapsed_time': '10m 22s', 'remaining_time': '6m 13s', 'memory(GiB)': 24.87, 'train_speed(iter/s)': 0.016054} +# Train: 62%|████████████████████████████████████████████████████████████████████████████▎ | 10/16 [10:22<05:37, 56.26s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v45-20251231-160511/checkpoint-10 +# {'loss': 1.36127844, 'grad_norm': 0.30676287, 'learning_rate': 1.09e-06, 'token_acc': 0.64411869, 'epoch': 0.96, 'global_step/max_steps': '15/16', 'percentage': '93.75%', 'elapsed_time': '15m 6s', 'remaining_time': '1m 0s', 'memory(GiB)': 24.87, 'train_speed(iter/s)': 0.016547} +# ... 
+# {'train_runtime': 962.7184, 'train_samples_per_second': 0.519, 'train_steps_per_second': 0.017, 'train_loss': 1.61728384, 'token_acc': 0.62789828, 'epoch': 1.0, 'global_step/max_steps': '16/16', 'percentage': '100.00%', 'elapsed_time': '16m 2s', 'remaining_time': '0s', 'memory(GiB)': 24.87, 'train_speed(iter/s)': 0.016624} +# Train: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [16:02<00:00, 60.16s/it] + + +# activation_cpu_offload=false + +# {'loss': 2.15452981, 'grad_norm': 1.7536869, 'learning_rate': 0.0001, 'token_acc': 0.61792799, 'epoch': 0.06, 'global_step/max_steps': '1/16', 'percentage': '6.25%', 'elapsed_time': '46s', 'remaining_time': '11m 39s', 'memory(GiB)': 26.14, 'train_speed(iter/s)': 0.021458} +# {'loss': 2.13306689, 'grad_norm': 1.7279824, 'learning_rate': 8.346e-05, 'token_acc': 0.58295639, 'epoch': 0.32, 'global_step/max_steps': '5/16', 'percentage': '31.25%', 'elapsed_time': '2m 55s', 'remaining_time': '6m 26s', 'memory(GiB)': 26.59, 'train_speed(iter/s)': 0.028456} +# Train: 31%|██████████████████████████████████████▍ | 5/16 [02:55<05:59, 32.65s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v44-20251231-155036/checkpoint-5 +# {'loss': 1.51308346, 'grad_norm': 0.39151499, 'learning_rate': 3.455e-05, 'token_acc': 0.62377399, 'epoch': 0.64, 'global_step/max_steps': '10/16', 'percentage': '62.50%', 'elapsed_time': '5m 18s', 'remaining_time': '3m 10s', 'memory(GiB)': 27.73, 'train_speed(iter/s)': 0.031432} +# Train: 62%|████████████████████████████████████████████████████████████████████████████▎ | 10/16 [05:18<02:51, 28.58s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v44-20251231-155036/checkpoint-10 +# {'loss': 1.36132231, 'grad_norm': 0.30557585, 'learning_rate': 1.09e-06, 'token_acc': 0.64442776, 'epoch': 0.96, 'global_step/max_steps': '15/16', 'percentage': '93.75%', 'elapsed_time': '7m 57s', 'remaining_time': '31s', 
'memory(GiB)': 27.96, 'train_speed(iter/s)': 0.031437} +# ... +# {'train_runtime': 507.5282, 'train_samples_per_second': 0.985, 'train_steps_per_second': 0.032, 'train_loss': 1.61732693, 'token_acc': 0.63051608, 'epoch': 1.0, 'global_step/max_steps': '16/16', 'percentage': '100.00%', 'elapsed_time': '8m 27s', 'remaining_time': '0s', 'memory(GiB)': 27.96, 'train_speed(iter/s)': 0.031543} +# Train: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [08:27<00:00, 31.70s/it] diff --git a/swift/callbacks/activation_cpu_offload.py b/swift/callbacks/activation_cpu_offload.py new file mode 100644 index 0000000000..e160d71d81 --- /dev/null +++ b/swift/callbacks/activation_cpu_offload.py @@ -0,0 +1,611 @@ +"""Functionality for CPU offloading of tensors saved for backward pass.""" +from __future__ import annotations +import functools +import logging +import os +from contextlib import nullcontext +from typing import Any, Dict, Optional + +import torch +from torch.distributed.fsdp import FSDPModule as FSDP2 +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from transformers import TrainerCallback +from transformers.trainer_callback import TrainerControl, TrainerState +from transformers.training_args import TrainingArguments + +from swift.utils import get_logger + +logger = get_logger() + + +def is_torch_npu_available() -> bool: + """Check the availability of NPU""" + try: + if hasattr(torch, 'npu') and callable(getattr(torch.npu, 'is_available', None)): + return torch.npu.is_available() + return False + except ImportError: + return False + + +is_cuda_available = torch.cuda.is_available() +is_npu_available = is_torch_npu_available() + + +def _get_unique_tensor_key(tensor): + key = (tensor.untyped_storage().data_ptr() + tensor.storage_offset(), tensor.dtype) + return key + + +def get_device_name() -> str: + """Function that gets the torch.device based on the current machine. 
+ This currently only supports CPU, CUDA, NPU. + Returns: + device + """ + if is_cuda_available: + device = 'cuda' + elif is_npu_available: + device = 'npu' + else: + device = 'cpu' + return device + + +class FSDPParameterFilter: + + def __init__(self): + self.model_parameters_storage = set() + + def __call__(self, tensor): + return tensor.untyped_storage().data_ptr() not in self.model_parameters_storage + + def update_model_parameters(self, model): + new_storage = set() + for p in model.parameters(): + new_storage.add(p.data.untyped_storage().data_ptr()) + self.model_parameters_storage = new_storage + + +def get_torch_device() -> any: + """Return the corresponding torch attribute based on the device type string. + Returns: + module: The corresponding torch device namespace, or torch.cuda if not found. + """ + device_name = get_device_name() + try: + return getattr(torch, device_name) + except AttributeError: + logger.warning(f"Device namespace '{device_name}' not found in torch, try to load torch.cuda.") + return torch.cuda + + +class CpuOffloadHookWithOffloadHandler: + """Context-manager that offloads/recovers tensors through an offload hander. + + The hook just offloads/recovers the tensor object to the handler through `tensor_push` + and `tensor_pop` interface. How the offload-handler manages the offloading, recovering + or prefetching timing is transparent to this hook. 
+ """ + + def __init__( + self, + offload_handler: OffloadHandler, + handler_extra_kwargs: Optional[dict[str, Any]] = None, + ) -> None: + if handler_extra_kwargs is None: + handler_extra_kwargs = {} + self.offload_handler: OffloadHandler = offload_handler + self.handler_extra_kwargs: dict[str, Any] = handler_extra_kwargs + self.inside_context = False + + def __enter__(self): + self.inside_context = True + torch._C._autograd._push_saved_tensors_default_hooks(self.on_save_for_backward, self.on_get_saved_tensor) + + def __exit__(self, *args: Any): + self.inside_context = False + torch._C._autograd._pop_saved_tensors_default_hooks() + + def on_save_for_backward(self, tensor: torch.Tensor) -> Any: + retrieve_identifier = self.offload_handler.tensor_push(tensor, **self.handler_extra_kwargs) + return retrieve_identifier + + def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor: + tensor = self.offload_handler.tensor_pop(saved_state, **self.handler_extra_kwargs) + return tensor + + +class OffloadHandler: + """A base class for CPU offload-handler.""" + + def __init__(self) -> None: + pass + + def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any: + """Tensor push.""" + raise NotImplementedError( + '`tensor_push is not implented in OffloadHandler class. Inherit this class and implement your ' + 'custom tensor_push.') + + def tensor_pop(self, tensor_tag: Any, **kwargs): + """Tensor pop.""" + raise NotImplementedError( + '`tensor_pop is not implented in OffloadHandler class. Inherit this class and implement your ' + 'custom tensor_pop.') + + +class GroupCommitFunction(torch.autograd.Function): + """this is a dummy op with output identical to input. + However, it is necessary for marking a timepoint for offload handler to + accomplish all synchronizations. Implementing it as a function is necessary + because we need to actions in both forward and backward. 
+ """ + + @staticmethod + def forward(ctx, tensor, cpu_offload_handler): + # pylint: disable=missing-function-docstring + cpu_offload_handler.on_group_commit_forward() + ctx.cpu_offload_handler = cpu_offload_handler + # return the identical tensor + return tensor + + @staticmethod + def backward(ctx, grad_output): + # pylint: disable=missing-function-docstring + cpu_offload_handler = ctx.cpu_offload_handler + cpu_offload_handler.on_group_commit_backward() + return grad_output, None + + +group_prefetch_offload_commit = GroupCommitFunction.apply + + +class SynchronizedGroupOffloadHandler(OffloadHandler): + """Offload Handler that offloads/reloads in a synchronized way. + The device-to-host and host-to-device copying happen in the same stream + as the computation kernels, thus the copying will block computation. + """ + + def __init__(self, num_offload_group, tensor_need_offloading_checker=(lambda _: True)) -> None: + super().__init__() + + self.num_offload_group = num_offload_group + self.tensor_need_offloading_checker = tensor_need_offloading_checker + + self.groupid_reset() + + def groupid_reset(self): + """Groupid reset.""" + # Data structures to label saved tensors and book-keep their cpu copies. + # Currently, on push, create a new cpu tensor and copies; on pop, copies + # the tensor back to gpu and deletes the cpu tensor. 
+ # These will increment whenever `group_commit()` is invoked + self.current_group, self.tensor_count_current_group = (0, 0) + self.torch_tensor_count = 0 + self.tensor_tag_to_state = {} + + def on_group_commit_forward(self): + """On group commit forward.""" + # finishing up with updating current group and tensor count + self.current_group += 1 # increment + self.tensor_count_current_group = 0 # reset + + def on_group_commit_backward(self): + """On group commit backward.""" + self.current_group -= 1 + assert self.current_group >= 0 + + @staticmethod + def offload(src_tensor, pin_memory=True): + """Offload.""" + + cpu_backup = torch.empty( + src_tensor.size(), + dtype=src_tensor.dtype, + layout=src_tensor.layout, + device='cpu', + pin_memory=pin_memory, + ) + cpu_backup.copy_(src_tensor, non_blocking=True) + state = (src_tensor.device, cpu_backup) + return state + + @staticmethod + def reload(state, non_blocking=None): + """Reload.""" + dev, cpu_backup = state + if non_blocking is None: + non_blocking = cpu_backup.is_pinned() + return cpu_backup.to(dev, non_blocking=non_blocking) + + def tensor_push(self, tensor: torch.Tensor, **kwargs): + """Tensor push.""" + # obtain a unique tensor tag + tensor_tag = (self.current_group, self.tensor_count_current_group) + self.tensor_count_current_group += 1 + assert tensor_tag not in self.tensor_tag_to_state + if self.current_group < self.num_offload_group and self.tensor_need_offloading_checker(tensor): + state = SynchronizedGroupOffloadHandler.offload(tensor) + self.tensor_tag_to_state[tensor_tag] = state + else: + # will be offloaded together after group commit + self.tensor_tag_to_state[tensor_tag] = tensor + + return tensor_tag + + def tensor_pop(self, tensor_tag, **kwargs): + """Tensor pop.""" + assert tensor_tag in self.tensor_tag_to_state + state = self.tensor_tag_to_state.pop(tensor_tag) + if isinstance(state, tuple): + tensor = SynchronizedGroupOffloadHandler.reload(state) + else: + tensor = state + return tensor + + 
+class AsyncDoubleBufferGroupOffloadHandler(SynchronizedGroupOffloadHandler): + """Compared to synchronize, this uses more memory because of the buffer but + achieves better performance due to the overlapping. D2h and h2d copying are + completely hidden behind computation if computation time of a layer is longer + than host-device communication time. Bulk offloading with delay and bulk reloading + with prefetch are implemented.""" + + def __init__( + self, + num_offload_group, # must be <= actual number of groups (number of commits) + num_model_group, + tensor_need_offloading_checker=(lambda t: True), + ) -> None: + super().__init__( + num_offload_group=num_offload_group, + tensor_need_offloading_checker=tensor_need_offloading_checker, + ) + # Number of layers in the model + self.num_layers = num_model_group + # Data Structure to maintain reference to activation tensors + self.tensor_tag_to_buf = {} + # Tracking the number of layers offloaded + self.offloaded_group_count = 0 + # Core data structure that decides the window for offloading + self.layer_window_map = {} + self.group_offload_mapping = {} + + # Logic to make offloading load balance across computation + # for optimal CPU/GPU interconnect usage + constant = 0 + for i in range(self.num_offload_group): + self.layer_window_map[i] = ((self.num_layers // self.num_offload_group) * (i + 1)) - 1 + if i < (self.num_layers % self.num_offload_group): + self.layer_window_map[i] += i + 1 + constant = i + 1 + else: + self.layer_window_map[i] += constant + + # allocate streams and events for synchronization + self.d2h_stream = get_torch_device().Stream() + self.h2d_stream = get_torch_device().Stream() + + def tensor_push(self, tensor: torch.Tensor, **kwargs) -> Any: + torch_stray_tensor = isinstance( + tensor, + torch._subclasses.fake_tensor.FakeTensor | torch._subclasses.functional_tensor.FunctionalTensor, + ) + need_offload = not torch_stray_tensor + need_offload = need_offload and 
self.tensor_need_offloading_checker(tensor) + + if need_offload: + # obtain a unique tensor tag + tensor_tag = (self.current_group, self.tensor_count_current_group) + self.tensor_count_current_group += 1 + + assert tensor_tag not in self.tensor_tag_to_state + self.tensor_tag_to_state[tensor_tag] = tensor + + if self.current_group < self.num_offload_group: + self.tensor_tag_to_buf[tensor_tag] = tensor + else: + tensor_tag = tensor + return tensor_tag + + def tensor_pop(self, tensor_tag, **kwargs): + """Tensor pop.""" + if isinstance(tensor_tag, torch.Tensor): + return tensor_tag + assert tensor_tag in self.tensor_tag_to_state + tensor = self.tensor_tag_to_state.pop(tensor_tag) + self.tensor_tag_to_buf.pop(tensor_tag, None) + + # the tensor should have been copied back in on_group_commit_backward() + # which invokes bulk_reload_group. + assert not isinstance(tensor, tuple) + return tensor + + def bulk_offload_group(self, group_to_offload): + """Bulk offload group.""" + offload_mapping = {} + offload_size = 0 + with get_torch_device().stream(self.d2h_stream): + for tensor_tag, state in self.tensor_tag_to_state.items(): + group_id, _ = tensor_tag + if group_id == group_to_offload: + assert not isinstance(state, tuple) + key = _get_unique_tensor_key(state) + if key not in offload_mapping: + offload_mapping[key] = state + # if offload, return the reference to cpu copy + self.tensor_tag_to_state[tensor_tag] = (key, state.shape) + for key, tensor in offload_mapping.items(): + state = SynchronizedGroupOffloadHandler.offload(tensor) + offload_size += tensor.numel() * tensor.element_size() + offload_mapping[key] = state + + self.group_offload_mapping[group_to_offload] = offload_mapping + + def synchronize_on_group_commit_forward(self, current_group): + """Synchronize on group commit forward.""" + + # For the first group, kickstart the offload after we have + # the first compute completion + if current_group == 0: + 
self.d2h_stream.wait_stream(get_torch_device().current_stream()) + self.bulk_offload_group(current_group) + + # Window map data structure helps us synchronize based on number + # of layers offloaded + if self.layer_window_map[self.offloaded_group_count] == current_group: + # Stream synchronization both ways + self.d2h_stream.wait_stream(get_torch_device().current_stream()) + get_torch_device().current_stream().wait_stream(self.d2h_stream) + + # Time to free the activation memory after usage + for tensor_tag, _ in self.tensor_tag_to_buf.items(): + if tensor_tag[0] == self.offloaded_group_count: + self.tensor_tag_to_buf[tensor_tag] = None + + # Time to offload the next group + if self.offloaded_group_count < (self.num_offload_group - 1): + self.bulk_offload_group(self.offloaded_group_count + 1) + + # Increment the offload group count to keep track + self.offloaded_group_count += 1 + + def on_group_commit_forward(self): + """This function will cause host device synchronization""" + # handle synchronization events + self.synchronize_on_group_commit_forward(self.current_group) + + super().on_group_commit_forward() + + @torch.no_grad + def bulk_reload_group(self, group_to_reload): + """Bulk reload group.""" + assert group_to_reload < self.num_offload_group + + with get_torch_device().stream(self.h2d_stream): + # move back tensors + offload_mapping = self.group_offload_mapping.pop(group_to_reload) + assert offload_mapping is not None + for key, state in offload_mapping.items(): + offload_mapping[key] = SynchronizedGroupOffloadHandler.reload(state) + for tensor_label, state in self.tensor_tag_to_state.items(): + group_id, _ = tensor_label + if group_id == group_to_reload and not isinstance(state, torch.Tensor): + assert isinstance(state, tuple), f'{group_id} {state}' + key, shape = state + recovered_tensor = offload_mapping[key].view(shape) + self.tensor_tag_to_state[tensor_label] = recovered_tensor + + def on_group_commit_backward(self): + # first decrement the current 
group. + # after last commit in forward, the group will +1; in backward it -1. + # Finally it should be decremented to 0. + self.current_group -= 1 + assert self.current_group >= 0 + + # Layer window data structure helps us to reload at right times + if self.layer_window_map[self.offloaded_group_count - 1] == self.current_group: + # Stream synchronization both ways + self.h2d_stream.wait_stream(get_torch_device().current_stream()) + get_torch_device().current_stream().wait_stream(self.h2d_stream) + + # Time to reload the next group + self.bulk_reload_group(self.offloaded_group_count - 1) + + # Decrease the offloading group counter + self.offloaded_group_count -= 1 if self.offloaded_group_count > 1 else 0 + + # Last group computation needs to wait till all the reloads complete + if self.current_group == 0: + get_torch_device().current_stream().wait_stream(self.h2d_stream) + self.offloaded_group_count = 0 + + +def get_activation_offload_context(num_layers: int = 1, + model_layers: int = 1, + tensor_need_offloading_checker=(lambda t: True)): + cpu_offload_handler = AsyncDoubleBufferGroupOffloadHandler( + num_offload_group=num_layers, + num_model_group=model_layers, + tensor_need_offloading_checker=tensor_need_offloading_checker, + ) + + def group_prefetch_offload_commit_async(tensor): + return group_prefetch_offload_commit(tensor, cpu_offload_handler) + + return ( + CpuOffloadHookWithOffloadHandler(offload_handler=cpu_offload_handler), + group_prefetch_offload_commit_async, + ) + + +class ActivationHandler: + + def __init__(self, offload_ctx, sync_func, tensor_filter, enable_ckpt): + self._offload_ctx = offload_ctx + self._sync_func = sync_func + self._enable_ckpt = enable_ckpt + self._tensor_filter = tensor_filter + if enable_ckpt: + self.checkpoint_fn = functools.partial( + torch.utils.checkpoint.checkpoint, + use_reentrant=True, + ) + + def pre_forward(self, module): + if module.training: + self._offload_ctx.__enter__() + 
self._tensor_filter.update_model_parameters(module) + + def post_forward(self, module): + if module.training: + self._offload_ctx.__exit__(None, None, None) + + def _pack_kwargs(self, *args, **kwargs): + kwarg_keys = [] + flat_args = list(args) + for k, v in kwargs.items(): + kwarg_keys.append(k) + flat_args.append(v) + + return tuple(flat_args), tuple(kwarg_keys) + + def _unpack_kwargs(self, flat_args, kwarg_keys): + assert len(kwarg_keys) <= len(flat_args), f'too many keys {len(kwarg_keys)} vs. {len(flat_args)}' + if len(kwarg_keys) == 0: + return flat_args, {} + args = flat_args[:-len(kwarg_keys)] + kwargs = dict(zip(kwarg_keys, flat_args[-len(kwarg_keys):], strict=True)) + return args, kwargs + + def _ckpt_forward(self, forward_method, *args, **kwargs): + flat_args, kwarg_keys = self._pack_kwargs(*args, **kwargs) + + def my_function(*inputs): + # unpack back into args and kwargs + nonlocal forward_method, kwarg_keys + unpacked_args, unpacked_kwargs = self._unpack_kwargs(inputs, kwarg_keys) + # run original module + return forward_method(*unpacked_args, **unpacked_kwargs) + + return self.checkpoint_fn( + my_function, + *flat_args, + ) + + def forward(self, module, forward_method, *args, **kwargs): + if not module.training: + return forward_method(*args, **kwargs) + if not self._enable_ckpt: + ret = forward_method(*args, **kwargs) + else: + ret = self._ckpt_forward(forward_method, *args, **kwargs) + binded_tensor = ret + if isinstance(ret, tuple): + binded_tensor = ret[0] + binded_tensor = self._sync_func(binded_tensor) + final_ret = binded_tensor + if isinstance(ret, tuple): + final_ret = (final_ret, ) + ret[1:] + return final_ret + + def wrap_module_forward_method(self, module): + orig_method = module.forward + handler = self + + @functools.wraps(orig_method) + def wrapped_method(model_self, *args, **kwargs): + nonlocal handler + handler.pre_forward(model_self) + out = handler.forward(model_self, orig_method, *args, **kwargs) + handler.post_forward(model_self) 
+ return out + + module.forward = wrapped_method.__get__(module, type(module)) + + +def enable_activation_offloading(model, strategy, enable_ckpt=False): + """ + Enable activation offloading for the model. It groups activations by TransformerLayer and offloads activation + groups asynchronously. This means that the offloading of the i-th activation group and the computation of the i+1-th + activation group happen at the same time, and there are at most two activation groups in GPU memory. + + Args: + model: the model to enable activation offloading + strategy: the training strategy of the model, such as "fsdp" + enable_ckpt: whether activation checkpointing(also called gradient checkpointing) has been enabled for the model + + Note: + For best efficiency, activation offloading is usually combined with activation checkpointing. However, this + implementation of activation offloading is conflicted with the implementation of activation checkpointing in + some training strategies. This function resolves this conflict, and therefore requires the "strategy" and + "enable_ckpt" arguments. + + Returns: + + """ + + assert strategy == 'fsdp' or strategy == 'fsdp2', 'activation offloading only supports fsdp strategy' + layers = [] + + def get_layers(module): + for name, child in module.named_children(): + if not isinstance(child, FSDP | FSDP2): + get_layers(child) + else: + wrapped_module = child + if isinstance(child, FSDP): + wrapped_module = child._fsdp_wrapped_module + # In some cases, torch.nn.Embedding is wrapped with FSDP alone. However, the activation + # size of torch.nn.Embedding is small, so it's not necessary to offload it. 
+ if not isinstance(wrapped_module, torch.nn.Embedding): + layers.append(child) + + get_layers(model) + if len(layers) < 3: + logger.warning(f'Find only {len(layers)} fsdp layers, not neccessary to enable async activation offloading') + return + + tensor_filter = FSDPParameterFilter() + context, sync_func = get_activation_offload_context(len(layers) - 1, len(layers), tensor_filter) + if enable_ckpt: + # The implementation of activation checkpointing in transformers library is incompatible with + # activation offloading, + # so it will be disabled, but this implementation supports another version of activation checkpointing, so that + # these two features can be enabled at the same time. + for module in model.modules(): + if hasattr(module, 'gradient_checkpointing_disable'): + module.gradient_checkpointing_disable() + + handler = ActivationHandler(context, sync_func, tensor_filter, enable_ckpt) + for layer in layers: + module = layer + if isinstance(layer, FSDP): + module = module._fsdp_wrapped_module + handler.wrap_module_forward_method(module) + + +class ActivationCpuOffloadCallBack(TrainerCallback): + + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of training. 
+ """ + model = kwargs['model'] + + # Check if model is wrapped with FSDP + if isinstance(model, FSDP) or isinstance(model, FSDP2): + if args is not None and hasattr(args, 'fsdp_config'): + fsdp_config = args.fsdp_config + # Check if fsdp_config is a dictionary and has activation_cpu_offload enabled + if isinstance(fsdp_config, dict) and fsdp_config.get('activation_cpu_offload', False): + # Get FSDP version from fsdp_config + strategy = fsdp_config.get('version', None) + if strategy is not None: + fsdp_version = 'fsdp' if strategy == 1 else 'fsdp2' + # Get activation checkpointing setting from fsdp_config + enable_ckpt = fsdp_config.get('activation_checkpointing', False) + if enable_ckpt and hasattr(model, 'enable_input_require_grads'): + model.enable_input_require_grads() + enable_activation_offloading(model, strategy=fsdp_version, enable_ckpt=enable_ckpt) diff --git a/swift/callbacks/mapping.py b/swift/callbacks/mapping.py index 6dd83aaaf6..fe9c9b2a16 100644 --- a/swift/callbacks/mapping.py +++ b/swift/callbacks/mapping.py @@ -1,5 +1,5 @@ # Copyright (c) ModelScope Contributors. All rights reserved. 
-from swift.activation_cpu_offload import ActivationCpuOffloadCallBack +from .activation_cpu_offload import ActivationCpuOffloadCallBack from .adalora import AdaloraCallback from .early_stop import EarlyStopCallback from .lisa import LISACallback From 80e191575c10035174dc8d67aea14301f379aaa5 Mon Sep 17 00:00:00 2001 From: meichangsu1 <1484603386@qq.com> Date: Thu, 22 Jan 2026 12:54:46 +0800 Subject: [PATCH 3/7] feat(callbacks): add constructor to ActivationCpuOffloadCallBack - Add __init__ method to ActivationCpuOffloadCallBack to properly initialize parent class - Update import to use local base TrainerCallback instead of transformers version - Ensure callback follows consistent initialization pattern with other callbacks --- swift/callbacks/activation_cpu_offload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/swift/callbacks/activation_cpu_offload.py b/swift/callbacks/activation_cpu_offload.py index e160d71d81..fab3b699a5 100644 --- a/swift/callbacks/activation_cpu_offload.py +++ b/swift/callbacks/activation_cpu_offload.py @@ -9,11 +9,11 @@ import torch from torch.distributed.fsdp import FSDPModule as FSDP2 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from transformers import TrainerCallback from transformers.trainer_callback import TrainerControl, TrainerState from transformers.training_args import TrainingArguments from swift.utils import get_logger +from .base import TrainerCallback logger = get_logger() @@ -588,6 +588,9 @@ def get_layers(module): class ActivationCpuOffloadCallBack(TrainerCallback): + def __init__(self, args: TrainingArguments, trainer): + super().__init__(args, trainer) + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): """ Event called at the beginning of training. 
From 0cd6ffdb6e5bb25b196cc7609575bea308e3981e Mon Sep 17 00:00:00 2001 From: meichangsu1 <1484603386@qq.com> Date: Thu, 22 Jan 2026 18:34:35 +0800 Subject: [PATCH 4/7] feat: enable activation checkpointing in FSDP2 config - Remove activation_cpu_offload parameter from fsdp2.json - Set activation_checkpointing to true for improved memory efficiency - Maintain existing auto_wrap_policy and state_dict_type settings --- examples/train/activation_cpu_offload/fsdp2.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/train/activation_cpu_offload/fsdp2.json b/examples/train/activation_cpu_offload/fsdp2.json index 73d856389a..9d9b5867ad 100644 --- a/examples/train/activation_cpu_offload/fsdp2.json +++ b/examples/train/activation_cpu_offload/fsdp2.json @@ -20,7 +20,6 @@ "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "cpu_ram_efficient_loading": true, "state_dict_type": "SHARDED_STATE_DICT", - "activation_checkpointing": false, - "activation_cpu_offload": true + "activation_checkpointing": true } } From 18ab86eddfefd0cbc4e722c866fb77ae238ea27d Mon Sep 17 00:00:00 2001 From: meichangsu1 <1484603386@qq.com> Date: Thu, 22 Jan 2026 18:59:17 +0800 Subject: [PATCH 5/7] Updated fsdp2.json configuration for activation offload example --- examples/train/activation_cpu_offload/fsdp2.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/train/activation_cpu_offload/fsdp2.json b/examples/train/activation_cpu_offload/fsdp2.json index 9d9b5867ad..73d856389a 100644 --- a/examples/train/activation_cpu_offload/fsdp2.json +++ b/examples/train/activation_cpu_offload/fsdp2.json @@ -20,6 +20,7 @@ "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "cpu_ram_efficient_loading": true, "state_dict_type": "SHARDED_STATE_DICT", - "activation_checkpointing": true + "activation_checkpointing": false, + "activation_cpu_offload": true } } From 723314fa47e9c2aa3780832b0b22ec79b7594ab6 Mon Sep 17 00:00:00 2001 From: meichangsu1 <1484603386@qq.com> Date: Mon, 
2 Feb 2026 21:43:57 +0800 Subject: [PATCH 6/7] feat(ascend): add training script for activation CPU offload - Add new training script `train.sh` for Ascend platform with activation CPU offload configuration - Include comprehensive training parameters for Qwen3-8B model with LoRA fine-tuning - Provide example training outputs for both activation_cpu_offload=true and false scenarios - Move existing fsdp2.json configuration file to Ascend examples directory --- .../activation_cpu_offload/fsdp2.json | 0 .../ascend/activation_cpu_offload/train.sh | 57 +++++++++++++++++++ .../train/activation_cpu_offload/train.sh | 54 ------------------ swift/callbacks/activation_cpu_offload.py | 2 +- 4 files changed, 58 insertions(+), 55 deletions(-) rename examples/{train => ascend}/activation_cpu_offload/fsdp2.json (100%) create mode 100644 examples/ascend/activation_cpu_offload/train.sh delete mode 100644 examples/train/activation_cpu_offload/train.sh diff --git a/examples/train/activation_cpu_offload/fsdp2.json b/examples/ascend/activation_cpu_offload/fsdp2.json similarity index 100% rename from examples/train/activation_cpu_offload/fsdp2.json rename to examples/ascend/activation_cpu_offload/fsdp2.json diff --git a/examples/ascend/activation_cpu_offload/train.sh b/examples/ascend/activation_cpu_offload/train.sh new file mode 100644 index 0000000000..185487a845 --- /dev/null +++ b/examples/ascend/activation_cpu_offload/train.sh @@ -0,0 +1,57 @@ +#!/bin/bash +ASCEND_RT_VISIBLE_DEVICES=0,1 \ +NPROC_PER_NODE=2 \ +swift sft \ + --model 'Qwen/Qwen3-8B' \ + --train_type lora \ + --dataset 'AI-ModelScope/LongAlpaca-12k#2000' \ + --torch_dtype bfloat16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --learning_rate 1e-4 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --gradient_checkpointing false \ + --weight_decay 0.1 \ + --target_modules all-linear \ + --gradient_accumulation_steps 16 \ + --eval_steps 100 \ + --save_steps 5 \ + 
--save_total_limit 2 \ + --logging_steps 5 \ + --max_length 4096 \ + --output_dir output \ + --system You\ are\ a\ helpful\ assistant. \ + --warmup_ratio 0.05 \ + --dataloader_num_workers 4 \ + --fsdp './examples/ascend/activation_cpu_offload/fsdp2.json' + + +# activation_cpu_offload=false + +# {'loss': 2.93329144, 'grad_norm': 2.44835496, 'learning_rate': 0.0001, 'token_acc': 0.56405613, 'epoch': 0.06, 'global_step/max_steps': '1/16', 'percentage': '6.25%', 'elapsed_time': '8s', 'remaining_time': '2m 6s', 'memory(GiB)': 24.8, 'train_speed(iter/s)': 0.118837} +# {'loss': 2.93490505, 'grad_norm': 2.63550186, 'learning_rate': 8.346e-05, 'token_acc': 0.58979954, 'epoch': 0.32, 'global_step/max_steps': '5/16', 'percentage': '31.25%', 'elapsed_time': '28s', 'remaining_time': '1m 2s', 'memory(GiB)': 57.91, 'train_speed(iter/s)': 0.175644} +# Train: 31%|███████████████████████████████████ | 5/16 [00:28<00:57, 5.22s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v60-20260202-130514/checkpoint-5 +# {'loss': 1.61339226, 'grad_norm': 1.05343676, 'learning_rate': 3.455e-05, 'token_acc': 0.63342983, 'epoch': 0.64, 'global_step/max_steps': '10/16', 'percentage': '62.50%', 'elapsed_time': '51s', 'remaining_time': '31s', 'memory(GiB)': 58.02, 'train_speed(iter/s)': 0.192856} +# Train: 62%|█████████████████████████████████████████████████████████████████████▍ | 10/16 [00:51<00:27, 4.66s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v60-20260202-130514/checkpoint-10 +# {'loss': 1.32472887, 'grad_norm': 0.60581738, 'learning_rate': 1.09e-06, 'token_acc': 0.64779323, 'epoch': 0.96, 'global_step/max_steps': '15/16', 'percentage': '93.75%', 'elapsed_time': '1m 13s', 'remaining_time': '4s', 'memory(GiB)': 58.02, 'train_speed(iter/s)': 0.204973} +# Train: 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████ | 15/16 [01:13<00:04, 4.12s/it][INFO:swift] Saving model 
checkpoint to /model/ljl/project/ms-swift/output/v60-20260202-130514/checkpoint-15 +# Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:17<00:00, 4.25s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v60-20260202-130514/checkpoint-16 +# {'train_runtime': 79.7064, 'train_samples_per_second': 6.311, 'train_steps_per_second': 0.201, 'train_loss': 1.91648413, 'token_acc': 0.68027888, 'epoch': 1.0, 'global_step/max_steps': '16/16', 'percentage': '100.00%', 'elapsed_time': '1m 19s', 'remaining_time': '0s', 'memory(GiB)': 58.02, 'train_speed(iter/s)': 0.200728} +# Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:19<00:00, 4.98s/it] + + + +# "activation_cpu_offload": true + +# {'loss': 2.93329144, 'grad_norm': 2.44853568, 'learning_rate': 0.0001, 'token_acc': 0.56405613, 'epoch': 0.06, 'global_step/max_steps': '1/16', 'percentage': '6.25%', 'elapsed_time': '26s', 'remaining_time': '6m 43s', 'memory(GiB)': 24.62, 'train_speed(iter/s)': 0.037168} +# {'loss': 2.93512678, 'grad_norm': 2.6212213, 'learning_rate': 8.346e-05, 'token_acc': 0.5895268, 'epoch': 0.32, 'global_step/max_steps': '5/16', 'percentage': '31.25%', 'elapsed_time': '1m 21s', 'remaining_time': '2m 58s', 'memory(GiB)': 26.93, 'train_speed(iter/s)': 0.061631} +# Train: 31%|███████████████████████████████████ | 5/16 [01:21<02:30, 13.67s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v59-20260202-125158/checkpoint-5 +# {'loss': 1.61200867, 'grad_norm': 1.05091298, 'learning_rate': 3.455e-05, 'token_acc': 0.63310818, 'epoch': 0.64, 'global_step/max_steps': '10/16', 'percentage': '62.50%', 'elapsed_time': '2m 20s', 'remaining_time': '1m 24s', 'memory(GiB)': 26.93, 'train_speed(iter/s)': 0.0712} +# Train: 62%|█████████████████████████████████████████████████████████████████████▍ | 10/16 
[02:20<01:11, 11.97s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v59-20260202-125158/checkpoint-10 +# {'loss': 1.32489185, 'grad_norm': 0.60476321, 'learning_rate': 1.09e-06, 'token_acc': 0.64746468, 'epoch': 0.96, 'global_step/max_steps': '15/16', 'percentage': '93.75%', 'elapsed_time': '3m 11s', 'remaining_time': '12s', 'memory(GiB)': 26.94, 'train_speed(iter/s)': 0.078265} +# Train: 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████ | 15/16 [03:11<00:10, 10.03s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v59-20260202-125158/checkpoint-15 +# Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [03:20<00:00, 9.65s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v59-20260202-125158/checkpoint-16 +# {'train_runtime': 202.2537, 'train_samples_per_second': 2.487, 'train_steps_per_second': 0.079, 'train_loss': 1.91632293, 'token_acc': 0.67729084, 'epoch': 1.0, 'global_step/max_steps': '16/16', 'percentage': '100.00%', 'elapsed_time': '3m 22s', 'remaining_time': '0s', 'memory(GiB)': 26.94, 'train_speed(iter/s)': 0.078996} +# Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [03:22<00:00, 12.66s/it] \ No newline at end of file diff --git a/examples/train/activation_cpu_offload/train.sh b/examples/train/activation_cpu_offload/train.sh deleted file mode 100644 index b9b206748c..0000000000 --- a/examples/train/activation_cpu_offload/train.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -CUDA_VISIBLE_DEVICES=0,1 \ -NPROC_PER_NODE=2 \ -swift sft \ - --model 'Qwen/Qwen3-8B' \ - --train_type lora \ - --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \ - --torch_dtype bfloat16 \ - --num_train_epochs 1 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 
\ - --learning_rate 1e-4 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --gradient_checkpointing false \ - --weight_decay 0.1 \ - --target_modules all-linear \ - --gradient_accumulation_steps 16 \ - --eval_steps 100 \ - --save_steps 5 \ - --save_total_limit 2 \ - --logging_steps 5 \ - --max_length 2048 \ - --output_dir output \ - --system You\ are\ a\ helpful\ assistant. \ - --warmup_ratio 0.05 \ - --dataloader_num_workers 4 \ - --fsdp './examples/train/activation_cpu_offload/fsdp2.json' - - -# --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' -# activation_cpu_offload=true - -# {'loss': 2.1327579, 'grad_norm': 1.72890568, 'learning_rate': 8.346e-05, 'token_acc': 0.58396158, 'epoch': 0.32, 'global_step/max_steps': '5/16', 'percentage': '31.25%', 'elapsed_time': '5m 28s', 'remaining_time': '12m 2s', 'memory(GiB)': 24.8, 'train_speed(iter/s)': 0.015218} -# Train: 31%|██████████████████████████████████████▍ | 5/16 [05:28<11:41, 63.77s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v45-20251231-160511/checkpoint-5 -# {'loss': 1.51323957, 'grad_norm': 0.39210615, 'learning_rate': 3.455e-05, 'token_acc': 0.62368014, 'epoch': 0.64, 'global_step/max_steps': '10/16', 'percentage': '62.50%', 'elapsed_time': '10m 22s', 'remaining_time': '6m 13s', 'memory(GiB)': 24.87, 'train_speed(iter/s)': 0.016054} -# Train: 62%|████████████████████████████████████████████████████████████████████████████▎ | 10/16 [10:22<05:37, 56.26s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v45-20251231-160511/checkpoint-10 -# {'loss': 1.36127844, 'grad_norm': 0.30676287, 'learning_rate': 1.09e-06, 'token_acc': 0.64411869, 'epoch': 0.96, 'global_step/max_steps': '15/16', 'percentage': '93.75%', 'elapsed_time': '15m 6s', 'remaining_time': '1m 0s', 'memory(GiB)': 24.87, 'train_speed(iter/s)': 0.016547} -# ... 
-# {'train_runtime': 962.7184, 'train_samples_per_second': 0.519, 'train_steps_per_second': 0.017, 'train_loss': 1.61728384, 'token_acc': 0.62789828, 'epoch': 1.0, 'global_step/max_steps': '16/16', 'percentage': '100.00%', 'elapsed_time': '16m 2s', 'remaining_time': '0s', 'memory(GiB)': 24.87, 'train_speed(iter/s)': 0.016624} -# Train: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [16:02<00:00, 60.16s/it] - - -# activation_cpu_offload=false - -# {'loss': 2.15452981, 'grad_norm': 1.7536869, 'learning_rate': 0.0001, 'token_acc': 0.61792799, 'epoch': 0.06, 'global_step/max_steps': '1/16', 'percentage': '6.25%', 'elapsed_time': '46s', 'remaining_time': '11m 39s', 'memory(GiB)': 26.14, 'train_speed(iter/s)': 0.021458} -# {'loss': 2.13306689, 'grad_norm': 1.7279824, 'learning_rate': 8.346e-05, 'token_acc': 0.58295639, 'epoch': 0.32, 'global_step/max_steps': '5/16', 'percentage': '31.25%', 'elapsed_time': '2m 55s', 'remaining_time': '6m 26s', 'memory(GiB)': 26.59, 'train_speed(iter/s)': 0.028456} -# Train: 31%|██████████████████████████████████████▍ | 5/16 [02:55<05:59, 32.65s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v44-20251231-155036/checkpoint-5 -# {'loss': 1.51308346, 'grad_norm': 0.39151499, 'learning_rate': 3.455e-05, 'token_acc': 0.62377399, 'epoch': 0.64, 'global_step/max_steps': '10/16', 'percentage': '62.50%', 'elapsed_time': '5m 18s', 'remaining_time': '3m 10s', 'memory(GiB)': 27.73, 'train_speed(iter/s)': 0.031432} -# Train: 62%|████████████████████████████████████████████████████████████████████████████▎ | 10/16 [05:18<02:51, 28.58s/it][INFO:swift] Saving model checkpoint to /model/ljl/output/v44-20251231-155036/checkpoint-10 -# {'loss': 1.36132231, 'grad_norm': 0.30557585, 'learning_rate': 1.09e-06, 'token_acc': 0.64442776, 'epoch': 0.96, 'global_step/max_steps': '15/16', 'percentage': '93.75%', 'elapsed_time': '7m 57s', 'remaining_time': '31s', 
'memory(GiB)': 27.96, 'train_speed(iter/s)': 0.031437} -# ... -# {'train_runtime': 507.5282, 'train_samples_per_second': 0.985, 'train_steps_per_second': 0.032, 'train_loss': 1.61732693, 'token_acc': 0.63051608, 'epoch': 1.0, 'global_step/max_steps': '16/16', 'percentage': '100.00%', 'elapsed_time': '8m 27s', 'remaining_time': '0s', 'memory(GiB)': 27.96, 'train_speed(iter/s)': 0.031543} -# Train: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [08:27<00:00, 31.70s/it] diff --git a/swift/callbacks/activation_cpu_offload.py b/swift/callbacks/activation_cpu_offload.py index fab3b699a5..a7d750f786 100644 --- a/swift/callbacks/activation_cpu_offload.py +++ b/swift/callbacks/activation_cpu_offload.py @@ -67,7 +67,7 @@ def update_model_parameters(self, model): self.model_parameters_storage = new_storage -def get_torch_device() -> any: +def get_torch_device() -> Any: """Return the corresponding torch attribute based on the device type string. Returns: module: The corresponding torch device namespace, or torch.cuda if not found. 
From de4d8dc91e061ced2e206ee13ae12bcc80f1a7f8 Mon Sep 17 00:00:00 2001
From: meichangsu1 <1484603386@qq.com>
Date: Thu, 5 Feb 2026 14:21:11 +0800
Subject: [PATCH 7/7] feat(activation_cpu_offload): update dataset and add NPU-specific sync copy

- Change training dataset from `AI-ModelScope/LongAlpaca-12k` to `AI-ModelScope/alpaca-gpt4-data-zh` in the example script
- Modify `SynchronizedGroupOffloadHandler.offload` to use synchronous, non-pinned memory copy when NPU is available, as NPU does not fully support async H2D/D2H with pinned memory
---
Review note: the offload() hunk was reworked by hand so the host buffer is
allocated and copied on a single code path instead of two duplicated branches.
Behaviour per device is unchanged. The blob ids on that file's `index` line are
therefore stale, and leading whitespace was restored from a whitespace-collapsed
copy; regenerate with `git format-patch` before sending.

 .../ascend/activation_cpu_offload/train.sh    |  8 ++++----
 swift/callbacks/activation_cpu_offload.py     | 21 +++++++++++++++++----
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/examples/ascend/activation_cpu_offload/train.sh b/examples/ascend/activation_cpu_offload/train.sh
index 185487a845..479ff53a02 100644
--- a/examples/ascend/activation_cpu_offload/train.sh
+++ b/examples/ascend/activation_cpu_offload/train.sh
@@ -4,7 +4,7 @@ NPROC_PER_NODE=2 \
 swift sft \
     --model 'Qwen/Qwen3-8B' \
     --train_type lora \
-    --dataset 'AI-ModelScope/LongAlpaca-12k#2000' \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh' \
     --torch_dtype bfloat16 \
     --num_train_epochs 1 \
     --per_device_train_batch_size 1 \
@@ -26,8 +26,8 @@ swift sft \
     --warmup_ratio 0.05 \
     --dataloader_num_workers 4 \
     --fsdp './examples/ascend/activation_cpu_offload/fsdp2.json'
-
 
+# --dataset AI-ModelScope/LongAlpaca-12k
 # activation_cpu_offload=false
 
 # {'loss': 2.93329144, 'grad_norm': 2.44835496, 'learning_rate': 0.0001, 'token_acc': 0.56405613, 'epoch': 0.06, 'global_step/max_steps': '1/16', 'percentage': '6.25%', 'elapsed_time': '8s', 'remaining_time': '2m 6s', 'memory(GiB)': 24.8, 'train_speed(iter/s)': 0.118837}
@@ -42,7 +42,7 @@ swift sft \
 # Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:19<00:00, 4.98s/it]
 
 
-
+# --dataset AI-ModelScope/LongAlpaca-12k
 # "activation_cpu_offload": true
 
 # {'loss': 2.93329144, 'grad_norm': 2.44853568, 'learning_rate': 0.0001, 'token_acc': 0.56405613, 'epoch': 0.06, 'global_step/max_steps': '1/16', 'percentage': '6.25%', 'elapsed_time': '26s', 'remaining_time': '6m 43s', 'memory(GiB)': 24.62, 'train_speed(iter/s)': 0.037168}
@@ -54,4 +54,4 @@ swift sft \
 # Train: 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████ | 15/16 [03:11<00:10, 10.03s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v59-20260202-125158/checkpoint-15
 # Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [03:20<00:00, 9.65s/it][INFO:swift] Saving model checkpoint to /model/ljl/project/ms-swift/output/v59-20260202-125158/checkpoint-16
 # {'train_runtime': 202.2537, 'train_samples_per_second': 2.487, 'train_steps_per_second': 0.079, 'train_loss': 1.91632293, 'token_acc': 0.67729084, 'epoch': 1.0, 'global_step/max_steps': '16/16', 'percentage': '100.00%', 'elapsed_time': '3m 22s', 'remaining_time': '0s', 'memory(GiB)': 26.94, 'train_speed(iter/s)': 0.078996}
-# Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [03:22<00:00, 12.66s/it]
\ No newline at end of file
+# Train: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [03:22<00:00, 12.66s/it]
diff --git a/swift/callbacks/activation_cpu_offload.py b/swift/callbacks/activation_cpu_offload.py
index a7d750f786..35ea0c609e 100644
--- a/swift/callbacks/activation_cpu_offload.py
+++ b/swift/callbacks/activation_cpu_offload.py
@@ -199,14 +199,27 @@ def on_group_commit_backward(self):
     @staticmethod
     def offload(src_tensor, pin_memory=True):
-        """Offload."""
-
+        """Copy ``src_tensor`` into a newly allocated CPU tensor.
+
+        Args:
+            src_tensor: Tensor to back up on the host.
+            pin_memory: Whether the host buffer is allocated in pinned memory.
+                Forced to False when ``is_npu_available`` is truthy.
+
+        Returns:
+            A ``(src_tensor.device, cpu_backup)`` tuple.
+        """
+        # NPU doesn't fully support async H2D/D2H with pinned memory, so it takes a
+        # blocking copy into pageable memory; other devices keep the non-blocking copy.
+        # NOTE(review): `is_npu_available` is read as a value here -- confirm it is a
+        # bool and not a callable, since a function object is always truthy.
+        use_async = not is_npu_available
         cpu_backup = torch.empty(
             src_tensor.size(),
             dtype=src_tensor.dtype,
             layout=src_tensor.layout,
             device='cpu',
-            pin_memory=pin_memory,
+            pin_memory=use_async and pin_memory,
         )
-        cpu_backup.copy_(src_tensor, non_blocking=True)
+        cpu_backup.copy_(src_tensor, non_blocking=use_async)
         state = (src_tensor.device, cpu_backup)
         return state