diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0dc61b7ffe..24e434a4c8 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -66,12 +66,13 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-#      - name: Restore Cache from S3
-#        id: hf-cache-restore-s3
-#        run: |
-#          mkdir -p ~/.cache/huggingface/hub
-#          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
-#
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p ~/.cache/huggingface/hub
+          curl -fL https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
+          ls -ltr ~/.cache/huggingface/hub/
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -111,6 +112,9 @@ jobs:
         run: |
           huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
 
+      - name: Show HF cache (before tests)
+        run: hf cache scan
+
       - name: Run tests
         run: |
           df -h
@@ -122,6 +126,9 @@ jobs:
           df -h
           pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
 
+      - name: Show HF cache (after tests)
+        run: hf cache scan
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
         with:
@@ -149,12 +156,13 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-#      - name: Restore Cache from S3
-#        id: hf-cache-restore-s3
-#        run: |
-#          mkdir -p ~/.cache/huggingface/hub
-#          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
-#
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p ~/.cache/huggingface/hub
+          curl -fL https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
+          ls -ltr ~/.cache/huggingface/hub/
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -200,6 +208,9 @@ jobs:
           pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
           pytest -v --durations=10 tests/cli/
 
+      - name: Show HF cache (after tests)
+        run: hf cache scan
+
   gate-skip-e2e:
     needs: [pre-commit, pytest, pytest-sdist]
     runs-on: ubuntu-latest
diff --git a/cicd/multigpu.sh b/cicd/multigpu.sh
index 3ec4456b97..307dd4960e 100755
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,7 +2,7 @@
 set -e
 
 # Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v --durations=10 -n2 \
+pytest -v --durations=10 -n2 --maxfail=4 \
   --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
   --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
   /workspace/axolotl/tests/e2e/multigpu/ \
diff --git a/requirements.txt b/requirements.txt
index 5e1af69408..c87bb21478 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,12 +14,12 @@ huggingface_hub>=0.36.0
 peft>=0.18.0
 tokenizers>=0.22.1
 transformers==4.57.1
-accelerate==1.11.0
-datasets==4.4.1
-deepspeed>=0.17.0
-trl==0.25.0
+accelerate==1.12.0
+datasets==4.4.2
+deepspeed>=0.18.3
+trl==0.25.1
 hf_xet==1.2.0
-kernels>=0.9.0
+kernels==0.11.5
 trackio>=0.13.0
 typing_extensions>=4.14.0
 
diff --git a/setup.py b/setup.py
index e22df40c82..10c9a84539 100644
--- a/setup.py
+++ b/setup.py
@@ -156,7 +156,7 @@ def get_package_version():
         "came_pytorch==0.1.3",
     ],
     "ray": [
-        "ray[train]",
+        "ray[train]>=2.52.1",
     ],
     "vllm": [
         "vllm==0.10.0",
diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index 850517ded7..3a08d0574b 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -356,6 +356,7 @@ def compute_loss(
             inputs_key = "labels" if "labels" in inputs else "input_ids"
             trainable_tokens = (inputs[inputs_key] != -100).sum()
             total_tokens = inputs[inputs_key].numel()
+            total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device)
 
             if is_distributed():
                 torch.distributed.all_reduce(
@@ -375,9 +376,7 @@ def compute_loss(
             self.state.tokens["trainable"] = (
                 self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
             )
-            self.state.tokens["total"] = (
-                self.state.tokens["total"] + torch.as_tensor(total_tokens).cpu()
-            )
+            self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu()
 
             # Store per-step trainable tokens for throughput calculation
             self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()
diff --git a/src/axolotl/monkeypatch/accelerate/parallelism_config.py b/src/axolotl/monkeypatch/accelerate/parallelism_config.py
index b2157fb6b6..9b71e914ac 100644
--- a/src/axolotl/monkeypatch/accelerate/parallelism_config.py
+++ b/src/axolotl/monkeypatch/accelerate/parallelism_config.py
@@ -75,3 +75,44 @@ def patch_parallelism_config():
 
     ParallelismConfig._validate_accelerator = _validate_accelerator
     AcceleratorState.is_fsdp2 = property(patched_is_fsdp2)
+
+
+def patch_prepare_cp():
+    """
+    Monkeypatch `Accelerator._prepare_cp` with the replacement defined below.
+
+    The replacement returns its arguments untouched when the context-parallel
+    backend is "deepspeed". Otherwise it sets the rotate method from the CP
+    handler, stores a `context_parallel` partial bound to the "cp" device mesh
+    on the accelerator, and attaches context-parallel hooks to every
+    `torch.nn.Module` argument. The assignment is made on the class, so it
+    applies to all `Accelerator` instances in the process.
+    """
+    import functools
+
+    import torch
+    from accelerate import Accelerator
+
+    def patched_prepare_cp(self, *args):
+        # deepspeed backend: hand the objects back unchanged
+        if self.parallelism_config.cp_backend == "deepspeed":
+            return args
+
+        from accelerate.big_modeling import _attach_context_parallel_hooks
+        from torch.distributed.tensor.experimental import context_parallel
+        from torch.distributed.tensor.experimental._attention import set_rotate_method
+
+        cp_comm_strategy = self.parallelism_config.cp_handler.cp_comm_strategy
+        set_rotate_method(cp_comm_strategy)
+
+        self._cp_context = functools.partial(
+            context_parallel, mesh=self.torch_device_mesh["cp"]
+        )
+
+        for arg in args:
+            if isinstance(arg, torch.nn.Module):
+                _attach_context_parallel_hooks(arg)
+
+        return args
+
+    Accelerator._prepare_cp = patched_prepare_cp
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 3628fd85f1..fb381a8a19 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -645,6 +645,9 @@ def setup_parallelism_envs(cfg):
         set_accelerate_parallelism_config = True
         os.environ["PARALLELISM_CONFIG_CP_SIZE"] = str(cfg.context_parallel_size)
         os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"
+        from axolotl.monkeypatch.accelerate.parallelism_config import patch_prepare_cp
+
+        patch_prepare_cp()
 
     if set_accelerate_parallelism_config:
         os.environ["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true"
diff --git a/tests/conftest.py b/tests/conftest.py
index d3b9407ec4..4c8c80cb7f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -62,7 +62,7 @@ def snapshot_download_w_retry(*args, **kwargs):
     """
     with hf_offline_context(True):
         try:
-            return snapshot_download(*args, **kwargs)
+            return snapshot_download(*args, local_files_only=True, **kwargs)
         except LocalEntryNotFoundError:
             pass
     with hf_offline_context(False):
diff --git a/tests/hf_offline_utils.py b/tests/hf_offline_utils.py
index 0e4a2f067e..221db1c515 100644
--- a/tests/hf_offline_utils.py
+++ b/tests/hf_offline_utils.py
@@ -6,8 +6,6 @@
 from contextlib import contextmanager
 from functools import wraps
 
-from huggingface_hub.utils import reset_sessions
-
 
 def reload_modules(hf_hub_offline):
     # Force reload of the modules that check this variable
@@ -21,7 +19,6 @@ def reload_modules(hf_hub_offline):
     huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
     importlib.reload(datasets.config)
     datasets.config.HF_HUB_OFFLINE = hf_hub_offline
-    reset_sessions()
 
 
 def enable_hf_offline(test_func):