Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .azure/app-cloud-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,13 @@ jobs:
# It also e2e tests running on cloud without installing dependencies.
- bash: |
git clone https://github.com/Lightning-AI/lightning-quick-start examples/app/quick-start
condition: eq(variables['name'], 'quick_start')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['name'], 'quick_start'))
displayName: 'Clone Quick start Repo'
- bash: |
git clone https://github.com/Lightning-AI/lightning-template-react examples/app/template_react_ui
condition: eq(variables['name'], 'template_react_ui')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['name'], 'template_react_ui'))
displayName: 'Clone Template React UI Repo'

# Replace imports to use `lightning` instead of `lightning_app` since we install lightning only ATM
Expand Down
3 changes: 2 additions & 1 deletion .azure/gpu-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ jobs:

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/parity_fabric
condition: eq(variables['PACKAGE_NAME'], 'fabric')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: fabric standalone tasks'
Expand Down
6 changes: 4 additions & 2 deletions .azure/gpu-tests-fabric.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ jobs:

- bash: python -m pytest lightning_fabric
workingDirectory: src
condition: eq(variables['PACKAGE_NAME'], 'fabric')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
displayName: 'Testing: Fabric doctests'

- bash: |
Expand All @@ -141,7 +142,8 @@ jobs:
python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
--source_import="lightning.fabric" \
--target_import="lightning_fabric"
condition: eq(variables['PACKAGE_NAME'], 'fabric')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
displayName: 'Adjust tests & examples'

- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
Expand Down
12 changes: 8 additions & 4 deletions .azure/gpu-tests-pytorch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,13 @@ jobs:
displayName: 'Bump to nightly'

- bash: pip uninstall -y lightning
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'pytorch'))
# Lightning is dependency of Habana or other accelerators/integrations so in case we test PL we need to remove it
displayName: 'Drop LAI from extensions'
- bash: pip uninstall -y pytorch-lightning
condition: eq(variables['PACKAGE_NAME'], 'lightning')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'lightning'))
displayName: 'Drop PL for LAI'

- bash: |
Expand All @@ -149,7 +151,8 @@ jobs:

- bash: python -m pytest pytorch_lightning
workingDirectory: src
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'pytorch'))
displayName: 'Testing: PyTorch doctests'

- bash: |
Expand All @@ -159,7 +162,8 @@ jobs:
python .actions/assistant.py copy_replace_imports --source_dir="./examples/pytorch/basics" \
--source_import="lightning.fabric,lightning.pytorch" \
--target_import="lightning_fabric,pytorch_lightning"
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'pytorch'))
displayName: 'Adjust tests & examples'

- bash: |
Expand Down
1 change: 1 addition & 0 deletions .github/checkgroup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ subprojects:
- "build-cuda (3.9, 1.12, 11.7.1)"
- "build-cuda (3.9, 1.13, 12.0.1)"
- "build-cuda (3.10, 2.0, 12.0.1)"
- "build-cuda (3.10, 2.0, 11.7.1)"
#- "build-NGC"
- "build-pl (3.9, 1.11, 11.3.1)"
- "build-pl (3.9, 1.12, 11.7.1)"
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci-dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.7.1"}
- {python_version: "3.9", pytorch_version: "1.13", cuda_version: "12.0.1"}
- {python_version: "3.10", pytorch_version: "2.0", cuda_version: "12.0.1"}
# these are used in Azure GPU CI
- {python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.7.1"}
steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/ci-tests-fabric.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,7 @@ jobs:
run: |
python -m pip install -q pip -U
extra=$(python -c "print({'lightning': 'fabric-'}.get('${{ matrix.pkg-name }}', ''))")
pip install -e ".[${extra}test]" "pytest-timeout" -U -f ${TORCH_URL} ${TORCH_PREINSTALL} -f ${PYPI_CACHE_DIR} --prefer-binary
pip install -r requirements/fabric/strategies.txt -f ${PYPI_CACHE_DIR} --prefer-binary
pip install -e ".[${extra}test,${extra}strategies]" "pytest-timeout" -U -f ${TORCH_URL} ${TORCH_PREINSTALL} -f ${PYPI_CACHE_DIR} --prefer-binary
pip list
- name: Dump handy wheels
if: github.event_name == 'push' && github.ref == 'refs/heads/master'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci-tests-pytorch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ jobs:
run: |
python -m pip install -q pip -U
extra=$(python -c "print({'lightning': 'pytorch-'}.get('${{ matrix.pkg-name }}', ''))")
pip install ".[${extra}extra,${extra}test]" -U \
pip install ".[${extra}extra,${extra}test,${extra}strategies]" -U \
"pytest-timeout" -r requirements/_integrations/accelerators.txt \
-f ${TORCH_URL} ${TORCH_PREINSTALL} -f ${PYPI_CACHE_DIR} --prefer-binary
pip list
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ jobs:
env:
JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.runtime }}-${{ env.SHA }}
run: |
gcloud compute tpus tpu-vm create "$JOB_NAME" --accelerator-type=v4-8 --version="tpu-vm-v4-pt-$XLA_VER"
gcloud compute tpus tpu-vm create "$JOB_NAME" --accelerator-type=v4-8 --version="tpu-vm-v4-pt-$XLA_VER" --preemptible

- name: Cancel job
if: steps.tpu-create.outcome != 'success'
Expand Down
2 changes: 1 addition & 1 deletion requirements/fabric/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
numpy >=1.17.2, <1.25.1
torch >=1.11.0, <2.1.0
fsspec[http]>2021.06.0, <2023.5.0
packaging >=17.1, <=23.0
packaging >=20.0, <=23.0
typing-extensions >=4.0.0, <=4.4.0
lightning-utilities >=0.8.0, <0.10.0
2 changes: 1 addition & 1 deletion requirements/pytorch/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ tqdm >=4.57.0, <4.66.0
PyYAML >=5.4, <=6.0
fsspec[http] >2021.06.0, <2023.5.0
torchmetrics >=0.7.0, <1.1.0 # needed for using fixed compare_version
packaging >=17.1, <=23.0
packaging >=20.0, <=23.0
typing-extensions >=4.0.0, <=4.4.0
lightning-utilities >=0.8.0, <0.10.0
3 changes: 2 additions & 1 deletion src/lightning/fabric/accelerators/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,8 @@ def _check_cuda_matmul_precision(device: torch.device) -> None:


def _clear_cuda_memory() -> None:
if _TORCH_GREATER_EQUAL_2_0:
# strangely, the attribute can be undefined when torch.compile is used
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

verb is missing "function [...] be undefined"

if _TORCH_GREATER_EQUAL_2_0 and hasattr(torch._C, "_cuda_clearCublasWorkspaces"):
# https://github.com/pytorch/pytorch/issues/95668
torch._C._cuda_clearCublasWorkspaces()
torch.cuda.empty_cache()
9 changes: 8 additions & 1 deletion tests/tests_fabric/strategies/test_fsdp_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,14 @@ def test_setup_with_orig_params_and_multiple_param_groups():

@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, dynamo=True)
@mock.patch.dict(os.environ, {})
@pytest.mark.parametrize("compile_after_setup", [False, True])
@pytest.mark.parametrize(
"compile_after_setup",
[
False,
# https://github.com/pytorch/pytorch/issues/97811
pytest.param(True, marks=RunIf(min_python="3.9")),
],
)
def test_compile(compile_after_setup):
"""Test that the model can be compiled before and after the model is wrapped in FSDP."""
model = BoringModel()
Expand Down
12 changes: 1 addition & 11 deletions tests/tests_pytorch/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def __new__(cls, self, *args, **kwargs):

# torch doesn't allow creation of mps devices on older versions
monkeypatch.setattr("torch.device", MpsDeviceMock)
monkeypatch.setattr(lightning.fabric.accelerators.mps, "_get_all_available_mps_gpus", lambda: list(range(n)))
monkeypatch.setattr(lightning.fabric.accelerators.mps, "_get_all_available_mps_gpus", lambda: [0] if n > 0 else [])
monkeypatch.setattr(lightning.fabric.accelerators.mps.MPSAccelerator, "is_available", lambda *_: n > 0)


Expand All @@ -169,16 +169,6 @@ def mps_count_1(monkeypatch):
mock_mps_count(monkeypatch, 1)


@pytest.fixture()
def mps_count_2(monkeypatch):
mock_mps_count(monkeypatch, 2)


@pytest.fixture()
def mps_count_4(monkeypatch):
mock_mps_count(monkeypatch, 4)


def mock_xla_available(monkeypatch: pytest.MonkeyPatch, value: bool = True) -> None:
monkeypatch.setattr(lightning.pytorch.strategies.xla, "_XLA_AVAILABLE", value)
monkeypatch.setattr(lightning.pytorch.strategies.single_xla, "_XLA_AVAILABLE", value)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,7 @@ def get_defaults(cls):


@RunIf(min_cuda_gpus=1) # trigger this test on our GPU pipeline, because we don't install the package on the CPU suite
@pytest.mark.xfail(raises=ImportError, reason="Not updated to latest API")
@pytest.mark.skipif(not package_available("lightning_colossalai"), reason="Requires Colossal AI Strategy")
def test_colossalai_external_strategy(monkeypatch):
with mock.patch(
Expand All @@ -795,6 +796,7 @@ def test_colossalai_external_strategy(monkeypatch):


@RunIf(min_cuda_gpus=1) # trigger this test on our GPU pipeline, because we don't install the package on the CPU suite
@pytest.mark.xfail(raises=ImportError, reason="Not updated to latest API")
@pytest.mark.skipif(not package_available("lightning_bagua"), reason="Requires Bagua Strategy")
def test_bagua_external_strategy(monkeypatch):
with mock.patch(
Expand Down