From 2a8789e1c680833d550cf4569e6b77bf40261ad4 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Wed, 13 Dec 2023 20:27:49 +0100
Subject: [PATCH] ci/tests: cleaning standalone script (#19141)

* tests: cleaning standalone script

* switch

* from tests

* -m

* collect

* array

* tests_fabric/

* ..

* path prefix

* pl

* cleaning

* test_pytorch_profiler_nested_emit_nvtx

* Apply suggestions from code review

* Apply suggestions from code review

* todo

---
 .azure/gpu-tests-fabric.yml                   | 18 +++---
 .azure/gpu-tests-pytorch.yml                  |  7 +--
 pyproject.toml                                |  2 +-
 .../pytorch/utilities/testing/_runif.py       |  4 +-
 .../run_standalone_tests.sh                   | 59 ++++++++-----------
 tests/tests_fabric/conftest.py                | 33 ++++++-----
 tests/tests_fabric/run_standalone_tests.sh    |  1 -
 tests/tests_fabric/run_tpu_tests.sh           |  2 +-
 tests/tests_pytorch/conftest.py               | 33 ++++++-----
 .../tests_pytorch/profilers/test_profiler.py  | 30 +++++-----
 tests/tests_pytorch/run_standalone_tasks.sh   | 15 +++--
 tests/tests_pytorch/run_tpu_tests.sh          |  2 +-
 12 files changed, 99 insertions(+), 107 deletions(-)
 rename tests/{tests_pytorch => }/run_standalone_tests.sh (55%)
 delete mode 120000 tests/tests_fabric/run_standalone_tests.sh

diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml
index da477107e8467..a84a426e6d296 100644
--- a/.azure/gpu-tests-fabric.yml
+++ b/.azure/gpu-tests-fabric.yml
@@ -49,6 +49,7 @@ jobs:
      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
      FREEZE_REQUIREMENTS: "1"
      PIP_CACHE_DIR: "/var/tmp/pip"
+     PL_RUN_CUDA_TESTS: "1"
    container:
      image: $(image)
      # default shm size is 64m. Increase it to avoid:
@@ -126,19 +127,16 @@ jobs:
         condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
         displayName: "Adjust tests & examples"
 
-      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
-        workingDirectory: tests/tests_fabric
-        env:
-          PL_RUN_CUDA_TESTS: "1"
+      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
+        workingDirectory: tests/tests_fabric/
         displayName: "Testing: fabric standard"
         timeoutInMinutes: "10"
 
-      - bash: bash run_standalone_tests.sh
-        workingDirectory: tests/tests_fabric
+      - bash: bash ../run_standalone_tests.sh "."
+        workingDirectory: tests/tests_fabric/
         env:
-          PL_RUN_CUDA_TESTS: "1"
           PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
-        displayName: "Testing: fabric standalone tests"
+        displayName: "Testing: fabric standalone"
         timeoutInMinutes: "10"
 
       - bash: |
@@ -152,12 +150,12 @@ jobs:
           ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
             --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
           ls -l
-        workingDirectory: tests/tests_fabric
+        workingDirectory: tests/tests_fabric/
         displayName: "Statistics"
 
       - script: |
           set -e
          bash run_fabric_examples.sh --accelerator=cuda --devices=1
           bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
-        workingDirectory: examples
+        workingDirectory: examples/
         displayName: "Testing: fabric examples"
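With PL_RUN_CUDA_TESTS moved up to the job-level variables, the "fabric standard" step above becomes a plain pytest run and the GPU-only filtering is left to the conftest hooks shown further down. A local equivalent is roughly the following sketch (assumes a CUDA machine; `lightning` stands in for ${COVERAGE_SOURCE} and is also the standalone script's default source):

    cd tests/tests_fabric
    export PL_RUN_CUDA_TESTS=1
    python -m coverage run --source lightning -m pytest . -v --durations=50
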
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index f154eb6632feb..19390490f0091 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -59,6 +59,7 @@ jobs:
      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
      FREEZE_REQUIREMENTS: "1"
      PIP_CACHE_DIR: "/var/tmp/pip"
+     PL_RUN_CUDA_TESTS: "1"
    container:
      image: $(image)
      # default shm size is 64m. Increase it to avoid:
@@ -154,16 +155,13 @@ jobs:
 
       - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
         workingDirectory: tests/tests_pytorch
-        env:
-          PL_RUN_CUDA_TESTS: "1"
         displayName: "Testing: PyTorch standard"
         timeoutInMinutes: "35"
 
-      - bash: bash run_standalone_tests.sh
+      - bash: bash ../run_standalone_tests.sh "."
         workingDirectory: tests/tests_pytorch
         env:
           PL_USE_MOCKED_MNIST: "1"
-          PL_RUN_CUDA_TESTS: "1"
           PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
         displayName: "Testing: PyTorch standalone tests"
         timeoutInMinutes: "35"
@@ -172,7 +170,6 @@ jobs:
         workingDirectory: tests/tests_pytorch
         env:
           PL_USE_MOCKED_MNIST: "1"
-          PL_RUN_CUDA_TESTS: "1"
         displayName: "Testing: PyTorch standalone tasks"
         timeoutInMinutes: "10"
diff --git a/pyproject.toml b/pyproject.toml
index b1a0dbd9f83c1..b78e03aed5c5a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -269,7 +269,7 @@ addopts = [
     "--ignore=legacy/checkpoints",
 ]
 markers = [
-    "cloud:Run the cloud tests for example",
+    "cloud: Run the cloud tests for example",
 ]
 filterwarnings = [
     "error::FutureWarning",
diff --git a/src/lightning/pytorch/utilities/testing/_runif.py b/src/lightning/pytorch/utilities/testing/_runif.py
index 3c67260a88bed..c3e0262d9906f 100644
--- a/src/lightning/pytorch/utilities/testing/_runif.py
+++ b/src/lightning/pytorch/utilities/testing/_runif.py
@@ -16,7 +16,7 @@
 from lightning_utilities.core.imports import RequirementCache
 
 from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
-from lightning.fabric.utilities.testing import _runif_reasons as FabricRunIf
+from lightning.fabric.utilities.testing import _runif_reasons as fabric_run_if
 from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
 from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
 from lightning.pytorch.core.module import _ONNX_AVAILABLE
@@ -68,7 +68,7 @@ def _runif_reasons(
 
     """
-    reasons, kwargs = FabricRunIf(
+    reasons, kwargs = fabric_run_if(
         min_cuda_gpus=min_cuda_gpus,
         min_torch=min_torch,
         max_torch=max_torch,
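The pyproject.toml tweak only inserts the space in the `name: description` form that pytest documents for registered markers; marker selection itself is unchanged, e.g. (illustrative):

    python -m pytest -m cloud -v

The _runif.py change is likewise cosmetic: the `_runif_reasons` helper keeps its behaviour and is merely re-aliased from the class-like `FabricRunIf` to the snake_case `fabric_run_if`.
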
diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/run_standalone_tests.sh
similarity index 55%
rename from tests/tests_pytorch/run_standalone_tests.sh
rename to tests/run_standalone_tests.sh
index a8f7432eee3fb..19d233794719f 100644
--- a/tests/tests_pytorch/run_standalone_tests.sh
+++ b/tests/run_standalone_tests.sh
@@ -23,25 +23,19 @@ source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}"
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 # python arguments
-defaults="-m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120"
+defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 "
 echo "Using defaults: ${defaults}"
-# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
-grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')
+
+# get the testing location as the first argument
+test_path=$1
+printf "source path: $test_path\n"
 
-# file paths, remove duplicates
-files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq)
-
-# get the list of parametrizations. we need to call them separately. the last two lines are removed.
-# note: if there's a syntax error, this will fail with some garbled output
-if [[ "$OSTYPE" == "darwin"* ]]; then
-  parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r)
-else
-  parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | head -n -2)
-fi
-# remove the "tests/tests_pytorch/" path suffixes
-path_suffix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
-parametrizations=${parametrizations//$path_suffix/}
+# collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS
+standalone_tests=$(python -m pytest $test_path -q --collect-only --pythonwarnings ignore)
+printf "Collected tests: \n $standalone_tests"
+# match only lines with tests
+parametrizations=$(grep -oP '\S+::test_\S+' <<< "$standalone_tests")
 # convert the list to be array
 parametrizations_arr=($parametrizations)
 
 report=''
@@ -61,30 +55,25 @@ function show_batched_output {
 }
 trap show_batched_output EXIT  # show the output on exit
 
+# remove the "tests/tests_pytorch/" path suffixes
+path_prefix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
 for i in "${!parametrizations_arr[@]}"; do
-  parametrization=${parametrizations_arr[$i]}
+  parametrization=${parametrizations_arr[$i]//$path_prefix/}
   prefix="$((i+1))/${#parametrizations_arr[@]}"
 
-  # check blocklist
-  if [[ "${parametrization}" == *"test_pytorch_profiler_nested_emit_nvtx"* ]]; then
-    echo "$prefix: Skipping $parametrization"
-    report+="Skipped\t$parametrization\n"
-    # do not continue the loop because we might need to wait for batched jobs
-  else
-    echo "$prefix: Running $parametrization"
+  echo "$prefix: Running $parametrization"
 
-    # fix the port to avoid race condition when batched distributed tests select the port randomly
-    export MASTER_PORT=$((29500 + $i % $test_batch_size))
+  # fix the port to avoid race condition when batched distributed tests select the port randomly
+  export MASTER_PORT=$((29500 + $i % $test_batch_size))
 
-    # execute the test in the background
-    # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
-    # output to std{out,err} because the outputs would be garbled together
-    python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
-    # save the PID in an array
-    pids[${i}]=$!
-    # add row to the final report
-    report+="Ran\t$parametrization\n"
-  fi
+  # execute the test in the background
+  # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
+  # output to std{out,err} because the outputs would be garbled together
+  python ${defaults} "$parametrization" &>> standalone_test_output.txt &
+  # save the PID in an array
+  pids[${i}]=$!
+  # add row to the final report
+  report+="Ran\t$parametrization\n"
 
   if ((($i + 1) % $test_batch_size == 0)); then
     # wait for running tests
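The selection above now leans on pytest's own collection instead of grepping for `@RunIf(standalone=True)`: with PL_RUN_STANDALONE_TESTS=1 exported, the conftest hooks below drop everything that is not standalone, and the script only scrapes node IDs from the collector output. A rough sketch of that step (the printed ID is a placeholder):

    cd tests/tests_pytorch
    export PL_RUN_STANDALONE_TESTS=1
    python -m pytest . -q --collect-only --pythonwarnings ignore \
      | grep -oP '\S+::test_\S+'
    # one node ID per line, e.g. some_dir/test_file.py::test_name[param0]

Since pytest's summary lines never match `\S+::test_\S+`, the old OS-specific `head`/`tail` trimming is no longer needed; note that `-P` (PCRE) does require GNU grep.
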
diff --git a/tests/tests_fabric/conftest.py b/tests/tests_fabric/conftest.py
index 6edce6c85b8e0..221efe2d4155f 100644
--- a/tests/tests_fabric/conftest.py
+++ b/tests/tests_fabric/conftest.py
@@ -192,22 +192,23 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C
 
     for kwarg, env_var in options.items():
         # this will compute the intersection of all tests selected per environment variable
-        if os.getenv(env_var, "0") == "1":
-            conditions.append(env_var)
-            for i, test in reversed(list(enumerate(items))):  # loop in reverse, since we are going to pop items
-                already_skipped = any(marker.name == "skip" for marker in test.own_markers)
-                if already_skipped:
-                    # the test was going to be skipped anyway, filter it out
-                    items.pop(i)
-                    skipped += 1
-                    continue
-                has_runif_with_kwarg = any(
-                    marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
-                )
-                if not has_runif_with_kwarg:
-                    # the test has `@RunIf(kwarg=True)`, filter it out
-                    items.pop(i)
-                    filtered += 1
+        if os.getenv(env_var, "0") != "1":
+            continue
+        conditions.append(env_var)
+        for i, test in reversed(list(enumerate(items))):  # loop in reverse, since we are going to pop items
+            already_skipped = any(marker.name == "skip" for marker in test.own_markers)
+            if already_skipped:
+                # the test was going to be skipped anyway, filter it out
+                items.pop(i)
+                skipped += 1
+                continue
+            has_runif_with_kwarg = any(
+                marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
+            )
+            if not has_runif_with_kwarg:
+                # the test has `@RunIf(kwarg=True)`, filter it out
+                items.pop(i)
+                filtered += 1
 
     if config.option.verbose >= 0 and (filtered or skipped):
         writer = config.get_terminal_writer()
diff --git a/tests/tests_fabric/run_standalone_tests.sh b/tests/tests_fabric/run_standalone_tests.sh
deleted file mode 120000
index 23049489b7160..0000000000000
--- a/tests/tests_fabric/run_standalone_tests.sh
+++ /dev/null
@@ -1 +0,0 @@
-../tests_pytorch/run_standalone_tests.sh
\ No newline at end of file
diff --git a/tests/tests_fabric/run_tpu_tests.sh b/tests/tests_fabric/run_tpu_tests.sh
index 2f98b2a258e48..ca59a001927c9 100644
--- a/tests/tests_fabric/run_tpu_tests.sh
+++ b/tests/tests_fabric/run_tpu_tests.sh
@@ -35,7 +35,7 @@ cd tests/tests_fabric
 PL_RUN_TPU_TESTS=1 python3 -m coverage run --source=lightning -m pytest -vv --durations=0 --timeout 60 ./
 
 echo "--- Running standalone Fabric tests ---"
-PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
+PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash ../run_standalone_tests.sh "."
 
 echo "--- Generating coverage ---"
 python3 -m coverage xml
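Both suites now share the single script one directory up and pass the test location as its first argument, exactly as the CI jobs and TPU scripts do. A local GPU run would look roughly like this (sketch, assumes a CUDA machine):

    cd tests/tests_fabric   # or tests/tests_pytorch
    PL_RUN_CUDA_TESTS=1 PL_STANDALONE_TESTS_SOURCE=lightning bash ../run_standalone_tests.sh "."

The conftest edit itself is a pure guard-clause refactor: `if os.getenv(env_var, "0") == "1":` becomes an early `continue` on the negated condition, removing one indentation level without changing which tests get filtered out.
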
diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py
index aabea254b1365..7194fdc862738 100644
--- a/tests/tests_pytorch/conftest.py
+++ b/tests/tests_pytorch/conftest.py
@@ -318,22 +318,23 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C
 
     for kwarg, env_var in options.items():
         # this will compute the intersection of all tests selected per environment variable
-        if os.getenv(env_var, "0") == "1":
-            conditions.append(env_var)
-            for i, test in reversed(list(enumerate(items))):  # loop in reverse, since we are going to pop items
-                already_skipped = any(marker.name == "skip" for marker in test.own_markers)
-                if already_skipped:
-                    # the test was going to be skipped anyway, filter it out
-                    items.pop(i)
-                    skipped += 1
-                    continue
-                has_runif_with_kwarg = any(
-                    marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
-                )
-                if not has_runif_with_kwarg:
-                    # the test has `@RunIf(kwarg=True)`, filter it out
-                    items.pop(i)
-                    filtered += 1
+        if os.getenv(env_var, "0") != "1":
+            continue
+        conditions.append(env_var)
+        for i, test in reversed(list(enumerate(items))):  # loop in reverse, since we are going to pop items
+            already_skipped = any(marker.name == "skip" for marker in test.own_markers)
+            if already_skipped:
+                # the test was going to be skipped anyway, filter it out
+                items.pop(i)
+                skipped += 1
+                continue
+            has_runif_with_kwarg = any(
+                marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
+            )
+            if not has_runif_with_kwarg:
+                # the test has `@RunIf(kwarg=True)`, filter it out
+                items.pop(i)
+                filtered += 1
 
     if config.option.verbose >= 0 and (filtered or skipped):
         writer = config.get_terminal_writer()
diff --git a/tests/tests_pytorch/profilers/test_profiler.py b/tests/tests_pytorch/profilers/test_profiler.py
index 56d82734dcf14..0838d5b1b8c5a 100644
--- a/tests/tests_pytorch/profilers/test_profiler.py
+++ b/tests/tests_pytorch/profilers/test_profiler.py
@@ -471,20 +471,22 @@ def look_for_trace(trace_dir):
     assert look_for_trace(tmpdir / "lightning_logs" / "version_0")
 
 
-@RunIf(min_cuda_gpus=1, standalone=True)
-def test_pytorch_profiler_nested_emit_nvtx():
-    """This test check emit_nvtx is correctly supported."""
-    profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
-    model = BoringModel()
-    trainer = Trainer(
-        fast_dev_run=True,
-        profiler=profiler,
-        accelerator="gpu",
-        devices=1,
-        enable_progress_bar=False,
-        enable_model_summary=False,
-    )
-    trainer.fit(model)
+# Todo: this test has not been running as all our CI GPU runners have a higher CUDA compute capability
+# @RunIf(min_cuda_gpus=1, standalone=True)
+# @pytest.mark.skipif(torch.cuda.get_device_capability()[0] >= 8)
+# def test_pytorch_profiler_nested_emit_nvtx():
+#     """This test check emit_nvtx is correctly supported."""
+#     profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
+#     model = BoringModel()
+#     trainer = Trainer(
+#         fast_dev_run=True,
+#         profiler=profiler,
+#         accelerator="gpu",
+#         devices=1,
+#         enable_progress_bar=False,
+#         enable_model_summary=False,
+#     )
+#     trainer.fit(model)
 
 
 def test_register_record_function(tmpdir):
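The nvtx test is parked rather than deleted because `emit_nvtx` needs the legacy `nvprof` profiler, and `nvprof` does not support GPUs with CUDA compute capability 8.0 or newer, which is what the CI runners now have. The (now commented) gate in run_standalone_tasks.sh below is the authoritative check; to see whether a given machine could still run it:

    python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)"
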
diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh
index 7648adfb45b27..6f69046dff72d 100644
--- a/tests/tests_pytorch/run_standalone_tasks.sh
+++ b/tests/tests_pytorch/run_standalone_tasks.sh
@@ -18,11 +18,16 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
 
-can_run_nvprof=$(python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)")
-if [[ $can_run_nvprof == "True" ]]; then
-  echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx"
-  nvprof --profile-from-start off -o trace_name.prof -- python -m coverage run --source lightning.pytorch --append -m pytest --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
-fi
+#can_run_nvprof=$(python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)")
+#if [[ $can_run_nvprof == "True" ]]; then
+#  echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx"
+#  nvprof --profile-from-start off \
+#      -o trace_name.prof \
+#      -- python -m coverage run \
+#      --source lightning.pytorch \
+#      --append -m pytest \
+#      --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
+#fi
 
 # test that a user can manually launch individual processes
 echo "Running manual ddp launch test"
diff --git a/tests/tests_pytorch/run_tpu_tests.sh b/tests/tests_pytorch/run_tpu_tests.sh
index ec5f8d44c9ef6..bdb21ef0531a8 100644
--- a/tests/tests_pytorch/run_tpu_tests.sh
+++ b/tests/tests_pytorch/run_tpu_tests.sh
@@ -34,7 +34,7 @@ cd tests/tests_pytorch
 PL_RUN_TPU_TESTS=1 python3 -m coverage run --source=lightning -m pytest -vv --durations=0 --timeout 60 ./
 
 echo "--- Running standalone PL tests ---"
-PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
+PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash ../run_standalone_tests.sh "."
 
 echo "--- Generating coverage ---"
 python3 -m coverage xml
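The batching behaviour of tests/run_standalone_tests.sh is unchanged: collected tests run in background batches sized by PL_STANDALONE_TESTS_BATCH_SIZE (the TPU scripts above pin it to 1), each with MASTER_PORT fixed to 29500 + i % batch_size so concurrently launched distributed tests do not race for a port. Throttling the suite on a small GPU box is therefore just (sketch, assumes a CUDA machine):

    cd tests/tests_pytorch
    export PL_RUN_CUDA_TESTS=1
    PL_STANDALONE_TESTS_BATCH_SIZE=2 bash ../run_standalone_tests.sh "."
    # with a batch size of 2, jobs alternate between ports 29500 and 29501
    cat standalone_test_output.txt   # the buffered per-test output collects here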