ci/tests: cleaning standalone script (#19141)
* tests: cleaning standalone script

* switch

* from tests

* -m

* collect

* array

* tests_fabric/

* ..

* path prefix

* pl

* cleaning

* test_pytorch_profiler_nested_emit_nvtx

* Apply suggestions from code review

* Apply suggestions from code review

* todo
Borda authored Dec 13, 2023
1 parent 2e77862 commit 2a8789e
Showing 12 changed files with 99 additions and 107 deletions.
18 changes: 8 additions & 10 deletions .azure/gpu-tests-fabric.yml
@@ -49,6 +49,7 @@ jobs:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
PIP_CACHE_DIR: "/var/tmp/pip"
PL_RUN_CUDA_TESTS: "1"
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
@@ -126,19 +127,16 @@ jobs:
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
displayName: "Adjust tests & examples"
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
workingDirectory: tests/tests_fabric/
displayName: "Testing: fabric standard"
timeoutInMinutes: "10"

- bash: bash run_standalone_tests.sh
workingDirectory: tests/tests_fabric
- bash: bash ../run_standalone_tests.sh "."
workingDirectory: tests/tests_fabric/
env:
PL_RUN_CUDA_TESTS: "1"
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
displayName: "Testing: fabric standalone tests"
displayName: "Testing: fabric standalone"
timeoutInMinutes: "10"

- bash: |
@@ -152,12 +150,12 @@ jobs:
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
ls -l
workingDirectory: tests/tests_fabric
workingDirectory: tests/tests_fabric/
displayName: "Statistics"
- script: |
set -e
bash run_fabric_examples.sh --accelerator=cuda --devices=1
bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
workingDirectory: examples
workingDirectory: examples/
displayName: "Testing: fabric examples"
7 changes: 2 additions & 5 deletions .azure/gpu-tests-pytorch.yml
@@ -59,6 +59,7 @@ jobs:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
PIP_CACHE_DIR: "/var/tmp/pip"
PL_RUN_CUDA_TESTS: "1"
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
@@ -154,16 +155,13 @@ jobs:
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_pytorch
env:
PL_RUN_CUDA_TESTS: "1"
displayName: "Testing: PyTorch standard"
timeoutInMinutes: "35"

- bash: bash run_standalone_tests.sh
- bash: bash ../run_standalone_tests.sh "."
workingDirectory: tests/tests_pytorch
env:
PL_USE_MOCKED_MNIST: "1"
PL_RUN_CUDA_TESTS: "1"
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
displayName: "Testing: PyTorch standalone tests"
timeoutInMinutes: "35"
@@ -172,7 +170,6 @@
workingDirectory: tests/tests_pytorch
env:
PL_USE_MOCKED_MNIST: "1"
PL_RUN_CUDA_TESTS: "1"
displayName: "Testing: PyTorch standalone tasks"
timeoutInMinutes: "10"

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -269,7 +269,7 @@ addopts = [
"--ignore=legacy/checkpoints",
]
markers = [
"cloud:Run the cloud tests for example",
"cloud: Run the cloud tests for example",
]
filterwarnings = [
"error::FutureWarning",
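
For context, the marker registered above follows pytest's "name: description" format, so the name before the colon is what tests reference. A minimal illustration of how a registered marker is applied and selected; the test name and body below are placeholders, not code from this repository:

import pytest

@pytest.mark.cloud  # the "cloud" marker registered in pyproject.toml
def test_cloud_example():
    # placeholder body; real cloud tests would exercise remote resources
    assert True

# select only the cloud-marked tests:
#   pytest -m cloud
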
4 changes: 2 additions & 2 deletions src/lightning/pytorch/utilities/testing/_runif.py
@@ -16,7 +16,7 @@
from lightning_utilities.core.imports import RequirementCache

from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
from lightning.fabric.utilities.testing import _runif_reasons as FabricRunIf
from lightning.fabric.utilities.testing import _runif_reasons as fabric_run_if
from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
from lightning.pytorch.core.module import _ONNX_AVAILABLE
@@ -68,7 +68,7 @@ def _runif_reasons(
"""

reasons, kwargs = FabricRunIf(
reasons, kwargs = fabric_run_if(
min_cuda_gpus=min_cuda_gpus,
min_torch=min_torch,
max_torch=max_torch,
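
To make the (reasons, kwargs) contract above concrete, here is a hedged, self-contained sketch of how such reasons typically feed a RunIf-style skipif marker. The stub helper, the condition names, and the forwarded kwarg are illustrative, not the actual Lightning implementation:

import pytest

def _runif_reasons_stub(min_cuda_gpus: int = 0, standalone: bool = False):
    # stand-in for the real _runif_reasons: pretend no GPU is available
    reasons = []
    marker_kwargs = {}
    if min_cuda_gpus > 0:
        reasons.append(f"GPUs>={min_cuda_gpus}")
    if standalone:
        marker_kwargs["standalone"] = True  # forwarded so conftest filtering can read it
    return reasons, marker_kwargs

def RunIfStub(**conditions):
    reasons, marker_kwargs = _runif_reasons_stub(**conditions)
    return pytest.mark.skipif(
        condition=len(reasons) > 0,
        reason=f"Requires: [{' + '.join(reasons)}]",
        **marker_kwargs,
    )

@RunIfStub(min_cuda_gpus=1, standalone=True)
def test_needs_gpu_and_standalone():
    assert True
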
@@ -23,25 +23,19 @@ source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}"
# this environment variable allows special tests to run
export PL_RUN_STANDALONE_TESTS=1
# python arguments
defaults="-m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120"
defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 "
echo "Using defaults: ${defaults}"

# find tests marked as `@RunIf(standalone=True)`. done manually instead of with pytest because it is faster
grep_output=$(grep --recursive --word-regexp . --regexp 'standalone=True' --include '*.py')
# get the testing location as the first argument
test_path=$1
printf "source path: $test_path\n"

# file paths, remove duplicates
files=$(echo "$grep_output" | cut -f1 -d: | sort | uniq)

# get the list of parametrizations. we need to call them separately. the last two lines are removed.
# note: if there's a syntax error, this will fail with some garbled output
if [[ "$OSTYPE" == "darwin"* ]]; then
parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | tail -r | sed -e '1,3d' | tail -r)
else
parametrizations=$(python3 -m pytest $files --collect-only --quiet "$@" | head -n -2)
fi
# remove the "tests/tests_pytorch/" path suffixes
path_suffix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
parametrizations=${parametrizations//$path_suffix/}
# collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS
standalone_tests=$(python -m pytest $test_path -q --collect-only --pythonwarnings ignore)
printf "Collected tests: \n $standalone_tests"
# match only lines with tests
parametrizations=$(grep -oP '\S+::test_\S+' <<< "$standalone_tests")
# convert the list to an array
parametrizations_arr=($parametrizations)

report=''
@@ -61,30 +55,25 @@ function show_batched_output {
}
trap show_batched_output EXIT # show the output on exit

# remove the "tests/tests_pytorch/" path suffixes
path_prefix=$(basename "$(dirname "$(pwd)")")/$(basename "$(pwd)")"/" # https://stackoverflow.com/a/8223345
for i in "${!parametrizations_arr[@]}"; do
parametrization=${parametrizations_arr[$i]}
parametrization=${parametrizations_arr[$i]//$path_prefix/}
prefix="$((i+1))/${#parametrizations_arr[@]}"

# check blocklist
if [[ "${parametrization}" == *"test_pytorch_profiler_nested_emit_nvtx"* ]]; then
echo "$prefix: Skipping $parametrization"
report+="Skipped\t$parametrization\n"
# do not continue the loop because we might need to wait for batched jobs
else
echo "$prefix: Running $parametrization"
echo "$prefix: Running $parametrization"

# fix the port to avoid race condition when batched distributed tests select the port randomly
export MASTER_PORT=$((29500 + $i % $test_batch_size))
# fix the port to avoid race condition when batched distributed tests select the port randomly
export MASTER_PORT=$((29500 + $i % $test_batch_size))

# execute the test in the background
# redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
# output to std{out,err} because the outputs would be garbled together
python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
# save the PID in an array
pids[${i}]=$!
# add row to the final report
report+="Ran\t$parametrization\n"
fi
# execute the test in the background
# redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
# output to std{out,err} because the outputs would be garbled together
python ${defaults} "$parametrization" &>> standalone_test_output.txt &
# save the PID in an array
pids[${i}]=$!
# add row to the final report
report+="Ran\t$parametrization\n"

if ((($i + 1) % $test_batch_size == 0)); then
# wait for running tests
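
The new collection step above replaces the old head/tail output parsing: pytest collects node ids for the given path and a regex keeps only the lines that look like tests. A rough Python equivalent, for illustration only; the path, the environment handling, and the printed summary are assumptions, not part of the shared script:

import os
import re
import subprocess

test_path = "."  # the script receives the test location as its first argument
env = {**os.environ, "PL_RUN_STANDALONE_TESTS": "1"}  # so only standalone-marked tests are collected

out = subprocess.run(
    ["python", "-m", "pytest", test_path, "-q", "--collect-only", "--pythonwarnings", "ignore"],
    capture_output=True, text=True, env=env, check=False,
).stdout

# keep only node ids such as "utilities/test_foo.py::test_bar[param0]"
node_ids = re.findall(r"\S+::test_\S+", out)

# strip the repository path prefix, mirroring the loop in the script
path_prefix = "tests/tests_pytorch/"
node_ids = [nid.replace(path_prefix, "") for nid in node_ids]

print(f"collected {len(node_ids)} standalone tests")
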
33 changes: 17 additions & 16 deletions tests/tests_fabric/conftest.py
@@ -192,22 +192,23 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C

for kwarg, env_var in options.items():
# this will compute the intersection of all tests selected per environment variable
if os.getenv(env_var, "0") == "1":
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1
if os.getenv(env_var, "0") != "1":
continue
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1

if config.option.verbose >= 0 and (filtered or skipped):
writer = config.get_terminal_writer()
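
The restructured hook above swaps the nested `if` for an early `continue`, but the behaviour is unchanged: each enabled environment variable prunes the collected items, so with several variables set only tests whose RunIf marker carries all of the matching kwargs survive. A self-contained sketch of that intersection behaviour, using fake marker objects instead of real pytest items and omitting the skip-marker branch:

import os
from types import SimpleNamespace

def make_item(name, **marker_kwargs):
    # fake a pytest item carrying a single skipif marker with the given kwargs
    marker = SimpleNamespace(name="skipif", kwargs=marker_kwargs)
    return SimpleNamespace(name=name, own_markers=[marker])

items = [
    make_item("test_cpu_only"),
    make_item("test_cuda", min_cuda_gpus=True),
    make_item("test_cuda_standalone", min_cuda_gpus=True, standalone=True),
]

os.environ["PL_RUN_CUDA_TESTS"] = "1"
os.environ["PL_RUN_STANDALONE_TESTS"] = "1"
options = {"min_cuda_gpus": "PL_RUN_CUDA_TESTS", "standalone": "PL_RUN_STANDALONE_TESTS"}

for kwarg, env_var in options.items():
    if os.getenv(env_var, "0") != "1":
        continue  # this environment is not enabled; leave the items untouched
    for i, test in reversed(list(enumerate(items))):  # pop from the end while iterating
        if not any(m.name == "skipif" and m.kwargs.get(kwarg) for m in test.own_markers):
            items.pop(i)

print([t.name for t in items])  # -> ['test_cuda_standalone']
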
1 change: 0 additions & 1 deletion tests/tests_fabric/run_standalone_tests.sh

This file was deleted.

2 changes: 1 addition & 1 deletion tests/tests_fabric/run_tpu_tests.sh
@@ -35,7 +35,7 @@ cd tests/tests_fabric
PL_RUN_TPU_TESTS=1 python3 -m coverage run --source=lightning -m pytest -vv --durations=0 --timeout 60 ./

echo "--- Running standalone Fabric tests ---"
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash ../run_standalone_tests.sh "."

echo "--- Generating coverage ---"
python3 -m coverage xml
33 changes: 17 additions & 16 deletions tests/tests_pytorch/conftest.py
@@ -318,22 +318,23 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C

for kwarg, env_var in options.items():
# this will compute the intersection of all tests selected per environment variable
if os.getenv(env_var, "0") == "1":
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1
if os.getenv(env_var, "0") != "1":
continue
conditions.append(env_var)
for i, test in reversed(list(enumerate(items))): # loop in reverse, since we are going to pop items
already_skipped = any(marker.name == "skip" for marker in test.own_markers)
if already_skipped:
# the test was going to be skipped anyway, filter it out
items.pop(i)
skipped += 1
continue
has_runif_with_kwarg = any(
marker.name == "skipif" and marker.kwargs.get(kwarg) for marker in test.own_markers
)
if not has_runif_with_kwarg:
# the test has `@RunIf(kwarg=True)`, filter it out
items.pop(i)
filtered += 1

if config.option.verbose >= 0 and (filtered or skipped):
writer = config.get_terminal_writer()
30 changes: 16 additions & 14 deletions tests/tests_pytorch/profilers/test_profiler.py
@@ -471,20 +471,22 @@ def look_for_trace(trace_dir):
assert look_for_trace(tmpdir / "lightning_logs" / "version_0")


@RunIf(min_cuda_gpus=1, standalone=True)
def test_pytorch_profiler_nested_emit_nvtx():
"""This test check emit_nvtx is correctly supported."""
profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
model = BoringModel()
trainer = Trainer(
fast_dev_run=True,
profiler=profiler,
accelerator="gpu",
devices=1,
enable_progress_bar=False,
enable_model_summary=False,
)
trainer.fit(model)
# TODO: this test has not been running, as all our CI GPU runners have a higher compute capability
# @RunIf(min_cuda_gpus=1, standalone=True)
# @pytest.mark.skipif(torch.cuda.get_device_capability()[0] >= 8)
# def test_pytorch_profiler_nested_emit_nvtx():
# """This test check emit_nvtx is correctly supported."""
# profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
# model = BoringModel()
# trainer = Trainer(
# fast_dev_run=True,
# profiler=profiler,
# accelerator="gpu",
# devices=1,
# enable_progress_bar=False,
# enable_model_summary=False,
# )
# trainer.fit(model)


def test_register_record_function(tmpdir):
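
The TODO above points at gating this test on GPU compute capability, since emit_nvtx/nvprof profiling is only meaningful on pre-Ampere devices. A hedged sketch of such a guard; the marker name and test body are illustrative, and the threshold of 8 simply follows the commented-out condition above:

import pytest
import torch

requires_nvprof_capable_gpu = pytest.mark.skipif(
    not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] >= 8,
    reason="requires a CUDA GPU with compute capability < 8 (nvprof/emit_nvtx)",
)

@requires_nvprof_capable_gpu
def test_emit_nvtx_capability_guard():
    # placeholder assertion; a real test would run the profiler with emit_nvtx=True
    assert torch.cuda.get_device_capability()[0] < 8
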
15 changes: 10 additions & 5 deletions tests/tests_pytorch/run_standalone_tasks.sh
@@ -18,11 +18,16 @@ set -e
# this environment variable allows special tests to run
export PL_RUN_STANDALONE_TESTS=1

can_run_nvprof=$(python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)")
if [[ $can_run_nvprof == "True" ]]; then
echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx"
nvprof --profile-from-start off -o trace_name.prof -- python -m coverage run --source lightning.pytorch --append -m pytest --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
fi
#can_run_nvprof=$(python -c "import torch; print(torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8)")
#if [[ $can_run_nvprof == "True" ]]; then
# echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx"
# nvprof --profile-from-start off \
# -o trace_name.prof \
# -- python -m coverage run \
# --source lightning.pytorch \
# --append -m pytest \
# --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
#fi

# test that a user can manually launch individual processes
echo "Running manual ddp launch test"
2 changes: 1 addition & 1 deletion tests/tests_pytorch/run_tpu_tests.sh
@@ -34,7 +34,7 @@ cd tests/tests_pytorch
PL_RUN_TPU_TESTS=1 python3 -m coverage run --source=lightning -m pytest -vv --durations=0 --timeout 60 ./

echo "--- Running standalone PL tests ---"
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
PL_RUN_TPU_TESTS=1 PL_STANDALONE_TESTS_BATCH_SIZE=1 bash ../run_standalone_tests.sh "."

echo "--- Generating coverage ---"
python3 -m coverage xml
