diff --git a/scripts/task_run_unit_tests.sh b/scripts/task_run_unit_tests.sh
new file mode 100755
index 0000000000..6b8d25286b
--- /dev/null
+++ b/scripts/task_run_unit_tests.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Source test environment setup (handles package overrides like TVM-FFI)
+source "${SCRIPT_DIR}/setup_test_env.sh"
+
+# Source common test functions
+# shellcheck disable=SC1091 # File exists, checked separately
+source "${SCRIPT_DIR}/test_utils.sh"
+
+# Find and filter test files based on pytest.ini exclusions
+find_test_files() {
+    echo "Reading pytest.ini for excluded directories..."
+    EXCLUDED_DIRS=""
+    if [ -f "./pytest.ini" ]; then
+        # Extract norecursedirs from pytest.ini and convert to a space-separated list
+        NORECURSEDIRS=$(grep "^norecursedirs" ./pytest.ini | sed 's/norecursedirs\s*=\s*//' | sed 's/#.*//')
+        if [ -n "$NORECURSEDIRS" ]; then
+            EXCLUDED_DIRS=$(echo "$NORECURSEDIRS" | tr ',' ' ' | tr -s ' ')
+            echo "⚠️ WARNING: Excluding directories from pytest.ini: $EXCLUDED_DIRS"
+            echo ""
+        fi
+    fi
+
+    echo "Finding all test_*.py files in tests/ directory..."
+
+    # Find all test_*.py files
+    ALL_TEST_FILES=$(find tests/ -name "test_*.py" -type f | sort)
+
+    # Filter out excluded files based on directory exclusions
+    TEST_FILES=""
+    for test_file in $ALL_TEST_FILES; do
+        exclude_file=false
+        test_dir=$(dirname "$test_file")
+
+        for excluded_dir in $EXCLUDED_DIRS; do
+            excluded_dir=$(echo "$excluded_dir" | xargs) # trim whitespace
+            if [ -n "$excluded_dir" ]; then
+                # Check if this file's directory should be excluded
+                if [[ "$test_dir" == *"/$excluded_dir" ]] || [[ "$test_dir" == "tests/$excluded_dir" ]] || [[ "$test_dir" == *"/$excluded_dir/"* ]]; then
+                    exclude_file=true
+                    break
+                fi
+            fi
+        done
+
+        if [ "$exclude_file" = false ]; then
+            TEST_FILES="$TEST_FILES $test_file"
+        fi
+    done
+
+    # Clean up whitespace
+    TEST_FILES=$(echo "$TEST_FILES" | xargs)
+
+    if [ -z "$TEST_FILES" ]; then
+        echo "No test files found in tests/ directory (after exclusions)"
+        exit 1
+    fi
+
+    echo "Found test files:"
+    for test_file in $TEST_FILES; do
+        echo " $test_file"
+    done
+    echo ""
+}
+
+# Main execution
+main() {
+    # Parse command line arguments
+    parse_args "$@"
+
+    # Print test mode banner
+    print_test_mode_banner
+
+    # Install and verify (includes precompiled kernels)
+    install_and_verify
+
+    # Find test files (unique to unit tests - auto-discovery)
+    find_test_files
+
+    # Execute tests or dry run
+    if [ "$DRY_RUN" == "true" ]; then
+        execute_dry_run "$TEST_FILES"
+    else
+        execute_tests "$TEST_FILES"
+    fi
+
+    exit "$EXIT_CODE"
+}
+
+main "$@"
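
For reference, the norecursedirs parsing in find_test_files above is a plain-text pipeline (the \s class assumes GNU sed). A minimal sketch of its behavior on a hypothetical pytest.ini entry:

    $ echo "norecursedirs = 3rdparty, build # vendored" \
        | grep "^norecursedirs" | sed 's/norecursedirs\s*=\s*//' | sed 's/#.*//' \
        | tr ',' ' ' | tr -s ' '
    3rdparty build

Any leftover whitespace is trimmed with xargs before the per-directory matching loop runs.
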
cache..." -find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true -find . -type f -name '*.pyc' -delete 2>/dev/null || true -echo "Cache cleaned." -echo "" - -# Pytest configuration flags -PYTEST_FLAGS="--continue-on-collection-errors -s" - -# Parse command line arguments -DRY_RUN=false -SANITY_TEST=false -for arg in "$@"; do - case $arg in - --dry-run) - DRY_RUN=true - ;; - --sanity-test) - SANITY_TEST=true - ;; - esac -done - -if [ "$DRY_RUN" = "true" ]; then - echo "🔍 DRY RUN MODE - No tests will be executed" - echo "" -fi - -if [ "$SANITY_TEST" = "true" ]; then - echo "🔬 SANITY TEST MODE - Running every ${SAMPLE_RATE}th test (~$((100 / SAMPLE_RATE))% coverage)" - echo " Sampling pattern: offset=${SAMPLE_OFFSET} (tests #${SAMPLE_OFFSET}, #$((SAMPLE_OFFSET + SAMPLE_RATE)), #$((SAMPLE_OFFSET + SAMPLE_RATE * 2))...)" - echo "" -else - echo "📋 FULL TEST MODE - Running all tests from each test file" - echo "" -fi - -if [ "$DRY_RUN" != "true" ]; then - echo "Using CUDA version: ${CUDA_VERSION}" - echo "" - - # Install precompiled kernels (require CI build artifacts) - JIT_ARCH_EFFECTIVE="" - # Map CUDA_VERSION to CUDA_STREAM for artifact lookup - if [[ "${CUDA_VERSION}" == cu* ]]; then - CUDA_STREAM="${CUDA_VERSION}" - elif [ "${CUDA_VERSION}" = "12.9.0" ]; then - CUDA_STREAM="cu129" - else - CUDA_STREAM="cu130" - fi - echo "Using CUDA stream: ${CUDA_STREAM}" - echo "" - if [ -n "${JIT_ARCH}" ]; then - # 12.0a for CUDA 12.9.0, 12.0f for CUDA 13.0.0 - if [ "${JIT_ARCH}" = "12.0" ]; then - if [ "${CUDA_STREAM}" = "cu129" ]; then - JIT_ARCH_EFFECTIVE="12.0a" - else - JIT_ARCH_EFFECTIVE="12.0f" - fi - else - JIT_ARCH_EFFECTIVE="${JIT_ARCH}" - fi - - echo "Using JIT_ARCH from environment: ${JIT_ARCH_EFFECTIVE}" - DIST_CUBIN_DIR="../dist/${CUDA_STREAM}/${JIT_ARCH_EFFECTIVE}/cubin" - DIST_JIT_CACHE_DIR="../dist/${CUDA_STREAM}/${JIT_ARCH_EFFECTIVE}/jit-cache" - - echo "==== Debug: listing artifact directories ====" - echo "Tree under ../dist:" - (cd .. && ls -al dist) || true - echo "" - echo "Tree under ../dist/${CUDA_STREAM}:" - (cd .. && ls -al "dist/${CUDA_STREAM}") || true - echo "" - echo "Contents of ${DIST_CUBIN_DIR}:" - ls -al "${DIST_CUBIN_DIR}" || true - echo "" - echo "Contents of ${DIST_JIT_CACHE_DIR}:" - ls -al "${DIST_JIT_CACHE_DIR}" || true - echo "=============================================" - - if [ -d "${DIST_CUBIN_DIR}" ] && ls "${DIST_CUBIN_DIR}"/*.whl >/dev/null 2>&1; then - echo "Installing flashinfer-cubin from ${DIST_CUBIN_DIR} ..." - pip install -q "${DIST_CUBIN_DIR}"/*.whl - else - echo "ERROR: flashinfer-cubin wheel not found in ${DIST_CUBIN_DIR}. Ensure the CI build stage produced the artifact." >&2 - fi - - if [ -d "${DIST_JIT_CACHE_DIR}" ] && ls "${DIST_JIT_CACHE_DIR}"/*.whl >/dev/null 2>&1; then - echo "Installing flashinfer-jit-cache from ${DIST_JIT_CACHE_DIR} ..." - pip install -q "${DIST_JIT_CACHE_DIR}"/*.whl - else - echo "ERROR: flashinfer-jit-cache wheel not found in ${DIST_JIT_CACHE_DIR} for ${CUDA_VERSION}. Ensure the CI build stage produced the artifact." >&2 - fi - echo "" - fi - - # Install local python sources - pip install -e . -v --no-deps - echo "" - - # Verify installation - echo "Verifying installation..." - (cd /tmp && python -m flashinfer show-config) - echo "" -fi - -EXIT_CODE=0 - -echo "Reading pytest.ini for excluded directories..." 
-EXCLUDED_DIRS="" -if [ -f "./pytest.ini" ]; then - # Extract norecursedirs from pytest.ini and convert to array - NORECURSEDIRS=$(grep "^norecursedirs" ./pytest.ini | sed 's/norecursedirs\s*=\s*//' | sed 's/#.*//') - if [ -n "$NORECURSEDIRS" ]; then - EXCLUDED_DIRS=$(echo "$NORECURSEDIRS" | tr ',' ' ' | tr -s ' ') - echo "⚠️ WARNING: Excluding directories from pytest.ini: $EXCLUDED_DIRS" - echo "" - fi -fi - -echo "Finding all test_*.py files in tests/ directory..." - -# Find all test_*.py files -ALL_TEST_FILES=$(find tests/ -name "test_*.py" -type f | sort) - -# Filter out excluded files based on directory exclusions -TEST_FILES="" -for test_file in $ALL_TEST_FILES; do - exclude_file=false - test_dir=$(dirname "$test_file") - - for excluded_dir in $EXCLUDED_DIRS; do - excluded_dir=$(echo "$excluded_dir" | xargs) # trim whitespace - if [ -n "$excluded_dir" ]; then - # Check if this file's directory should be excluded - if [[ "$test_dir" == *"/$excluded_dir" ]] || [[ "$test_dir" == "tests/$excluded_dir" ]] || [[ "$test_dir" == *"/$excluded_dir/"* ]]; then - exclude_file=true - break - fi - fi - done - - if [ "$exclude_file" = false ]; then - TEST_FILES="$TEST_FILES $test_file" - fi -done - -# Clean up whitespace -TEST_FILES=$(echo "$TEST_FILES" | xargs) - -if [ -z "$TEST_FILES" ]; then - echo "No test files found in tests/ directory (after exclusions)" - exit 1 -fi - -echo "Found test files:" -for test_file in $TEST_FILES; do - echo " $test_file" -done -echo "" - -FAILED_TESTS="" -TOTAL_TESTS=0 -PASSED_TESTS=0 -TOTAL_TEST_CASES=0 -SAMPLED_TEST_CASES=0 - -if [ "$DRY_RUN" == "true" ]; then - echo "==========================================" - echo "DRY RUN: Tests that would be executed" - echo "==========================================" - - if [ "$SANITY_TEST" == "true" ]; then - # Sanity test mode - show sampling details - FILE_COUNT=0 - for test_file in $TEST_FILES; do - FILE_COUNT=$((FILE_COUNT + 1)) - - echo "" - echo "[$FILE_COUNT] Collecting tests from: $test_file" - - # Temporarily disable exit on error for collection - set +e - COLLECTION_OUTPUT=$(pytest --collect-only -q "$test_file" 2>&1) - COLLECTION_EXIT_CODE=$? - set -e - - ALL_NODE_IDS=$(echo "$COLLECTION_OUTPUT" | grep "::" || true) - - if [ -z "$ALL_NODE_IDS" ]; then - if [ $COLLECTION_EXIT_CODE -ne 0 ]; then - echo " ⚠️ Collection failed for $test_file (skipping)" - else - echo " ⚠️ No tests found in $test_file" - fi - continue - fi - - # Count total tests - TOTAL_IN_FILE=$(echo "$ALL_NODE_IDS" | wc -l) - TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + TOTAL_IN_FILE)) - - # Sample every Nth test with random offset - SAMPLED_NODE_IDS=$(echo "$ALL_NODE_IDS" | awk "NR % $SAMPLE_RATE == $SAMPLE_OFFSET") - # Fallback: if no tests sampled (offset missed all tests), take the first test - if [ -z "$SAMPLED_NODE_IDS" ] || [ $(echo "$SAMPLED_NODE_IDS" | wc -l) -eq 0 ]; then - SAMPLED_NODE_IDS=$(echo "$ALL_NODE_IDS" | head -1) - fi - SAMPLED_IN_FILE=$(echo "$SAMPLED_NODE_IDS" | wc -l) - SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + SAMPLED_IN_FILE)) - - echo " Total test cases: $TOTAL_IN_FILE" - echo " Sampled test cases: $SAMPLED_IN_FILE (every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET})" - echo " Sample of tests that would run:" - echo "$SAMPLED_NODE_IDS" | head -5 | sed 's/^/ /' || true - if [ "$SAMPLED_IN_FILE" -gt 5 ]; then - echo " ... 
and $((SAMPLED_IN_FILE - 5)) more" - fi - done - - echo "" - echo "==========================================" - echo "DRY RUN SUMMARY (SANITY MODE)" - echo "==========================================" - echo "Total test files: $FILE_COUNT" - echo "Total test cases (full suite): $TOTAL_TEST_CASES" - echo "Sampled test cases (sanity): $SAMPLED_TEST_CASES" - if [ "$TOTAL_TEST_CASES" -gt 0 ]; then - echo "Coverage: ~$((SAMPLED_TEST_CASES * 100 / TOTAL_TEST_CASES))%" - else - echo "Coverage: N/A (no tests collected)" - fi - echo "Sample rate: every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET}" - echo "" - echo "To reproduce this exact run:" - echo " SAMPLE_RATE=${SAMPLE_RATE} SAMPLE_OFFSET=${SAMPLE_OFFSET} $0 --sanity-test" - else - # Full test mode - for test_file in $TEST_FILES; do - TOTAL_TESTS=$((TOTAL_TESTS + 1)) - JUNIT_FILENAME="${test_file//\//_}.xml" - JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}" - echo "$TOTAL_TESTS. pytest $PYTEST_FLAGS ${JUNIT_FLAG} \"${test_file}\"" - done - - echo "" - echo "==========================================" - echo "DRY RUN SUMMARY" - echo "==========================================" - echo "Total test files that would be executed: $TOTAL_TESTS" - fi - - echo "" - echo "To actually run the tests, execute without --dry-run:" - if [ "$SANITY_TEST" == "true" ]; then - echo " $0 --sanity-test" - echo "" - echo "To reproduce this exact sampling pattern:" - echo " SAMPLE_RATE=${SAMPLE_RATE} SAMPLE_OFFSET=${SAMPLE_OFFSET} $0 --sanity-test" - else - echo " $0" - fi -else - mkdir -p "${JUNIT_DIR}" - - if [ "$SANITY_TEST" == "true" ]; then - # Sanity test mode - sample tests from each file - FILE_COUNT=0 - - for test_file in $TEST_FILES; do - FILE_COUNT=$((FILE_COUNT + 1)) - - echo "==========================================" - echo "[$FILE_COUNT] Processing: $test_file" - echo "==========================================" - - # Collect all test node IDs for this file - echo "Collecting test cases..." - - # Temporarily disable exit on error for collection - set +e - COLLECTION_OUTPUT=$(pytest --collect-only -q "$test_file" 2>&1) - COLLECTION_EXIT_CODE=$? 
- set -e - - ALL_NODE_IDS=$(echo "$COLLECTION_OUTPUT" | grep "::" || true) - - if [ -z "$ALL_NODE_IDS" ]; then - if [ $COLLECTION_EXIT_CODE -ne 0 ]; then - echo "⚠️ Collection failed for $test_file (skipping)" - else - echo "⚠️ No tests found in $test_file" - fi - echo "" - continue - fi - - # Count total tests - TOTAL_IN_FILE=$(echo "$ALL_NODE_IDS" | wc -l) - TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + TOTAL_IN_FILE)) - - # Sample every Nth test with random offset - SAMPLED_NODE_IDS=$(echo "$ALL_NODE_IDS" | awk "NR % $SAMPLE_RATE == $SAMPLE_OFFSET") - # Fallback: if no tests sampled (offset missed all tests), take the first test - if [ -z "$SAMPLED_NODE_IDS" ] || [ $(echo "$SAMPLED_NODE_IDS" | wc -l) -eq 0 ]; then - SAMPLED_NODE_IDS=$(echo "$ALL_NODE_IDS" | head -1) - fi - SAMPLED_IN_FILE=$(echo "$SAMPLED_NODE_IDS" | wc -l) - SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + SAMPLED_IN_FILE)) - - echo "Total test cases in file: $TOTAL_IN_FILE" - echo "Running sampled test cases: $SAMPLED_IN_FILE (every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET})" - - if [ "$SAMPLED_IN_FILE" -eq 0 ]; then - echo "⚠️ No tests sampled from $test_file, skipping" - echo "" - continue - fi - - # Create a bash array with the node IDs - mapfile -t SAMPLED_NODE_IDS_ARRAY <<< "$SAMPLED_NODE_IDS" - - JUNIT_FILENAME="${test_file//\//_}.xml" - JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}" - - # Run pytest with the sampled node IDs - TOTAL_TESTS=$((TOTAL_TESTS + 1)) - - if pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${SAMPLED_NODE_IDS_ARRAY[@]}"; then - echo "✅ PASSED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)" - PASSED_TESTS=$((PASSED_TESTS + 1)) - else - echo "❌ FAILED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)" - FAILED_TESTS="$FAILED_TESTS\n - $test_file" - EXIT_CODE=1 - fi - - echo "" - done - - echo "==========================================" - echo "SANITY TEST SUMMARY" - echo "==========================================" - echo "Total test files executed: $TOTAL_TESTS" - echo "Test files passed: $PASSED_TESTS" - echo "Test files failed: $((TOTAL_TESTS - PASSED_TESTS))" - echo "" - echo "Total test cases (full suite): $TOTAL_TEST_CASES" - echo "Sampled test cases (executed): $SAMPLED_TEST_CASES" - if [ "$TOTAL_TEST_CASES" -gt 0 ]; then - echo "Coverage: ~$((SAMPLED_TEST_CASES * 100 / TOTAL_TEST_CASES))%" - else - echo "Coverage: N/A (no tests collected)" - fi - echo "Sample rate: every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET}" - echo "" - echo "To reproduce this exact run:" - echo " SAMPLE_RATE=${SAMPLE_RATE} SAMPLE_OFFSET=${SAMPLE_OFFSET} $0 --sanity-test" - - if [ -n "$FAILED_TESTS" ]; then - echo "" - echo "Failed test files:" - echo -e "$FAILED_TESTS" - fi - else - # Full test mode - run all tests in each file - for test_file in $TEST_FILES; do - echo "==========================================" - JUNIT_FILENAME="${test_file//\//_}.xml" - JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}" - echo "Running: pytest $PYTEST_FLAGS ${JUNIT_FLAG} \"${test_file}\"" - echo "==========================================" - - TOTAL_TESTS=$((TOTAL_TESTS + 1)) - - if pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${test_file}"; then - echo "✅ PASSED: $test_file" - PASSED_TESTS=$((PASSED_TESTS + 1)) - else - echo "❌ FAILED: $test_file" - FAILED_TESTS="$FAILED_TESTS\n - $test_file" - EXIT_CODE=1 - fi - - echo "" - done - - echo "==========================================" - echo "TEST SUMMARY" - echo "==========================================" - echo "Total test files executed: $TOTAL_TESTS" - echo "Passed: 
$PASSED_TESTS" - echo "Failed: $((TOTAL_TESTS - PASSED_TESTS))" - - if [ -n "$FAILED_TESTS" ]; then - echo "" - echo "Failed tests:" - echo -e "$FAILED_TESTS" - fi - fi -fi - -exit $EXIT_CODE diff --git a/scripts/task_test_multi_gpu_comm_kernels.sh b/scripts/task_test_multi_gpu_comm_kernels.sh new file mode 100755 index 0000000000..94f2761dc8 --- /dev/null +++ b/scripts/task_test_multi_gpu_comm_kernels.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +set -eo pipefail + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Source test environment setup (handles package overrides like TVM-FFI) +source "${SCRIPT_DIR}/setup_test_env.sh" + +# Set MPI command prefix for multi-GPU tests +: "${PYTEST_COMMAND_PREFIX:=mpirun -np 4}" + +# Disable sanity testing for multi-GPU tests (always run full suite) +# shellcheck disable=SC2034 # Used by common_test_functions.sh +DISABLE_SANITY_TEST=true + +# Source common test functions +# shellcheck disable=SC1091 # File exists, checked separately +source "${SCRIPT_DIR}/test_utils.sh" + +# Define the specific test files for multi-GPU comm tests (single-node) +# TEST_FILES="tests/comm/test_allreduce_unified_api.py tests/comm/test_allreduce_negative.py tests/comm/test_trtllm_allreduce_fusion.py" +# Add others back once they are fixed +TEST_FILES="tests/comm/test_allreduce_unified_api.py" + +# Main execution +main() { + # Parse command line arguments + parse_args "$@" + + # Print test mode banner + print_test_mode_banner + + # Install and verify (unless dry run) + install_and_verify + + # Print test files + echo "Multi-GPU comm kernel test files (running with: ${PYTEST_COMMAND_PREFIX}):" + for test_file in $TEST_FILES; do + echo " $test_file" + done + echo "" + + # Execute tests or dry run + if [ "$DRY_RUN" == "true" ]; then + execute_dry_run "$TEST_FILES" + else + execute_tests "$TEST_FILES" + fi + + exit "$EXIT_CODE" +} + +main "$@" diff --git a/scripts/task_test_multi_node_comm_kernels.sh b/scripts/task_test_multi_node_comm_kernels.sh old mode 100644 new mode 100755 index 9eae39d705..0655a28f1c --- a/scripts/task_test_multi_node_comm_kernels.sh +++ b/scripts/task_test_multi_node_comm_kernels.sh @@ -1,12 +1,11 @@ #!/bin/bash set -eo pipefail -set -x -: ${MAX_JOBS:=$(nproc)} -: ${CUDA_VISIBLE_DEVICES:=0} +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Source test environment setup (handles package overrides like TVM-FFI) -source "$(dirname "${BASH_SOURCE[0]}")/setup_test_env.sh" +source "${SCRIPT_DIR}/setup_test_env.sh" # Clean Python bytecode cache to avoid stale imports (e.g., after module refactoring) # echo "Cleaning Python bytecode cache..." @@ -15,8 +14,43 @@ source "$(dirname "${BASH_SOURCE[0]}")/setup_test_env.sh" # echo "Cache cleaned." # echo "" -# pip install -e . 
diff --git a/scripts/task_test_multi_node_comm_kernels.sh b/scripts/task_test_multi_node_comm_kernels.sh
old mode 100644
new mode 100755
index 9eae39d705..0655a28f1c
--- a/scripts/task_test_multi_node_comm_kernels.sh
+++ b/scripts/task_test_multi_node_comm_kernels.sh
@@ -1,12 +1,11 @@
 #!/bin/bash
 
 set -eo pipefail
-set -x
 
-: ${MAX_JOBS:=$(nproc)}
-: ${CUDA_VISIBLE_DEVICES:=0}
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 # Source test environment setup (handles package overrides like TVM-FFI)
-source "$(dirname "${BASH_SOURCE[0]}")/setup_test_env.sh"
+source "${SCRIPT_DIR}/setup_test_env.sh"
 # Clean Python bytecode cache to avoid stale imports (e.g., after module refactoring)
 # echo "Cleaning Python bytecode cache..."
@@ -15,8 +14,43 @@ source "$(dirname "${BASH_SOURCE[0]}")/setup_test_env.sh"
 # echo "Cache cleaned."
 # echo ""
 
-# pip install -e . -v
+# Disable sanity testing for multi-node tests (always run full suite)
+# shellcheck disable=SC2034 # Used by test_utils.sh
+DISABLE_SANITY_TEST=true
 
-pytest -s tests/comm/test_mnnvl_memory.py
-pytest -s tests/comm/test_trtllm_mnnvl_allreduce.py
-pytest -s tests/comm/test_mnnvl_moe_alltoall.py
+# Source common test functions
+# shellcheck disable=SC1091 # File exists, checked separately
+source "${SCRIPT_DIR}/test_utils.sh"
+
+# Define the specific test files for multi-node comm tests
+TEST_FILES="tests/comm/test_mnnvl_memory.py tests/comm/test_trtllm_mnnvl_allreduce.py tests/comm/test_mnnvl_moe_alltoall.py"
+
+# Main execution
+main() {
+    # Parse command line arguments
+    parse_args "$@"
+
+    # Print test mode banner
+    print_test_mode_banner
+
+    # Install and verify (unless dry run)
+    install_and_verify
+
+    # Print test files
+    echo "Multi-node comm kernel test files:"
+    for test_file in $TEST_FILES; do
+        echo " $test_file"
+    done
+    echo ""
+
+    # Execute tests or dry run
+    if [ "$DRY_RUN" == "true" ]; then
+        execute_dry_run "$TEST_FILES"
+    else
+        execute_tests "$TEST_FILES"
+    fi
+
+    exit "$EXIT_CODE"
+}
+
+main "$@"
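
Since these suites set DISABLE_SANITY_TEST=true, parse_args in test_utils.sh warns and falls back to the full suite when --sanity-test is passed. For example, both of these preview the same full test list:

    ./scripts/task_test_multi_node_comm_kernels.sh --dry-run
    ./scripts/task_test_multi_node_comm_kernels.sh --sanity-test --dry-run
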
"${CUDA_VERSION}" == cu* ]]; then + CUDA_STREAM="${CUDA_VERSION}" + elif [ "${CUDA_VERSION}" = "12.9.0" ]; then + CUDA_STREAM="cu129" + else + CUDA_STREAM="cu130" + fi + echo "Using CUDA stream: ${CUDA_STREAM}" + echo "" + + if [ -n "${JIT_ARCH}" ]; then + # 12.0a for CUDA 12.9.0, 12.0f for CUDA 13.0.0 + if [ "${JIT_ARCH}" = "12.0" ]; then + if [ "${CUDA_STREAM}" = "cu129" ]; then + JIT_ARCH_EFFECTIVE="12.0a" + else + JIT_ARCH_EFFECTIVE="12.0f" + fi + else + JIT_ARCH_EFFECTIVE="${JIT_ARCH}" + fi + + echo "Using JIT_ARCH from environment: ${JIT_ARCH_EFFECTIVE}" + DIST_CUBIN_DIR="../dist/${CUDA_STREAM}/${JIT_ARCH_EFFECTIVE}/cubin" + DIST_JIT_CACHE_DIR="../dist/${CUDA_STREAM}/${JIT_ARCH_EFFECTIVE}/jit-cache" + + echo "==== Debug: listing artifact directories ====" + echo "Tree under ../dist:" + (cd .. && ls -al dist) || true + echo "" + echo "Tree under ../dist/${CUDA_STREAM}:" + (cd .. && ls -al "dist/${CUDA_STREAM}") || true + echo "" + echo "Contents of ${DIST_CUBIN_DIR}:" + ls -al "${DIST_CUBIN_DIR}" || true + echo "" + echo "Contents of ${DIST_JIT_CACHE_DIR}:" + ls -al "${DIST_JIT_CACHE_DIR}" || true + echo "=============================================" + + if [ -d "${DIST_CUBIN_DIR}" ] && ls "${DIST_CUBIN_DIR}"/*.whl >/dev/null 2>&1; then + echo "Installing flashinfer-cubin from ${DIST_CUBIN_DIR} ..." + pip install -q "${DIST_CUBIN_DIR}"/*.whl + else + echo "ERROR: flashinfer-cubin wheel not found in ${DIST_CUBIN_DIR}. Ensure the CI build stage produced the artifact." >&2 + fi + + if [ -d "${DIST_JIT_CACHE_DIR}" ] && ls "${DIST_JIT_CACHE_DIR}"/*.whl >/dev/null 2>&1; then + echo "Installing flashinfer-jit-cache from ${DIST_JIT_CACHE_DIR} ..." + pip install -q "${DIST_JIT_CACHE_DIR}"/*.whl + else + echo "ERROR: flashinfer-jit-cache wheel not found in ${DIST_JIT_CACHE_DIR} for ${CUDA_VERSION}. Ensure the CI build stage produced the artifact." >&2 + fi + echo "" + fi +} + +# Install and verify FlashInfer +install_and_verify() { + if [ "$DRY_RUN" != "true" ]; then + echo "Using CUDA version: ${CUDA_VERSION}" + echo "" + + # Install precompiled kernels if enabled + install_precompiled_kernels + + # Install local python sources + pip install -e . -v --no-deps + echo "" + + # Verify installation + echo "Verifying installation..." + (cd /tmp && python -m flashinfer show-config) + echo "" + fi +} + +# Collect tests from a file +collect_tests() { + local test_file=$1 + + # Temporarily disable exit on error for collection + set +e + COLLECTION_OUTPUT=$(pytest --collect-only -q "$test_file" 2>&1) + COLLECTION_EXIT_CODE=$? 
+
+# Collect tests from a file
+collect_tests() {
+    local test_file=$1
+
+    # Temporarily disable exit on error for collection
+    set +e
+    COLLECTION_OUTPUT=$(pytest --collect-only -q "$test_file" 2>&1)
+    COLLECTION_EXIT_CODE=$?
+    set -e
+
+    ALL_NODE_IDS=$(echo "$COLLECTION_OUTPUT" | grep "::" || true)
+}
+
+# Sample tests based on SAMPLE_RATE and SAMPLE_OFFSET
+sample_tests() {
+    local all_node_ids=$1
+
+    # Sample every Nth test with random offset
+    SAMPLED_NODE_IDS=$(echo "$all_node_ids" | awk "NR % $SAMPLE_RATE == $SAMPLE_OFFSET")
+    # Fallback: if no tests sampled (offset missed all tests), take the first test
+    if [ -z "$SAMPLED_NODE_IDS" ] || [ "$(echo "$SAMPLED_NODE_IDS" | wc -l)" -eq 0 ]; then
+        SAMPLED_NODE_IDS=$(echo "$all_node_ids" | head -1)
+    fi
+}
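+
+# Worked example: with SAMPLE_RATE=5 and SAMPLE_OFFSET=2, the awk filter above
+# keeps collected tests #2, #7, #12, ... (awk's NR is 1-based); the head -1
+# fallback covers files where no line number matches (e.g., offset 0 with
+# fewer than SAMPLE_RATE tests collected).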
+
+# Process a single test file for dry run (sanity mode)
+dry_run_sanity_file() {
+    local test_file=$1
+    local file_count=$2
+
+    echo ""
+    echo "[$file_count] Collecting tests from: $test_file"
+
+    collect_tests "$test_file"
+
+    if [ -z "$ALL_NODE_IDS" ]; then
+        if [ $COLLECTION_EXIT_CODE -ne 0 ]; then
+            echo " ⚠️ Collection failed for $test_file (skipping)"
+        else
+            echo " ⚠️ No tests found in $test_file"
+        fi
+        return
+    fi
+
+    # Count total tests
+    TOTAL_IN_FILE=$(echo "$ALL_NODE_IDS" | wc -l)
+    TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + TOTAL_IN_FILE))
+
+    sample_tests "$ALL_NODE_IDS"
+    SAMPLED_IN_FILE=$(echo "$SAMPLED_NODE_IDS" | wc -l)
+    SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + SAMPLED_IN_FILE))
+
+    echo " Total test cases: $TOTAL_IN_FILE"
+    echo " Sampled test cases: $SAMPLED_IN_FILE (every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET})"
+    echo " Sample of tests that would run:"
+    echo "$SAMPLED_NODE_IDS" | head -5 | sed 's/^/ /' || true
+    if [ "$SAMPLED_IN_FILE" -gt 5 ]; then
+        echo " ... and $((SAMPLED_IN_FILE - 5)) more"
+    fi
+}
+
+# Process a single test file for dry run (full mode)
+dry_run_full_file() {
+    local test_file=$1
+
+    TOTAL_TESTS=$((TOTAL_TESTS + 1))
+    JUNIT_FILENAME="${test_file//\//_}.xml"
+    JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}"
+    # shellcheck disable=SC2086 # PYTEST_COMMAND_PREFIX needs word splitting
+    echo "$TOTAL_TESTS. ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS ${JUNIT_FLAG} \"${test_file}\""
+}
+
+# Print dry run summary
+print_dry_run_summary() {
+    if [ "$SANITY_TEST" == "true" ]; then
+        echo ""
+        echo "=========================================="
+        echo "DRY RUN SUMMARY (SANITY MODE)"
+        echo "=========================================="
+        echo "Total test files: $FILE_COUNT"
+        echo "Total test cases (full suite): $TOTAL_TEST_CASES"
+        echo "Sampled test cases (sanity): $SAMPLED_TEST_CASES"
+        if [ "$TOTAL_TEST_CASES" -gt 0 ]; then
+            echo "Coverage: ~$((SAMPLED_TEST_CASES * 100 / TOTAL_TEST_CASES))%"
+        else
+            echo "Coverage: N/A (no tests collected)"
+        fi
+        echo "Sample rate: every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET}"
+        echo ""
+        echo "To reproduce this exact run:"
+        echo " SAMPLE_RATE=${SAMPLE_RATE} SAMPLE_OFFSET=${SAMPLE_OFFSET} $0 --sanity-test"
+    else
+        echo ""
+        echo "=========================================="
+        echo "DRY RUN SUMMARY"
+        echo "=========================================="
+        echo "Total test files that would be executed: $TOTAL_TESTS"
+    fi
+
+    echo ""
+    echo "To actually run the tests, execute without --dry-run:"
+    if [ "$SANITY_TEST" == "true" ]; then
+        echo " $0 --sanity-test"
+        echo ""
+        echo "To reproduce this exact sampling pattern:"
+        echo " SAMPLE_RATE=${SAMPLE_RATE} SAMPLE_OFFSET=${SAMPLE_OFFSET} $0 --sanity-test"
+    else
+        echo " $0"
+    fi
+}
+
+# Run a single test file in sanity mode
+run_sanity_test_file() {
+    local test_file=$1
+    local file_count=$2
+
+    echo "=========================================="
+    echo "[$file_count] Processing: $test_file"
+    echo "=========================================="
+
+    echo "Collecting test cases..."
+
+    collect_tests "$test_file"
+
+    if [ -z "$ALL_NODE_IDS" ]; then
+        if [ $COLLECTION_EXIT_CODE -ne 0 ]; then
+            echo "⚠️ Collection failed for $test_file (skipping)"
+        else
+            echo "⚠️ No tests found in $test_file"
+        fi
+        echo ""
+        return
+    fi
+
+    # Count total tests
+    TOTAL_IN_FILE=$(echo "$ALL_NODE_IDS" | wc -l)
+    TOTAL_TEST_CASES=$((TOTAL_TEST_CASES + TOTAL_IN_FILE))
+
+    sample_tests "$ALL_NODE_IDS"
+    SAMPLED_IN_FILE=$(echo "$SAMPLED_NODE_IDS" | wc -l)
+    SAMPLED_TEST_CASES=$((SAMPLED_TEST_CASES + SAMPLED_IN_FILE))
+
+    echo "Total test cases in file: $TOTAL_IN_FILE"
+    echo "Running sampled test cases: $SAMPLED_IN_FILE (every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET})"
+
+    if [ "$SAMPLED_IN_FILE" -eq 0 ]; then
+        echo "⚠️ No tests sampled from $test_file, skipping"
+        echo ""
+        return
+    fi
+
+    # Create a bash array with the node IDs
+    mapfile -t SAMPLED_NODE_IDS_ARRAY <<< "$SAMPLED_NODE_IDS"
+
+    JUNIT_FILENAME="${test_file//\//_}.xml"
+    JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}"
+
+    # Run pytest with the sampled node IDs
+    TOTAL_TESTS=$((TOTAL_TESTS + 1))
+
+    # shellcheck disable=SC2086 # PYTEST_COMMAND_PREFIX and PYTEST_FLAGS need word splitting
+    if ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${SAMPLED_NODE_IDS_ARRAY[@]}"; then
+        echo "✅ PASSED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)"
+        PASSED_TESTS=$((PASSED_TESTS + 1))
+    else
+        echo "❌ FAILED: $test_file ($SAMPLED_IN_FILE/$TOTAL_IN_FILE tests)"
+        FAILED_TESTS="$FAILED_TESTS\n - $test_file"
+        # shellcheck disable=SC2034 # EXIT_CODE is used by calling scripts
+        EXIT_CODE=1
+    fi
+
+    echo ""
+}
+
+# Run a single test file in full mode
+run_full_test_file() {
+    local test_file=$1
+
+    echo "=========================================="
+    JUNIT_FILENAME="${test_file//\//_}.xml"
+    JUNIT_FLAG="--junitxml=${JUNIT_DIR}/${JUNIT_FILENAME}"
+    # shellcheck disable=SC2086 # PYTEST_COMMAND_PREFIX needs word splitting
+    echo "Running: ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS ${JUNIT_FLAG} \"${test_file}\""
+    echo "=========================================="
+
+    TOTAL_TESTS=$((TOTAL_TESTS + 1))
+
+    # shellcheck disable=SC2086 # PYTEST_COMMAND_PREFIX and PYTEST_FLAGS need word splitting
+    if ${PYTEST_COMMAND_PREFIX} pytest $PYTEST_FLAGS "${JUNIT_FLAG}" "${test_file}"; then
+        echo "✅ PASSED: $test_file"
+        PASSED_TESTS=$((PASSED_TESTS + 1))
+    else
+        echo "❌ FAILED: $test_file"
+        FAILED_TESTS="$FAILED_TESTS\n - $test_file"
+        # shellcheck disable=SC2034 # EXIT_CODE is used by calling scripts
+        EXIT_CODE=1
+    fi
+
+    echo ""
+}
+
+# Print execution summary
+print_execution_summary() {
+    if [ "$SANITY_TEST" == "true" ]; then
+        echo "=========================================="
+        echo "SANITY TEST SUMMARY"
+        echo "=========================================="
+        echo "Total test files executed: $TOTAL_TESTS"
+        echo "Test files passed: $PASSED_TESTS"
+        echo "Test files failed: $((TOTAL_TESTS - PASSED_TESTS))"
+        echo ""
+        echo "Total test cases (full suite): $TOTAL_TEST_CASES"
+        echo "Sampled test cases (executed): $SAMPLED_TEST_CASES"
+        if [ "$TOTAL_TEST_CASES" -gt 0 ]; then
+            echo "Coverage: ~$((SAMPLED_TEST_CASES * 100 / TOTAL_TEST_CASES))%"
+        else
+            echo "Coverage: N/A (no tests collected)"
+        fi
+        echo "Sample rate: every ${SAMPLE_RATE}th test, offset ${SAMPLE_OFFSET}"
+        echo ""
+        echo "To reproduce this exact run:"
+        echo " SAMPLE_RATE=${SAMPLE_RATE} SAMPLE_OFFSET=${SAMPLE_OFFSET} $0 --sanity-test"
+    else
+        echo "=========================================="
+        echo "TEST SUMMARY"
+        echo "=========================================="
+        echo "Total test files executed: $TOTAL_TESTS"
+        echo "Passed: $PASSED_TESTS"
+        echo "Failed: $((TOTAL_TESTS - PASSED_TESTS))"
+    fi
+
+    if [ -n "$FAILED_TESTS" ]; then
+        echo ""
+        echo "Failed test files:"
+        echo -e "$FAILED_TESTS"
+    fi
+}
+
+# Main execution function for dry run mode
+execute_dry_run() {
+    local test_files=$1
+
+    echo "=========================================="
+    echo "DRY RUN: Tests that would be executed"
+    echo "=========================================="
+
+    if [ "$SANITY_TEST" == "true" ]; then
+        FILE_COUNT=0
+        for test_file in $test_files; do
+            FILE_COUNT=$((FILE_COUNT + 1))
+            dry_run_sanity_file "$test_file" "$FILE_COUNT"
+        done
+    else
+        for test_file in $test_files; do
+            dry_run_full_file "$test_file"
+        done
+    fi
+
+    print_dry_run_summary
+}
+
+# Main execution function for actual test run
+execute_tests() {
+    local test_files=$1
+
+    mkdir -p "${JUNIT_DIR}"
+
+    if [ "$SANITY_TEST" == "true" ]; then
+        FILE_COUNT=0
+        for test_file in $test_files; do
+            FILE_COUNT=$((FILE_COUNT + 1))
+            run_sanity_test_file "$test_file" "$FILE_COUNT"
+        done
+    else
+        for test_file in $test_files; do
+            run_full_test_file "$test_file"
+        done
+    fi
+
+    print_execution_summary
+}
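
Sanity-mode sampling is reproducible: SAMPLE_OFFSET is randomized per run unless set explicitly, and every sanity run prints the exact command needed to replay it. A sketch (the offset value here is illustrative):

    # Preview which tests a given sampling would select, then run that exact subset
    SAMPLE_RATE=5 SAMPLE_OFFSET=2 ./scripts/task_run_unit_tests.sh --sanity-test --dry-run
    SAMPLE_RATE=5 SAMPLE_OFFSET=2 ./scripts/task_run_unit_tests.sh --sanity-test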