.github/workflows/UnitTests.yml

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Unit Test

on:
  pull_request:
  push:
    branches: [ "main" ]
  workflow_dispatch:
  schedule:
    # Run the job every 2 hours
    - cron:  '0 */2 * * *'

jobs:
  build_and_upload_image:
    strategy:
        fail-fast: false
        matrix:
          device:
          - type: tpu
            name: v4-8
            mode: stable
          - type: gpu
            name: a100-40gb-4
            mode: pinned
    name: Build and upload image (${{ matrix.device.name }})
    runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
    steps:
    - uses: actions/checkout@v4
    - name: Cleanup old docker images
      run: docker system prune --all --force
    - name: Build an image
      run: |
        bash docker_build_dependency_image.sh MODE=${{ matrix.device.mode }} DEVICE=${{ matrix.device.type }} 
    - name: Tag the image
      run: |
        docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
    - name: Upload the image
      run: |
        docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}

  common:
    needs: build_and_upload_image
    strategy:
      fail-fast: False
      matrix:
        device:
        - type: tpu
          name: v4-8
          attention: autoselected
          pytest_marker: ''
          container_env:
            XLA_PYTHON_CLIENT_MEM_FRACTION: 0.75
            TF_FORCE_GPU_ALLOW_GROWTH: false
          container_resource_option: "--privileged"
        - type: gpu
          name: a100-40gb-4
          image_suffix: gpu_jax_pinned
          attention: dot_product
          pytest_marker: -m 'not tpu'
          container_env:
            XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
            TF_FORCE_GPU_ALLOW_GROWTH: true
          container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
    name: Common test (${{ matrix.device.name }})
    runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
    container:
      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
      volumes: 
        - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
      env:
        XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ matrix.device.container_env.XLA_PYTHON_CLIENT_MEM_FRACTION }}
        TF_FORCE_GPU_ALLOW_GROWTH: ${{ matrix.device.container_env.TF_FORCE_GPU_ALLOW_GROWTH }}
      options: ${{ matrix.device.container_resource_option }}
    steps:
    - uses: actions/checkout@v4
    - name: Test gsutil installation
      run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
    - name: Test with pytest
      run: cd MaxText;python3 -m pytest ${{ matrix.device.pytest_marker }}
    - name: Test train.py with TFDS c4
      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
    - name: Test train.py with HF c4
      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs hf_data_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet hf_path=parquet dataset_type=hf steps=2 tokenizer_path=google-t5/t5-large attention=${{ matrix.device.attention }} enable_checkpointing=false
    - name: Test train.py with synthetic data
      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }} dataset_type=synthetic
    - name: Test train.py with per_device_batch_size < 1
      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=${{ matrix.device.attention }}
    - name: Test decode.py
      run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=1
    - name: Test decode.py with per_device_batch_size < 1
      run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=.25
    - name: Test int8_training
      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
    - name: Test fp8_training
      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=fp8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
    - name: Test generate_param_only_checkpoint
      run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -a ${{ matrix.device.attention }}
    - name: Test generate_param_only_checkpoint with int8 quantization
      run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8 -a ${{ matrix.device.attention }}
    - name: Test grain checkpoint determinism
      run: bash end_to_end/test_checkpointing.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset False grain ${{ matrix.device.attention }}
    - name: Test checkpoint compatibility
      run: bash end_to_end/test_checkpoint_compatibility.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset ${{ matrix.device.attention }}

  tpu:
    needs: build_and_upload_image
    strategy:
      fail-fast: false
      matrix:
        device-type: ["v4-8"]
    name: "TPU test (${{ matrix.device-type }})"
    runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
    container:
      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
      volumes: 
      - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
      options: "--privileged"
    steps:
    - uses: actions/checkout@v4
    - name: Validate Pedagogical Example, Shmap_collective_matmul
      run: python3 pedagogical_examples/shmap_collective_matmul.py

  gpu:
    needs: build_and_upload_image
    strategy:
      fail-fast: false
      matrix:
        device-type: ["a100-40gb-4"]
        build-mode: ["pinned"]
    name: "GPU test (${{ matrix.device-type }}, ${{ matrix.build-mode }})"
    runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"]
    container:
      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu
      volumes: 
      - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
      env:
        XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
        TF_FORCE_GPU_ALLOW_GROWTH: true
      options: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
    steps:
    - uses: actions/checkout@v4
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Test train.py with flash attention
      run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=cudnn_flash_te

  clean_up:
    if: ${{ always() }}
    needs: [common, gpu, tpu]
    name: "Clean up"
    runs-on: ["self-hosted"]
    steps:
    - name: Delete GPU image
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
    - name: Delete TPU image
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet