diff --git a/.github/workflows/fast_tests.yml b/.github/workflows/fast_tests.yml index 31de2b8408..bfb463466c 100644 --- a/.github/workflows/fast_tests.yml +++ b/.github/workflows/fast_tests.yml @@ -27,7 +27,7 @@ jobs: runs-on: ubuntu-22.04 env: AWS_REGION: us-east-1 - EC2_AMI_ID: ami-0a2179742e502fdfe + EC2_AMI_ID: ami-04fe9856174d852b8 EC2_INSTANCE_TYPE: dl1.24xlarge EC2_SUBNET_ID: subnet-b7533b96 EC2_SECURITY_GROUP: sg-08af7938042271373 @@ -77,7 +77,7 @@ jobs: ref: ${{ github.event.pull_request.merge_commit_sha }} - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -89,7 +89,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/fast_tests.sh diffusers: name: Run tests for optimum.habana.diffusers @@ -113,7 +113,7 @@ jobs: ref: ${{ github.event.pull_request.merge_commit_sha }} - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -125,7 +125,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/fast_tests_diffusers.sh stop-runner: name: Stop self-hosted EC2 runner diff --git a/.github/workflows/slow_tests.yml b/.github/workflows/slow_tests.yml index 5e18f2460e..755e54e161 100644 --- 
a/.github/workflows/slow_tests.yml +++ b/.github/workflows/slow_tests.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-22.04 env: AWS_REGION: us-west-2 - EC2_AMI_ID: ami-0961e95b539f72c46 + EC2_AMI_ID: ami-03549026a9aa06f99 EC2_INSTANCE_TYPE: dl1.24xlarge EC2_SUBNET_ID: subnet-452c913d EC2_SECURITY_GROUP: sg-0894f4f70dd6bd778 @@ -55,7 +55,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -67,7 +67,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/example_diff_tests.sh stable-diffusion: name: Test Stable Diffusion @@ -83,7 +83,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -95,7 +95,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_diffusers.sh deepspeed: name: Test DeepSpeed models @@ -112,7 +112,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -124,7 +124,7 @@ 
jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_deepspeed.sh multi-card: name: Test multi-card models @@ -141,7 +141,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -153,7 +153,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_8x.sh single-card: name: Test single-card models @@ -171,7 +171,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -183,7 +183,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_1x.sh albert-xxl-single-card: name: Test single-card ALBERT XXL @@ -204,7 +204,7 @@ jobs: - name: Pull image if: github.event.schedule == '0 21 * * 6' run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run test if: 
github.event.schedule == '0 21 * * 6' run: | @@ -217,7 +217,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/albert_xxl_1x.sh - name: Warning if: github.event.schedule != '0 21 * * 6' @@ -240,7 +240,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -252,7 +252,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} stop-runner: name: Stop self-hosted EC2 runner diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml index a54b70f77c..1f540d432d 100644 --- a/.github/workflows/slow_tests_gaudi2.yml +++ b/.github/workflows/slow_tests_gaudi2.yml @@ -17,7 +17,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -30,7 +30,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_diffusers.sh deepspeed: name: Test 
DeepSpeed models @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -56,7 +56,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_deepspeed.sh fsdp: name: Test FSDP models @@ -69,7 +69,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -82,7 +82,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ make slow_tests_fsdp TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} multi-card: name: Test multi-card models @@ -95,7 +95,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -108,7 +108,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_8x.sh single-card: 
name: Test single-card models @@ -122,7 +122,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -136,7 +136,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ /bin/bash tests/ci/slow_tests_1x.sh text-generation: name: Test text-generation example @@ -151,7 +151,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest + docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - name: Run tests run: | docker run \ @@ -164,5 +164,5 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \ + vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \ make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} diff --git a/Makefile b/Makefile index c5de7c04fe..988435a0d4 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ slow_tests_8x: test_installs # Run DeepSpeed non-regression tests slow_tests_deepspeed: test_installs - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 + python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 python -m pytest tests/test_examples.py -v -s -k "deepspeed" slow_tests_diffusers: test_installs @@ -63,7 +63,7 @@ slow_tests_diffusers: test_installs # Run text-generation non-regression tests slow_tests_text_generation_example: 
test_installs - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 + python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN) slow_tests_fsdp: test_installs diff --git a/README.md b/README.md index 390214c47e..aa92ba63f1 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,9 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-habana` is up To use the example associated with the latest stable release, run: > ``` > git clone https://github.com/huggingface/optimum-habana -> cd optimum-habana && git checkout v1.10.4 +> cd optimum-habana && git checkout v1.11.0 > ``` -> with `v1.10.4` the version number of this release. +> with `v1.11.0` the version number of this release. ### Option 2: Use the latest main branch under development @@ -62,7 +62,7 @@ git clone https://github.com/huggingface/optimum-habana To use DeepSpeed on HPUs, you also need to run the following command: >```bash ->pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 +>pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 >``` To install the requirements for every example: @@ -230,7 +230,7 @@ Please refer to Habana Gaudi's official [installation guide](https://docs.habana > Tests should be run in a Docker container based on Habana Docker images. > -> The current version has been validated for SynapseAI 1.14. +> The current version has been validated for SynapseAI 1.15. 
## Development diff --git a/docs/Dockerfile b/docs/Dockerfile index 0c60a33b86..3d253fd361 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest +FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest ARG commit_sha ARG clone_url diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index eaad58cfd1..3dd8c2c1b0 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -23,6 +23,6 @@ python -m pip install --upgrade-strategy eager optimum[habana] To use DeepSpeed on HPUs, you also need to run the following command: ```bash -python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 +python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 ``` diff --git a/docs/source/usage_guides/deepspeed.mdx b/docs/source/usage_guides/deepspeed.mdx index b115554c83..51734bb42f 100644 --- a/docs/source/usage_guides/deepspeed.mdx +++ b/docs/source/usage_guides/deepspeed.mdx @@ -31,7 +31,7 @@ You can find more information about DeepSpeed Gaudi integration [here](https://d To use DeepSpeed on Gaudi, you need to install Optimum Habana and [Habana's DeepSpeed fork](https://github.com/HabanaAI/DeepSpeed) with: ```bash pip install optimum[habana] -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 ``` @@ -78,7 +78,7 @@ It is strongly advised to read [this section](https://huggingface.co/docs/transf -Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.14.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Habana. +Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.15.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Habana. 
The [Transformers documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed#configuration) explains how to write a configuration from scratch very well. A more complete description of all configuration possibilities is available [here](https://www.deepspeed.ai/docs/config-json/). diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md index 58af855758..ecd227e018 100644 --- a/examples/audio-classification/README.md +++ b/examples/audio-classification/README.md @@ -100,7 +100,7 @@ On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.4 > You need to install DeepSpeed with: > ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 +> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 > ``` DeepSpeed can be used with almost the same command as for a multi-card run: diff --git a/examples/gaudi_spawn.py b/examples/gaudi_spawn.py index a7cda9b9eb..b7833c4177 100644 --- a/examples/gaudi_spawn.py +++ b/examples/gaudi_spawn.py @@ -84,7 +84,7 @@ def main(): if not is_deepspeed_available(): raise ImportError( "--use_deepspeed requires deepspeed: `pip install" - " git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0`." + " git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0`." 
) # Patch sys.argv diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index abf19c457b..776993aca1 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -562,41 +562,41 @@ python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \ - Multi-card finetuning of Llama2-70B with FSDP and LoRA: ```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=10 DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 \ -python3 ../gaudi_spawn.py --use_mpi --world_size 8 run_lora_clm.py \ +LOWER_LIST=ops_bf16.txt PT_HPU_LAZY_MODE=0 \ +python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \ --model_name_or_path meta-llama/Llama-2-70b-hf \ --dataset_name tatsu-lab/alpaca \ --bf16 True \ --output_dir ./lora_out \ - --num_train_epochs 2 \ --max_seq_len 2048 \ - --per_device_train_batch_size 10 \ - --per_device_eval_batch_size 10 \ --gradient_checkpointing \ - --evaluation_strategy epoch \ - --eval_delay 2 \ + --per_device_train_batch_size 5 \ --save_strategy no \ --learning_rate 0.0004 \ --warmup_ratio 0.03 \ --lr_scheduler_type "constant" \ --logging_steps 1 \ --dataset_concatenation \ - --attn_softmax_bf16 True \ --do_train \ - --do_eval \ --use_habana \ - --use_lazy_mode False \ - --pipelining_fwd_bwd False \ --throughput_warmup_steps 3 \ --lora_rank 4 \ --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \ + --attn_softmax_bf16 True \ --validation_split_percentage 4 \ - --use_flash_attention True \ + --use_lazy_mode False \ --fsdp_config fsdp_config.json \ - --fsdp "auto_wrap" \ - --torch_compile_backend hpu_backend \. 
+ --fsdp auto_wrap \ + --num_train_epochs 2 \ + --evaluation_strategy epoch \ + --per_device_eval_batch_size 1 \ + --eval_delay 2 \ + --do_eval \ + --pipelining_fwd_bwd False \ + --use_fused_rope False \ + --torch_compile_backend hpu_backend \ --torch_compile \ - --use_fused_rope False + --gradient_accumulation_steps 2 ``` - Multi-card finetuning of Falcon-180B: diff --git a/examples/multi-node-training/EFA/Dockerfile b/examples/multi-node-training/EFA/Dockerfile index a3d4d3ca99..2b97d0e54c 100644 --- a/examples/multi-node-training/EFA/Dockerfile +++ b/examples/multi-node-training/EFA/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest +FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest # Installs pdsh and upgrade pip RUN apt-get update && apt-get install -y pdsh && \ @@ -18,7 +18,7 @@ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ # Installs Optimum Habana and Habana's fork of DeepSpeed RUN pip install optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ chmod 600 ~/.ssh/id_rsa && \ diff --git a/examples/multi-node-training/GaudiNIC/Dockerfile b/examples/multi-node-training/GaudiNIC/Dockerfile index 9e73a4528f..a35013ea47 100644 --- a/examples/multi-node-training/GaudiNIC/Dockerfile +++ b/examples/multi-node-training/GaudiNIC/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest +FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest # Installs pdsh and upgrade pip RUN apt-get update && apt-get install -y pdsh && \ @@ -12,7 +12,7 @@ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ # Installs Optimum Habana and Habana's fork of DeepSpeed RUN pip install 
optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ chmod 600 ~/.ssh/id_rsa && \ diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md index 510a52d213..50ae873ffb 100644 --- a/examples/speech-recognition/README.md +++ b/examples/speech-recognition/README.md @@ -78,13 +78,15 @@ python run_speech_recognition_ctc.py \ --use_lazy_mode \ --gaudi_config_name="Habana/wav2vec2" \ --throughput_warmup_steps="3" \ - --bf16 + --bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference ``` On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**. > If your data has a sampling rate which is different from the one of the data the model was trained on, this script will raise an error. -> Resampling with the `datasets` library is not supported on HPUs yet. +> Resampling with the `datasets` library is not supported on HPUs yet. HPU graphs are supported only on Gaudi2 and from SynapseAI v1.15. ### Multi-HPU CTC @@ -117,20 +119,22 @@ python ../gaudi_spawn.py \ --use_lazy_mode \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ - --bf16 + --bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference ``` On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**. > If your data has a sampling rate which is different from the one of the data the model was trained on, this script will raise an error. -> Resampling with the `datasets` library is not supported on HPUs yet. +> Resampling with the `datasets` library is not supported on HPUs yet. HPU graphs are supported only on Gaudi2 and from SynapseAI v1.15. 
## DeepSpeed > You need to install DeepSpeed with: > ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 +> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 > ``` DeepSpeed can be used with almost the same command as for a multi-card run: @@ -196,7 +200,8 @@ python run_speech_recognition_ctc.py \ --use_habana \ --use_lazy_mode \ --gaudi_config_name="Habana/wav2vec2" \ - --bf16 + --bf16 \ + --use_hpu_graphs_for_inference ``` ## Sequence to Sequence diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 83a481970c..0f9a2c7b16 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -28,7 +28,7 @@ pip install -r requirements.txt Then, if you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html) (e.g. to use BLOOM/BLOOMZ), you should install DeepSpeed as follows: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 ``` @@ -108,7 +108,6 @@ Here are a few settings you may be interested in: - `--attn_softmax_bf16` to run attention softmax layer in bfloat16 precision provided that the model (such as Llama) supports it - `--trim_logits` to calculate logits only for the last token in the first time step provided that the model (such as Llama) supports it - `--fp8` Enable Quantization to fp8 -- `--kv_cache_fp8` Deprecated - Store kv-cache in float8 when kv-cache is used. 
should not be used with HQT(The Quantization Toolkit) For example, you can reproduce the results presented in [this blog post](https://huggingface.co/blog/habana-gaudi-2-bloom) with the following command: ```bash @@ -241,7 +240,7 @@ While `--bucket_size` works for any model without model file changes, an even mo ### Running with FP8 -Llama2-70b, Llama2-7b and Mixtral-8x7B in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. +Llama2-70b, Llama2-7b, Mixtral-8x7B, Falcon-7B, Falcon-40B, and Falcon-180B in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. More information on enabling fp8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html @@ -321,6 +320,38 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_mixtral.json python run_generati --bf16 \ --fp8 ``` + +Here is an example to measure the tensor quantization statistics on Falcon-180B with 8 cards: +> Please note that Falcon-180B is a gated model, and users are required to request access to it. Please refer to the instructions provided in the StarCoder example above. 
+```bash +QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ../gaudi_spawn.py \ +--use_deepspeed --world_size 8 run_lm_eval.py \ +-o acc_falcon180b_bs1_quant.txt \ +--model_name_or_path tiiuae/falcon-180B \ +--use_hpu_graphs \ +--use_kv_cache \ +--trim_logits \ +--batch_size 1 \ +--bf16 \ +--reuse_cache +``` + +Here is an example to quantize the model based on previous measurements for Falcon-180B with 8 cards: +```bash +QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ +--use_deepspeed --world_size 8 run_generation.py \ +--model_name_or_path tiiuae/falcon-180B \ +--use_hpu_graphs \ +--use_kv_cache \ +--limit_hpu_graphs \ +--max_input_tokens 128 \ +--max_new_tokens 2048 \ +--batch_size 110 \ +--bf16 \ +--reuse_cache \ +--trim_logits \ +--fp8 +``` `--fp8` is required to enable quantization in fp8. diff --git a/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json b/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json index c83fa281f6..602a147baa 100644 --- a/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json +++ b/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json @@ -2,9 +2,9 @@ "method": "HOOKS", "mode": "QUANTIZE", "observer": "maxabs", - "scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2", - "whitelist": {"types": [], "names": []}, - "blacklist": {"types": [], "names": []}, + "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, "dump_stats_path": "./hqt_output/measure", "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" } diff --git a/examples/text-generation/quantization_config/maxabs_measure.json b/examples/text-generation/quantization_config/maxabs_measure.json index 3715b506b6..3645fe743a 100644 --- 
a/examples/text-generation/quantization_config/maxabs_measure.json +++ b/examples/text-generation/quantization_config/maxabs_measure.json @@ -2,8 +2,8 @@ "method": "HOOKS", "mode": "MEASURE", "observer": "maxabs", - "whitelist": {"types": [], "names": []}, - "blacklist": {"types": [], "names": []}, + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, "dump_stats_path": "./hqt_output/measure", "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" } \ No newline at end of file diff --git a/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json b/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json new file mode 100644 index 0000000000..6de845a54d --- /dev/null +++ b/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json @@ -0,0 +1,10 @@ +{ + "method": "HOOKS", + "mode": "MEASURE", + "observer": "maxabs", + "measure_exclude": "NONE", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure", + "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" +} \ No newline at end of file diff --git a/examples/text-generation/quantization_config/maxabs_quant.json b/examples/text-generation/quantization_config/maxabs_quant.json index cb37e98a6e..02314a728e 100644 --- a/examples/text-generation/quantization_config/maxabs_quant.json +++ b/examples/text-generation/quantization_config/maxabs_quant.json @@ -3,8 +3,8 @@ "mode": "QUANTIZE", "observer": "maxabs", "scale_method": "maxabs_hw", - "whitelist": {"types": [], "names": []}, - "blacklist": {"types": [], "names": []}, + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, "dump_stats_path": "./hqt_output/measure", "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" } \ No newline at end of file diff --git a/examples/text-generation/quantization_config/unit_scale_quant.json 
b/examples/text-generation/quantization_config/unit_scale_quant.json index e2d709da61..caad4bb2a4 100644 --- a/examples/text-generation/quantization_config/unit_scale_quant.json +++ b/examples/text-generation/quantization_config/unit_scale_quant.json @@ -3,8 +3,8 @@ "mode": "QUANTIZE", "observer": "maxabs", "scale_method": "unit_scale", - "whitelist": {"types": [], "names": []}, - "blacklist": {"types": [], "names": []}, + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, "dump_stats_path": "./hqt_output/measure", "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx" } diff --git a/examples/text-generation/quantization_tools/unify_measurements.py b/examples/text-generation/quantization_tools/unify_measurements.py new file mode 100644 index 0000000000..75ae329a44 --- /dev/null +++ b/examples/text-generation/quantization_tools/unify_measurements.py @@ -0,0 +1,171 @@ +import argparse +import json +import os +import sys + +import numpy as np + + +def find_measurement_path(measurement, measurements_dir_path, scales, group_size): + measurment_card = measurement + "_" + str(group_size) + for measurment_file in os.listdir(measurements_dir_path): + filename = os.fsdecode(measurment_file) + if not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename: + continue + if scales: + if "MAXABS" in filename: + return os.path.join(measurements_dir_path, measurment_file) + else: + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurment_file) + + +def unify_measurements(measurement_group, measurements_dir_path, output_path, scales=False): + measurements_paths = [] + group_name = "" + + # save all the jsons paths in the given measurement group + for measurement in measurement_group: + measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, len(measurement_group)) + measurements_paths.append(measurement_path) + group_name += measurement + + # save all the 
jsons content in the given measurement group + measurements_jsons = [] + for measurement_path in measurements_paths: + with open(measurement_path, "r") as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) + + # create a name for the unified json that will be created for this measurement group + unified_json_name = ( + find_measurement_path(measurement_group[0], measurements_dir_path, scales, len(measurement_group)) + .split("/")[-1] + .replace("_" + measurement_group[0] + "_" + str(len(measurement_group)), "") + ) + unified_json_path = os.path.join(output_path, unified_json_name) + + # open a unified json file + with open(measurements_paths[0], "r") as origin, open(unified_json_path, "w") as copy: + copy.write(origin.read()) + with open(unified_json_path, "r") as json_file: + unified_json = json.load(json_file) + + # iterate all unified json nodes + for node_name, node_values in unified_json["Nodes"].items(): + max_inputs = node_values["inputs"] + max_outputs = None + if node_values.get("outputs") is not None: + max_outputs = node_values["outputs"] + max_weight = None + if node_values.get("params") is not None and node_values["params"].get("weight") is not None: + max_weight = node_values["params"]["weight"] + + # iterate over all the measurment group and take the maximum for each tensor and its channel + if scales: + for measurement_json in measurements_jsons: + max_inputs[0] = max(measurement_json[node_name]["inputs"][0], max_inputs[0]) + if max_outputs is not None: + max_outputs = max(measurement_json[node_name]["outputs"], max_outputs) + if max_weight is not None: + max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight) + else: + for measurement_json in measurements_jsons: + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) + if max_outputs is not None: + for i in range(0, len(max_outputs)): + 
max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) + if max_weight is not None: + for i in range(0, len(max_weight)): + max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) + + # update the maximum in the unified json + if scales: + unified_json["Nodes"][node_name]["inputs"][0] = max_inputs[0] + if max_outputs is not None: + unified_json["Nodes"][node_name]["outputs"] = max_outputs + if max_weight is not None: + unified_json["Nodes"][node_name]["params"]["weight"] = max_weight + else: + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + if max_outputs is not None: + for i in range(0, len(max_outputs)): + unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + if max_weight is not None: + for i in range(0, len(max_weight)): + unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] + global_rank = None + local_rank = None + mode = "" + layers = {} + with open(unified_json_path, "w") as json_file: + json.dump(unified_json, json_file) + mode = unified_json["Mode"] + nodes = unified_json["Nodes"] + + # create unified npz file from the unified json + unified_npz_path = os.path.join(output_path, unified_json_name.replace(".json", ".npz")) + for layer, dlayer in nodes.items(): + layers[layer] = {} + layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] + if dlayer.get("outputs") is not None: + layers[layer]["outputs"] = np.array(dlayer["outputs"]) + if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: + layers[layer]["params"] = {} + layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} + with open(unified_npz_path, "w"): + np.savez(unified_npz_path, df) + + +def parse_args(args): + parser = 
argparse.ArgumentParser( + description="Run the measurements parser", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "-m", "--measurements", type=str, help="path to the directory of the measurements that will be unified" + ) + parser.add_argument( + "-g", + "--groups", + type=list, + nargs="+", + help="the groups of cards that are going to be unified- e.g. 01 23 45 67", + ) + parser.add_argument( + "-o", + "--out", + type=str, + default=os.getcwd(), + help="path to the directory where the unified measurements will be written", + ) + return parser.parse_args(args) + + +def main(args): + args = parse_args(args) + output_path = args.out + if not os.path.exists(output_path): + os.mkdir(output_path) + measurements_path = args.measurements + groups = args.groups + + num_jsons = 0 + for path in os.listdir(measurements_path): + if path.endswith(".json"): + num_jsons += 1 + assert os.path.isdir(measurements_path) and (num_jsons % len(groups)) == 0 + + for group in groups: + unify_measurements(group, measurements_path, output_path, scales=False) + unify_measurements(group, measurements_path, output_path, scales=True) + + print("finished measurement unifier script") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index 6b0b2e4695..1f503ed5e1 100644 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -221,11 +221,6 @@ def setup_parser(parser): help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", ) - parser.add_argument( - "--kv_cache_fp8", - action="store_true", - help="Store kv-cache in float8 when kv-cache is used. 
Can't use this argument together with QUANT_CONFIG env var", - ) parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8") parser.add_argument( "--use_flash_attention", @@ -239,7 +234,17 @@ def setup_parser(parser): ) parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") - + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) args = parser.parse_args() if args.torch_compile: @@ -249,10 +254,6 @@ def setup_parser(parser): args.limit_hpu_graphs = False args.quant_config = os.getenv("QUANT_CONFIG", "") - if args.quant_config and args.kv_cache_fp8: - # can't use both quant_config and kv_cache_fp8, since quant_config may trigger kv cache quantization - # with habana quantization toolkit - raise parser.error("Can't use QUANT_CONFIG env var with kv_cache_fp8 argument") return args @@ -561,6 +562,10 @@ def generate_dataset(batch): import habana_quantization_toolkit habana_quantization_toolkit.finish_measurements(model) + if args.const_serialization_path and os.path.isdir(args.const_serialization_path): + import shutil + + shutil.rmtree(args.const_serialization_path) if __name__ == "__main__": diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py index 4ae8dcb26c..8d61118890 100644 --- a/examples/text-generation/run_lm_eval.py +++ b/examples/text-generation/run_lm_eval.py @@ -75,10 +75,15 @@ def __init__(self, tokenizer, model, args, options): self.options = options self._device = args.device self.model_inputs = 
{"use_cache": self.options.use_cache} - if self.model.config.model_type == "llama": + if self.model.config.model_type == "llama" or "falcon": self.model_inputs.update( { "reuse_cache": self.options.reuse_cache, + } + ) + if self.model.config.model_type == "llama": + self.model_inputs.update( + { "attn_softmax_bf16": self.options.attn_softmax_bf16, } ) @@ -131,12 +136,7 @@ def _model_call(self, inps): if self.options.static_shapes: bucket_length = self.find_bucket(seq_length) if self.options.use_cache and self.options.reuse_cache: - self.model.allocate_kv_cache( - bs, - bucket_length + 1, - bucket_length, - False, - ) + self.model.allocate_kv_cache(bs, bucket_length + 1, bucket_length) padding_length = bucket_length - seq_length inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id) logits = self.model(inps.to(self._device), **self.model_inputs)["logits"].cpu() @@ -176,6 +176,10 @@ def main(): import habana_quantization_toolkit habana_quantization_toolkit.finish_measurements(model) + if args.const_serialization_path and os.path.isdir(args.const_serialization_path): + import shutil + + shutil.rmtree(args.const_serialization_path) if __name__ == "__main__": diff --git a/examples/text-generation/text-generation-pipeline/README.md b/examples/text-generation/text-generation-pipeline/README.md index e73243dc8f..203b9ff333 100644 --- a/examples/text-generation/text-generation-pipeline/README.md +++ b/examples/text-generation/text-generation-pipeline/README.md @@ -28,7 +28,7 @@ export PYTHONPATH=${PYTHONPATH}:${OPTIMUM_HABANA_PATH}/examples/text-generation If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 ``` If you would like to use the pipeline with LangChain classes, you can install LangChain as 
follows: diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index e8c847c2f7..54d08d017f 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -96,18 +96,15 @@ def setup_distributed(args): args.global_rank = int(os.getenv("RANK", "0")) -def setup_quantization(args, model): - import habana_frameworks.torch.core as htcore - from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const - from habana_frameworks.torch.hpu import hpu - - print("Initializing inference with quantization") - _mark_params_as_const(model) - _check_params_as_const(model) - if not args.quant_config: - hpu.enable_quantization() - htcore.hpu_initialize(model) - return model +def setup_const_serialization(const_serialization_path): + import uuid + + const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) + os.makedirs(const_serialization_path) + from habana_frameworks.torch.hpu import enable_const_section_serialization + + print("Serializing const params to {}".format(const_serialization_path)) + enable_const_section_serialization(const_serialization_path, False, True) def setup_env(args): @@ -237,7 +234,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): model = deepspeed.init_inference(model, **ds_inference_kwargs) model = model.module - if model.config.model_type == "llama": + if model.config.model_type in ["llama", "falcon"]: patch_scoped_linear_all_reduce(model) if args.quant_config: @@ -349,7 +346,6 @@ def setup_generation_config(args, model, tokenizer): generation_config.reduce_recompile = args.reduce_recompile if generation_config.reduce_recompile: assert generation_config.bucket_size > 0 - generation_config.kv_cache_fp8 = args.kv_cache_fp8 generation_config.use_flash_attention = args.use_flash_attention return generation_config @@ -373,6 +369,10 @@ def initialize_model(args, logger): "revision": args.model_revision, "token": 
args.token, } + if args.disk_offload: + model_kwargs["device_map"] = "auto" + model_kwargs["offload_folder"] = "/tmp/offload_folder/" + model = ( setup_model(args, model_dtype, model_kwargs, logger) if not use_deepspeed @@ -380,8 +380,16 @@ def initialize_model(args, logger): ) tokenizer, model = setup_tokenizer(args, model) generation_config = setup_generation_config(args, model, tokenizer) + + if args.const_serialization_path: + setup_const_serialization(args.const_serialization_path) if args.fp8: - model = setup_quantization(args, model) + import habana_frameworks.torch.core as htcore + + print("Initializing inference mode") + const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + if const_marking == "True": + htcore.hpu_initialize(model) init_end = time.perf_counter() logger.info(f"Args: {args}") logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") diff --git a/notebooks/AI_HW_Summit_2022.ipynb b/notebooks/AI_HW_Summit_2022.ipynb index 37075fd3ef..cf6c8bdea5 100644 --- a/notebooks/AI_HW_Summit_2022.ipynb +++ b/notebooks/AI_HW_Summit_2022.ipynb @@ -261,7 +261,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0" + "!pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0" ] }, { diff --git a/optimum/habana/accelerate/accelerator.py b/optimum/habana/accelerate/accelerator.py index e33f5210db..84b8fb6d7f 100644 --- a/optimum/habana/accelerate/accelerator.py +++ b/optimum/habana/accelerate/accelerator.py @@ -141,7 +141,7 @@ def __init__( if deepspeed_plugin: if not is_deepspeed_available(): raise ImportError( - "DeepSpeed is not installed => run `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0`." + "DeepSpeed is not installed => run `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0`." 
) mixed_precision = ( diff --git a/optimum/habana/accelerate/state.py b/optimum/habana/accelerate/state.py index e29651efa9..ab1cae7fef 100644 --- a/optimum/habana/accelerate/state.py +++ b/optimum/habana/accelerate/state.py @@ -55,7 +55,7 @@ def __init__(self, cpu: bool = False, **kwargs): if not is_deepspeed_available(): raise ImportError( "DeepSpeed is not available, install it with: `pip install" - " git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0`." + " git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0`." ) self.distributed_type = GaudiDistributedType.DEEPSPEED import deepspeed diff --git a/optimum/habana/transformers/generation/configuration_utils.py b/optimum/habana/transformers/generation/configuration_utils.py index e75e48a7c7..93df1335db 100644 --- a/optimum/habana/transformers/generation/configuration_utils.py +++ b/optimum/habana/transformers/generation/configuration_utils.py @@ -29,8 +29,6 @@ class GaudiGenerationConfig(GenerationConfig): Only active if `static_shapes` is used. Can't be used with `reuse_cache`. bucket_internal (`bool`, *optional*): Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large. - kv_cache_fp8 (`bool`, *optional*): - Store kv-cache in float8 when kv-cache is used use_flash_attention (`bool`, *optional*): Whether to use flash attention optimization. 
flash_attention_recompute (`bool`, *optional*): @@ -48,7 +46,6 @@ def __init__(self, **kwargs): self.bucket_size = kwargs.get("bucket_size", -1) self.bucket_internal = kwargs.get("bucket_internal", None) self.reduce_recompile = kwargs.get("reduce_recompile", None) - self.kv_cache_fp8 = kwargs.get("kv_cache_fp8", None) self.use_flash_attention = kwargs.get("use_flash_attention", None) self.flash_attention_recompute = kwargs.get("flash_attention_recompute", None) self.use_fused_rope = kwargs.get("use_fused_rope", None) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index aa7d92ebce..92df17bb50 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -584,7 +584,8 @@ def generate( assert self.config.model_type in [ "llama", "mistral", - ], "reuse_cache only supported by llama and mistral at the moment" + "falcon", + ], "reuse_cache only supported by llama, mistral and falcon at the moment" if not generation_config.bucket_internal: assert ( generation_config.bucket_size <= 0 @@ -733,14 +734,11 @@ def generate( bs, _ = input_ids.shape if not is_greedy_or_beam_and_bucket: unwrap_deepspeed_model(self).allocate_kv_cache( - bs * generation_config.num_beams, - calculated_max_length, - token_idx, - generation_config.kv_cache_fp8, + bs * generation_config.num_beams, calculated_max_length, token_idx ) model_kwargs["kv_cache_len"] = calculated_max_length - if self.config.model_type in ["llama"]: + if self.config.model_type in ["llama", "falcon"]: if self.config.max_position_embeddings < calculated_max_length: unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length) diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index 9d4e473aab..6dc40a73bf 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -26,7 +26,10 @@ 
GaudiBloomMLP, GaudiCodeGenAttention, GaudiCodeGenForCausalLM, + GaudiFalconAttention, + GaudiFalconDecoderLayer, GaudiFalconForCausalLM, + GaudiFalconMLP, GaudiFalconModel, GaudiGPT2Attention, GaudiGPT2LMHeadModel, @@ -84,9 +87,7 @@ gaudi_conv1d_forward, gaudi_esm_for_protein_folding_forward, gaudi_esmfolding_trunk_forward, - gaudi_falcon_attention_forward, gaudi_falcon_attention_split_heads, - gaudi_falcon_decoder_layer_forward, gaudi_generate_speech, gaudi_get_extended_attention_mask, gaudi_gpt2_block_forward, @@ -135,6 +136,7 @@ gaudi_wav2vec2_encoder_forward, gaudi_wav2vec2_forward, gaudi_wav2vec2_tdnnlayer_forward, + gaudi_wav2vec2forctc_forward, ) @@ -161,6 +163,7 @@ def adapt_transformers_to_gaudi(): ) transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward = gaudi_wav2vec2_forward transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder.forward = gaudi_wav2vec2_encoder_forward + transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward = gaudi_wav2vec2forctc_forward transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer.forward = gaudi_wav2vec2_tdnnlayer_forward # Generation is modified to run faster in lazy mode @@ -298,10 +301,11 @@ def adapt_transformers_to_gaudi(): transformers.models.llama.modeling_llama.LlamaRMSNorm.forward = gaudi_llama_rmsnorm_forward # Optimization for falcon generation on Gaudi + transformers.models.falcon.modeling_falcon.FalconAttention = GaudiFalconAttention transformers.models.falcon.modeling_falcon.FalconForCausalLM = GaudiFalconForCausalLM + transformers.models.falcon.modeling_falcon.FalconMLP = GaudiFalconMLP transformers.models.falcon.modeling_falcon.FalconModel = GaudiFalconModel - transformers.models.falcon.modeling_falcon.FalconDecoderLayer.forward = gaudi_falcon_decoder_layer_forward - transformers.models.falcon.modeling_falcon.FalconAttention.forward = gaudi_falcon_attention_forward + transformers.models.falcon.modeling_falcon.FalconDecoderLayer = GaudiFalconDecoderLayer 
transformers.models.falcon.modeling_falcon.FalconAttention._split_heads = gaudi_falcon_attention_split_heads # Optimization for t5 on Gaudi diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py index d0eb8b2dcd..1582d3f09e 100644 --- a/optimum/habana/transformers/models/__init__.py +++ b/optimum/habana/transformers/models/__init__.py @@ -43,11 +43,12 @@ gaudi_rot_vec_mul, ) from .falcon import ( + GaudiFalconAttention, + GaudiFalconDecoderLayer, GaudiFalconForCausalLM, + GaudiFalconMLP, GaudiFalconModel, - gaudi_falcon_attention_forward, gaudi_falcon_attention_split_heads, - gaudi_falcon_decoder_layer_forward, ) from .gpt2 import GaudiGPT2Attention, GaudiGPT2LMHeadModel, gaudi_gpt2_block_forward, gaudi_gpt2_forward from .gpt_bigcode import ( @@ -146,4 +147,5 @@ gaudi_wav2vec2_encoder_forward, gaudi_wav2vec2_forward, gaudi_wav2vec2_tdnnlayer_forward, + gaudi_wav2vec2forctc_forward, ) diff --git a/optimum/habana/transformers/models/falcon/__init__.py b/optimum/habana/transformers/models/falcon/__init__.py index 44ac5451f6..00c73ad110 100644 --- a/optimum/habana/transformers/models/falcon/__init__.py +++ b/optimum/habana/transformers/models/falcon/__init__.py @@ -1,7 +1,8 @@ from .modeling_falcon import ( + GaudiFalconAttention, + GaudiFalconDecoderLayer, GaudiFalconForCausalLM, + GaudiFalconMLP, GaudiFalconModel, - gaudi_falcon_attention_forward, gaudi_falcon_attention_split_heads, - gaudi_falcon_decoder_layer_forward, ) diff --git a/optimum/habana/transformers/models/falcon/modeling_falcon.py b/optimum/habana/transformers/models/falcon/modeling_falcon.py index 98e3555e95..9b9a74c12f 100644 --- a/optimum/habana/transformers/models/falcon/modeling_falcon.py +++ b/optimum/habana/transformers/models/falcon/modeling_falcon.py @@ -1,5 +1,6 @@ import contextlib import math +import os import warnings from typing import Optional, Tuple, Union @@ -27,6 +28,7 @@ import habana_frameworks.torch.core as htcore +from torch 
import nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa @@ -34,12 +36,15 @@ BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, ) +from transformers.models.falcon.configuration_falcon import FalconConfig from transformers.models.falcon.modeling_falcon import ( + FalconAttention, + FalconDecoderLayer, FalconForCausalLM, + FalconMLP, FalconModel, apply_rotary_pos_emb, build_alibi_tensor, - dropout_add, ) from transformers.utils import logging @@ -52,6 +57,20 @@ logger = logging.get_logger(__name__) +def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: + """ + Copied from transformers.models.falcon.modeling_falcon/dropout_add + https://github.com/huggingface/transformers/blob/b338a6c3b8eda29610d4d472cad8cd87cbfdaaed/src/transformers/models/falcon/modeling_falcon.py#L248 + """ + out = F.dropout(x, p=prob, training=training) + if training: + out = residual + out + return out + else: + residual.add_(out) + return residual + + def apply_customized_rope(q, k, cos, sin, position_ids): if q.device.type == "hpu" and FusedRoPE: # TODO: remove `.clone()` when it is fixed in SynapseAI @@ -111,257 +130,506 @@ def gaudi_falcon_attention_split_heads( return query, key, value -def gaudi_falcon_attention_forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - **kwargs, -): +class Softmax(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, dim=None, invAttnHead=None): + return torch.ops.hpu.softmax_fp8(x, dim, None, None, invAttnHead) + + 
+class Matmul(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, *args, **kwargs): + return torch.matmul(*args, **kwargs) + + +# ScaledDotProductAttention is based on torch.nn.functional.scaled_dot_product_attention +class ScaledDotProductAttention(nn.Module): + def __init__(self, config: FalconConfig): + super().__init__() + self.head_dim = config.hidden_size // config.num_attention_heads + self.bmm1 = Matmul() + self.bmm2 = Matmul() + self.softmax = Softmax() + + def forward(self, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor: + L, S = query.size(-2), key.size(-2) + scale_factor = 1 / math.sqrt(self.head_dim) + invAttnHead = torch.tensor(scale_factor, dtype=torch.float32).to("hpu") + + if is_causal: + assert attn_mask is None + attn_bias = torch.zeros(L, S, dtype=query.dtype) + temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) + attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + attn_bias.to(query.dtype) + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf")) + + attn_weight = self.bmm1(query, key.transpose(-2, -1)) + + attn_weight += attn_mask + attn_weight = self.softmax(attn_weight, dim=-1, invAttnHead=invAttnHead) + attn_weight = torch.dropout(attn_weight, dropout_p, train=True) + return self.bmm2(attn_weight, value) + + +def update(prev, cur, dim, idx, inp_seq_len): + orig_cur = cur + cur = cur.to(dtype=prev.dtype) + + if prev.shape == cur.shape: + prev.copy_(cur) + return orig_cur + + if cur.shape[-2] > 1 and cur.shape[-2] <= prev.shape[-2]: + # Initialize + prev[:, :, :inp_seq_len, :].copy_(cur) + return orig_cur + assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. 
prev:{prev.shape} cur:{cur.shape}" + if idx is not None: + prev.index_copy_(dim, idx - 1, cur) + prev_cast = prev.to(orig_cur.dtype) + return prev_cast + else: + return torch.cat((prev, cur), dim=dim) + + +class KVCache(torch.nn.Module): + def __init__(self): + super(KVCache, self).__init__() + self.cache = None + self.inp_seq_len = -1 + + def allocate(self, inp_seq_len, dtype, device, shape): + if self.cache is None or self.cache.shape != shape: + self.inp_seq_len = inp_seq_len + self.cache = torch.zeros(shape, dtype=dtype, device=device) + else: + assert ( + self.inp_seq_len == inp_seq_len + ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" + self.cache.fill_(0) + + def get_shape(self): + if self.cache is None: + return None + return self.cache.shape + + def forward(self, cur, dim, idx): + return self.update(self.cache, cur, dim, idx, self.inp_seq_len) + + def update(self, prev, cur, dim, idx, inp_seq_len): + return update(prev, cur, dim, idx, inp_seq_len) + + +class GaudiFalconAttention(FalconAttention): """ - Copied from FalconAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + Inherits from FalconAttention: https://github.com/huggingface/transformers/blob/838b87abe231fd70be5132088d0dee72a7bb8d62/src/transformers/models/falcon/modeling_falcon.py#L267 The only differences are: - add new args token_idx and position_ids - - replace F.scaled_dot_product_attention with Habana torch's version + - replace F.scaled_dot_product_attention with Habana torch's version for BF16 + - use ScaledDotProductAttention for FP8 quantization + - add new arg reuse_cache """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + def __init__(self, config: FalconConfig): + super().__init__(config) - batch_size, query_length, _, _ = query_layer.shape + if os.getenv("QUANT_CONFIG", ""): + self.sdpa = ScaledDotProductAttention(config) - query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) - key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) - value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + self.k_cache = KVCache() + self.v_cache = KVCache() + self.inp_seq_len = -1 + self.max_position_embeddings = config.max_position_embeddings - kv_seq_len = key_layer.shape[-2] - if layer_past is not None: - if token_idx is not None: - # When token_idx is used, - # past_kv_length = 0 - # static seq len = (input token len + max output token len) - kv_seq_len = layer_past[0].shape[-2] + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + if self.config.new_decoder_architecture: + cache_shape = (batch_size, self.num_heads, max_seq_len, self.head_dim) else: - kv_seq_len += layer_past[0].shape[-2] - if alibi is None: - cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) - query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids) - - if layer_past is not None: - past_key, past_value = layer_past - if token_idx is not None: - past_key.index_copy_(-2, token_idx - 1, key_layer) - past_value.index_copy_(-2, token_idx - 1, value_layer) - key_layer = past_key - value_layer = past_value - else: - # concatenate along seq_length dimension: - # - key: [batch_size, self.num_heads, kv_length, head_dim] - # - value: [batch_size, self.num_heads, kv_length, head_dim] - 
key_layer = torch.cat((past_key, key_layer), dim=-2) - value_layer = torch.cat((past_value, value_layer), dim=-2) - - kv_length = key_layer.shape[-2] - if use_cache: - present = (key_layer, value_layer) - else: - present = None + cache_shape = (batch_size, 1, max_seq_len, self.head_dim) + device = self.query_key_value.weight.device + dtype = self.config.torch_dtype + self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) + self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) + + def update_sincos_cache(self, seq_len): + # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # This helps in avoiding creation of these caches during actual model forward pass and + # reduce memory consumption and improve performance. + if seq_len > self.max_position_embeddings: + self.max_position_embeddings = seq_len + self.rotary_emb._set_cos_sin_cache( + seq_len, self.query_key_value.weight.device, self.query_key_value.weight.dtype + ) - if alibi is None: - if output_attentions: - attention_scores = query_layer @ key_layer.transpose(-1, -2) - attention_scores /= math.sqrt(self.head_dim) + def pre_attn_forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, query_length, _, _ = query_layer.shape + + query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + + kv_seq_len = key_layer.shape[-2] + if layer_past is not None: + if token_idx is not None: + if reuse_cache: + kv_seq_len = layer_past[0][-2] + else: + kv_seq_len = layer_past[0].shape[-2] + else: + kv_seq_len += layer_past[0].shape[-2] + + if alibi is None: + cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) + query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids) + + if use_cache: + if self.training: + present = None + else: + if reuse_cache: + key_layer = self.k_cache(key_layer, -2, token_idx) + value_layer = self.v_cache(value_layer, -2, token_idx) + present = (self.k_cache.get_shape(), self.v_cache.get_shape()) + else: + if layer_past is None: + past_key = torch.zeros( + key_layer.shape, + dtype=self.query_key_value.weight.dtype, + device=self.query_key_value.weight.device, + ) + past_value = torch.zeros( + key_layer.shape, + dtype=self.query_key_value.weight.dtype, + device=self.query_key_value.weight.device, + ) + layer_past = (past_key, past_value) + key_layer = self.k_cache.update( + layer_past[0], key_layer, -2, token_idx, self.inp_seq_len + ) # k_layer bs*1, q_len, head_dim + value_layer = self.v_cache.update(layer_past[1], value_layer, -2, token_idx, self.inp_seq_len) + if token_idx is None: + layer_past = (key_layer, value_layer) + present = layer_past + + if cache_idx is not None and 
query_length == 1: + key_layer = key_layer[:, :, :cache_idx, :] + value_layer = value_layer[:, :, :cache_idx, :] + attention_mask = attention_mask[:, :, :, :cache_idx] + else: + present = None - attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) - # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). - attn_output = attention_scores @ value_layer + if self.training or present is None: + kv_length = key_layer.shape[-2] else: - if FusedSDPA: - with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): - attn_output = FusedSDPA.apply( + kv_length = present[0][-2] if reuse_cache else present[0].shape[-2] + + if alibi is None: + if output_attentions: + attention_scores = query_layer @ key_layer.transpose(-1, -2) + attention_scores /= math.sqrt(self.head_dim) + + attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) + # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). + attn_output = attention_scores @ value_layer + else: + if FusedSDPA: + if os.getenv("QUANT_CONFIG", ""): + attn_output = self.sdpa( + query_layer, key_layer, value_layer, attention_mask, 0.0, is_causal=False + ) + else: + with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): + attn_output = FusedSDPA.apply( + query_layer, + key_layer, + value_layer, + attention_mask, + 0.0, + # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. + self.is_causal and attention_mask is None and query_length > 1, + ) + else: + # Workaround util scaled_dot_product_attention support broadcast. 
+ if self.training is True and query_layer.shape != key_layer.shape: + key_layer = torch.broadcast_to(key_layer, query_layer.shape) + value_layer = torch.broadcast_to(value_layer, query_layer.shape) + attn_output = F.scaled_dot_product_attention( query_layer, key_layer, value_layer, attention_mask, 0.0, # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. - self.is_causal and attention_mask is None and query_length > 1, + is_causal=self.is_causal and attention_mask is None and query_length > 1, ) - else: - # Workaround util scaled_dot_product_attention support broadcast. - if self.training is True and query_layer.shape != key_layer.shape: - key_layer = torch.broadcast_to(key_layer, query_layer.shape) - value_layer = torch.broadcast_to(value_layer, query_layer.shape) - attn_output = F.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attention_mask, - 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. 
- is_causal=self.is_causal and attention_mask is None and query_length > 1, - ) - # Performance improvement for HPU - if self.training is True and htcore: - htcore.mark_step() - attention_scores = None + # Performance improvement for HPU + if self.training is True and htcore: + htcore.mark_step() + attention_scores = None - attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim) - attn_output = attn_output.permute(0, 2, 1, 3) - attn_output = attn_output.reshape(batch_size, query_length, -1) + attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim) + attn_output = attn_output.permute(0, 2, 1, 3) + attn_output = attn_output.reshape(batch_size, query_length, -1) - attn_output = self.dense(attn_output) + attn_output = self.dense(attn_output) - if output_attentions: - return attn_output, present, attention_scores - else: - return attn_output, present + if output_attentions: + return attn_output, present, attention_scores + else: + return attn_output, present, _ - else: - if self._use_sdpa and not output_attentions and head_mask is None: - if FusedSDPA: - with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): - attn_output = FusedSDPA.apply( + else: + if self._use_sdpa and not output_attentions and head_mask is None: + if FusedSDPA: + with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): + attn_output = FusedSDPA.apply( + query_layer, + key_layer, + value_layer, + attention_mask, + self.attention_dropout.p if self.training else 0.0, + self.is_causal and attention_mask is None and query_length > 1, + ) + else: + attn_output = F.scaled_dot_product_attention( query_layer, key_layer, value_layer, - attention_mask, - self.attention_dropout.p if self.training else 0.0, - self.is_causal and attention_mask is None and query_length > 1, + attn_mask=attention_mask, + dropout_p=self.attention_dropout.p if self.training else 0.0, + is_causal=self.is_causal and attention_mask is None 
and query_length > 1, ) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + + attn_output = self.dense(attn_output) else: - attn_output = F.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attn_mask=attention_mask, - dropout_p=self.attention_dropout.p if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and query_length > 1, - ) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + matmul_result = query_layer @ key_layer.transpose(-1, -2) - attn_output = self.dense(attn_output) - else: - matmul_result = query_layer @ key_layer.transpose(-1, -2) + # change view to [batch_size, num_heads, q_length, kv_length] + attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) - # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype == torch.float16 or input_dtype == torch.bfloat16: + attention_scores = attention_scores.to(torch.float32) - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] - input_dtype = attention_scores.dtype - # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` - if input_dtype == torch.float16 or input_dtype == torch.bfloat16: - attention_scores = attention_scores.to(torch.float32) + attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 
1, -1) + attention_logits *= self.inv_norm_factor + attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) - attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) - attention_logits *= self.inv_norm_factor - attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) - # [batch_size, num_heads, q_length, kv_length] - attention_probs = self.attention_dropout(attention_probs) + if head_mask is not None: + attention_probs = attention_probs * head_mask - if head_mask is not None: - attention_probs = attention_probs * head_mask + # change view [batch_size, num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) - # change view [batch_size, num_heads, q_length, kv_length] - attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) + # matmul: [batch_size * num_heads, q_length, head_dim] + attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) - # matmul: [batch_size * num_heads, q_length, head_dim] - attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) + # change view [batch_size, q_length, num_heads * head_dim] + attn_output = self._merge_heads(attn_output) - # change view [batch_size, q_length, num_heads * head_dim] - attn_output = self._merge_heads(attn_output) + attn_output = self.dense(attn_output) - attn_output = self.dense(attn_output) + if output_attentions: + return attn_output, present, attention_probs + else: + return attn_output, present, _ - if output_attentions: - return attn_output, present, attention_probs - else: - return attn_output, present - - -def gaudi_falcon_decoder_layer_forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, 
- position_ids: Optional[torch.LongTensor] = None, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - **kwargs, -): + def attention_all_reduce(self, attn_output): + if hasattr(self.dense, "all_reduce"): + self.dense.all_reduce(attn_output) + + def post_attn_forward(self, attn_output): + if hasattr(self.dense, "all_reduce"): + self.dense.post_all_reduce(attn_output) + return attn_output + + +class GaudiFalconMLP(FalconMLP): + """ + Inherits from FalconMLP: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + """ + + def pre_mlp_forward(self, x): + x = self.act(self.dense_h_to_4h(x)) + x = self.dense_4h_to_h(x) + return x + + def mlp_all_reduce(self, x): + if hasattr(self.dense_4h_to_h, "all_reduce"): + self.dense_4h_to_h.all_reduce(x) + + def post_mlp_forward(self, x): + if hasattr(self.dense_4h_to_h, "all_reduce"): + self.dense_4h_to_h.post_all_reduce(x) + return x + + +class GaudiFalconDecoderLayer(FalconDecoderLayer): """ - Copied from FalconDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + Inherits from FalconDecoderLayer: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into attention inputs + - add new args reuse_cache """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - residual = hidden_states + def __init__(self, config: FalconConfig): + super().__init__(config) + self.self_attention = GaudiFalconAttention(config) - if self.config.new_decoder_architecture: - attention_layernorm_out = self.ln_attn(hidden_states) - mlp_layernorm_out = self.ln_mlp(hidden_states) - else: - attention_layernorm_out = self.input_layernorm(hidden_states) - - # Self attention. - attn_outputs = self.self_attention( - attention_layernorm_out, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - alibi=alibi, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + self.self_attention.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) + + def update_sincos_cache(self, seq_len): + self.self_attention.update_sincos_cache(seq_len) + + def forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, **kwargs, - ) + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + residual = hidden_states + ( + hidden_states, + present, + attn_scores, + attention_layernorm_out, + mlp_layernorm_out, + ) = self.pre_attn( # layernorm + attention before AllReduce + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, + **kwargs, + ) - attention_output = attn_outputs[0] + self.self_attention.attention_all_reduce(hidden_states) + hidden_states = self.self_attention.post_attn_forward(hidden_states) - if not self.config.new_decoder_architecture: - if self.config.parallel_attn: - mlp_layernorm_out = attention_layernorm_out - else: - residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training) - mlp_layernorm_out = self.post_attention_layernorm(residual) + attention_output = hidden_states - outputs = attn_outputs[1:] + if not self.config.new_decoder_architecture: + if self.config.parallel_attn: + mlp_layernorm_out = attention_layernorm_out + else: + residual = dropout_add( + attention_output, residual, self.config.attention_dropout, training=self.training + ) + mlp_layernorm_out = self.post_attention_layernorm(residual) - # MLP. 
- mlp_output = self.mlp(mlp_layernorm_out) + outputs = (present, attn_scores) - if self.config.new_decoder_architecture or self.config.parallel_attn: - mlp_output += attention_output + hidden_states = self.mlp.pre_mlp_forward(mlp_layernorm_out) + self.mlp.mlp_all_reduce(hidden_states) + hidden_states = self.mlp.post_mlp_forward(hidden_states) - output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training) + if self.config.new_decoder_architecture or self.config.parallel_attn: + hidden_states += attention_output - if use_cache: - outputs = (output,) + outputs - else: - outputs = (output,) + outputs[1:] + output = dropout_add(hidden_states, residual, self.config.hidden_dropout, training=self.training) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, present, attentions + + def pre_attn( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, + ): + if self.config.new_decoder_architecture: + attention_layernorm_out = self.ln_attn(hidden_states) + mlp_layernorm_out = self.ln_mlp(hidden_states) + else: + attention_layernorm_out = self.input_layernorm(hidden_states) + mlp_layernorm_out = None - return outputs # hidden_states, present, attentions + # Self attention. 
+ attn_outputs, present, attn_scores = self.self_attention.pre_attn_forward( + attention_layernorm_out, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, + ) + + return attn_outputs, present, attn_scores, attention_layernorm_out, mlp_layernorm_out class GaudiFalconModel(FalconModel): @@ -370,11 +638,17 @@ class GaudiFalconModel(FalconModel): The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into decoder inputs - - set past_key_values_length=0 when token_idx is used (with static input shape) - - add new arg tgt_len to _expand_mask because past_key_values_length is no longer valid with token_idx - - use old version of _make_causal_mask to workaround toch.triu that is not supported in Synapse + - add new arg reuse_cache """ + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + for layer in self.h: + layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) + + def update_sincos_cache(self, seq_len): + for layer in self.h: + layer.update_sincos_cache(seq_len) + def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -388,6 +662,8 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -426,7 +702,10 @@ def forward( # Compute alibi tensor: check build_alibi_tensor documentation past_key_values_length = 0 if past_key_values[0] is not None and token_idx is None: - past_key_values_length = past_key_values[0][0].shape[-2] + if 
reuse_cache: + past_key_values_length = past_key_values[0][0][-2] + else: + past_key_values_length = past_key_values[0][0].shape[-2] if self.use_alibi: mask = ( @@ -489,6 +768,7 @@ def forward( attention_mask = _gaudi_prepare_4d_causal_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length ) + else: # 4d mask is passed through the layers attention_mask = _gaudi_prepare_4d_causal_attention_mask( @@ -501,6 +781,7 @@ def forward( # head_mask has shape n_layer x batch x num_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + htcore.mark_step() for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -529,6 +810,8 @@ def forward( output_attentions=output_attentions, alibi=alibi, token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, ) hidden_states = outputs[0] @@ -563,8 +846,16 @@ class GaudiFalconForCausalLM(FalconForCausalLM): - add token_idx and position_ids into model inputs - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + - add new args reuse_cache """ + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + self.transformer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) + self.kv_cache_len = max_seq_len + + def update_sincos_cache(self, seq_len): + self.transformer.update_sincos_cache(seq_len) + def prepare_inputs_for_generation( self, input_ids: torch.LongTensor, @@ -574,6 +865,7 @@ def prepare_inputs_for_generation( token_idx: Optional[torch.Tensor] = None, **kwargs, ) -> dict: + reuse_cache = kwargs.get("reuse_cache") if past_key_values is not None: if token_idx is not None: input_ids = torch.index_select(input_ids, 1, token_idx - 1) @@ -588,6 +880,10 @@ def prepare_inputs_for_generation( 
remove_prefix_length = input_ids.shape[1] - 1 input_ids = input_ids[:, remove_prefix_length:] + elif reuse_cache and token_idx is not None: + # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass + input_ids = input_ids[:, :token_idx] + attention_mask = attention_mask[:, :token_idx] # Note: versions of Falcon with alibi do not use position_ids. It is used with RoPE. if ( @@ -612,6 +908,8 @@ def prepare_inputs_for_generation( "use_cache": kwargs.get("use_cache"), "attention_mask": attention_mask, "token_idx": token_idx, + "reuse_cache": reuse_cache, + "cache_idx": kwargs.get("cache_idx"), } def forward( @@ -628,6 +926,9 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + trim_logits: Optional[bool] = False, + cache_idx: int = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -649,9 +950,18 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, ) hidden_states = transformer_outputs[0] + _, seq_len, _ = hidden_states.shape + if seq_len > 1 and trim_logits and not self.training: + if token_idx is not None: + hidden_states = hidden_states.index_select(1, token_idx - 1) + else: + hidden_states = hidden_states[:, -1:, :] + lm_logits = self.lm_head(hidden_states) loss = None diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py index 72e4b0fa55..4d0f3513d7 100755 --- a/optimum/habana/transformers/models/llama/modeling_llama.py +++ b/optimum/habana/transformers/models/llama/modeling_llama.py @@ -33,9 +33,11 @@ try: from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as 
FusedRMSNorm + + has_fused_rms_norm = True except ImportError: + has_fused_rms_norm = False print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None try: from habana_frameworks.torch.hpex.kernels import FusedSDPA @@ -44,32 +46,13 @@ FusedSDPA = None -def update(prev, cur, dim, idx, inp_seq_len): - orig_cur = cur - if prev.dtype == torch.float8_e4m3fn: - from habana_frameworks.torch.hpex.kernels.Fp8Ops import cast_to_fp8_v2 - - cur = cast_to_fp8_v2(cur, None, False, False, prev.dtype)[0] - if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: - # Initialize - prev[:, :, :inp_seq_len, :].copy_(cur) - return orig_cur - assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" - if idx is not None: - prev.index_copy_(dim, idx - 1, cur) - prev_cast = prev.to(orig_cur.dtype) - return prev_cast - else: - return torch.cat((prev, cur), dim=dim) - - def gaudi_llama_rmsnorm_forward(self, hidden_states): """ Copied from LlamaRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py The only differences are: - override RMSNorm with Habana fused RMSNorm """ - if hidden_states.device.type == "hpu" and FusedRMSNorm: + if hidden_states.device.type == "hpu" and has_fused_rms_norm: # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype if hidden_states.dtype != self.weight.dtype: orig_dtype = hidden_states.dtype @@ -169,11 +152,9 @@ def __init__(self): self.cache = None self.inp_seq_len = -1 - def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape): + def allocate(self, inp_seq_len, dtype, device, shape): if self.cache is None or self.cache.shape != shape: self.inp_seq_len = inp_seq_len - if kv_cache_fp8: - dtype = torch.float8_e4m3fn self.cache = torch.zeros(shape, dtype=dtype, device=device) else: assert ( @@ -181,13 +162,29 @@ def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape): ), f"inp_seq_len must be 
the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" self.cache.fill_(0) + def update(self, prev, cur, dim, idx, inp_seq_len): + orig_cur = cur + if prev.shape == cur.shape: + prev.copy_(cur) + return orig_cur + if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]: + # Initialize + prev[:, :, :inp_seq_len, :].copy_(cur) + return orig_cur + assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" + if idx is not None: + prev.index_copy_(dim, idx - 1, cur) + return prev + else: + return torch.cat((prev, cur), dim=dim) + def get_shape(self): if self.cache is None: return None return self.cache.shape def forward(self, cur, dim, idx): - return update(self.cache, cur, dim, idx, self.inp_seq_len) + return self.update(self.cache, cur, dim, idx, self.inp_seq_len) class GaudiLlamaRotaryEmbedding(torch.nn.Module): @@ -271,12 +268,12 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.inp_seq_len = -1 self.norm_factor = 1.0 / math.sqrt(self.head_dim) - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim) device = self.k_proj.weight.device dtype = self.config.torch_dtype - self.k_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape) - self.v_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape) + self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) + self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) def update_sincos_cache(self, seq_len): # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings @@ -371,14 +368,23 @@ def pre_attn_forward( cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids) - if 
past_key_value is not None or reuse_cache: + if use_cache: # reuse k, v, self_attention if reuse_cache: key_states = self.k_cache(key_states, 2, token_idx) value_states = self.v_cache(value_states, 2, token_idx) + past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) else: - key_states = update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) - value_states = update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) + if past_key_value is None: + past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device) + past_value = torch.zeros( + key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device + ) + past_key_value = (past_key, past_value) + key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len) + value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len) + if token_idx is None: + past_key_value = (key_states, value_states) if cache_idx is not None and q_len == 1: key_states = key_states[:, :, :cache_idx, :] @@ -386,12 +392,6 @@ def pre_attn_forward( if attention_mask is not None: attention_mask = attention_mask[:, :, :, :cache_idx] kv_seq_len = key_states.shape[-2] - - if use_cache: - if reuse_cache: - past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape()) - else: - past_key_value = (key_states.contiguous(), value_states.contiguous()) else: past_key_value = None @@ -473,8 +473,8 @@ def __init__(self, config: LlamaConfig, layer_idx: int): self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): - self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + 
self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) def reorder_kv_cache(self, beam_idx: torch.LongTensor): return self.self_attn.reorder_kv_cache(beam_idx) @@ -629,9 +629,9 @@ def __init__(self, config: LlamaConfig): # Initialize weights and apply final processing self.post_init() - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): for layer in self.layers: - layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) + layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) def reorder_kv_cache(self, beam_idx: torch.LongTensor): return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers) @@ -820,9 +820,8 @@ class GaudiLlamaForCausalLM(LlamaForCausalLM): - add new args reuse_cache """ - def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8): - self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8) - self.kv_cache_len = max_seq_len + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) def reorder_kv_cache(self, beam_idx: torch.LongTensor): return self.model.reorder_kv_cache(beam_idx) diff --git a/optimum/habana/transformers/models/wav2vec2/__init__.py b/optimum/habana/transformers/models/wav2vec2/__init__.py index 3a5bae22b8..84372061b6 100644 --- a/optimum/habana/transformers/models/wav2vec2/__init__.py +++ b/optimum/habana/transformers/models/wav2vec2/__init__.py @@ -5,4 +5,5 @@ gaudi_wav2vec2_encoder_forward, gaudi_wav2vec2_forward, gaudi_wav2vec2_tdnnlayer_forward, + gaudi_wav2vec2forctc_forward, ) diff --git a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py index 983c5b5375..c6dd9cb546 100644 --- a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py +++ 
b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -17,11 +17,23 @@ from typing import Optional, Tuple, Union import torch +from habana_frameworks.torch.hpu import get_device_name from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from transformers.modeling_outputs import ( BaseModelOutput, + CausalLMOutput, Wav2Vec2BaseModelOutput, ) +from transformers.models.wav2vec2.modeling_wav2vec2 import _HIDDEN_STATES_START_POSITION + + +try: + from habana_frameworks.torch.hpex.kernels import CTCLoss + + custom_ctc_loss_fwd = CTCLoss.apply +except ImportError: + print("Could not import Custom CTCLoss kernel. This Kernel is available only for SynapseAI >= 1.15.0") + custom_ctc_loss_fwd = None def _gaudi_wav2vec2_compute_mask_indices( @@ -33,7 +45,8 @@ def _gaudi_wav2vec2_compute_mask_indices( ) -> torch.Tensor: """ Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L135 - The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers). + The only differences are (1) that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers), (2) epsilon is generated on HPU instead of CPU, (3) check + to ensure indices are not larger than sequence length is re-written to avoid host sync. 
""" batch_size, sequence_length = shape @@ -122,8 +135,13 @@ def compute_num_masked_span(input_length): spec_aug_mask_idxs = spec_aug_mask_idxs + offsets # ensure that we cannot have indices larger than sequence_length - if spec_aug_mask_idxs.max() > sequence_length - 1: - spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + if get_device_name() == "GAUDI" or custom_ctc_loss_fwd is None: + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + else: + mask = (spec_aug_mask_idxs > sequence_length - 1) * (spec_aug_mask_idxs.max() > sequence_length - 1) + inverse_mask = torch.bitwise_not(mask) + spec_aug_mask_idxs = spec_aug_mask_idxs * inverse_mask + (sequence_length - 1) * mask # scatter indices to mask spec_aug_mask.scatter_(-1, spec_aug_mask_idxs, 1) @@ -172,6 +190,63 @@ def _gaudi_wav2vec2_sample_negative_indices( return sampled_negative_indices +def gaudi_wav2vec2_forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1282 + The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states.clone(), mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _gaudi_wav2vec2_mask_hidden_states( self, hidden_states: torch.FloatTensor, @@ -300,63 +375,6 @@ def gaudi_wav2vec2_encoder_forward( ) -def gaudi_wav2vec2_forward( - self, - input_values: Optional[torch.Tensor], - attention_mask: Optional[torch.Tensor] = None, - mask_time_indices: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, -) -> Union[Tuple, 
Wav2Vec2BaseModelOutput]: - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1282 - The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - extract_features = self.feature_extractor(input_values) - extract_features = extract_features.transpose(1, 2) - - if attention_mask is not None: - # compute reduced attention_mask corresponding to feature vectors - attention_mask = self._get_feature_vector_attention_mask( - extract_features.shape[1], attention_mask, add_adapter=False - ) - - hidden_states, extract_features = self.feature_projection(extract_features) - hidden_states = self._mask_hidden_states( - hidden_states.clone(), mask_time_indices=mask_time_indices, attention_mask=attention_mask - ) - - encoder_outputs = self.encoder( - hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = encoder_outputs[0] - - if self.adapter is not None: - hidden_states = self.adapter(hidden_states) - - if not return_dict: - return (hidden_states, extract_features) + encoder_outputs[1:] - - return Wav2Vec2BaseModelOutput( - last_hidden_state=hidden_states, - extract_features=extract_features, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ Copied from Transformers: 
https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L2290 @@ -374,3 +392,74 @@ def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch hidden_states = self.activation(hidden_states) return hidden_states + + +def gaudi_wav2vec2forctc_forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, +) -> Union[Tuple, CausalLMOutput]: + """ + copied from Transformers https://github.com/huggingface/transformers/blob/e770f0316d2a9b787c9d1440f204fcb65e176682/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1950 + only differences are (1) attention_mask tensor generation using ones_like is done on HPU, (2) masked_select is not applied on labels to compute flattened_targets to avoid + changing flattened_targets tensor shapes across training iterations. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.lm_head(hidden_states) + loss = None + if labels is not None: + if labels.max() >= self.config.vocab_size: + raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") + # retrieve loss input_lengths from attention_mask + attention_mask = ( + attention_mask + if attention_mask is not None + else torch.ones_like(input_values, dtype=torch.long, device="hpu") + ) + input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long) + # assuming that padded tokens are filled with -100 + # when not being attended to + labels_mask = labels >= 0 + target_lengths = labels_mask.sum(-1) + # ctc_loss doesn't support fp16 + log_probs = torch.nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + if get_device_name() == "GAUDI" or custom_ctc_loss_fwd is None: + flattened_targets = labels.masked_select(labels_mask) + loss = torch.nn.functional.ctc_loss( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + blank=self.config.pad_token_id, + reduction=self.config.ctc_loss_reduction, + zero_infinity=self.config.ctc_zero_infinity, + ) + else: + flattened_targets = labels + loss = custom_ctc_loss_fwd( + log_probs, + flattened_targets, + input_lengths, + target_lengths, + self.config.pad_token_id, + self.config.ctc_loss_reduction, + self.config.ctc_zero_infinity, + ) + + if not return_dict: + output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + return CausalLMOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) 
diff --git a/optimum/habana/utils.py b/optimum/habana/utils.py index a1707c5602..306e619471 100644 --- a/optimum/habana/utils.py +++ b/optimum/habana/utils.py @@ -31,7 +31,7 @@ logger = logging.get_logger(__name__) -CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.14.0") +CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.15.0") def to_device_dtype(my_input: Any, target_device: torch.device = None, target_dtype: torch.dtype = None): diff --git a/tests/baselines/albert_large_v2.json b/tests/baselines/albert_large_v2.json index 62c685b473..2f13722a95 100644 --- a/tests/baselines/albert_large_v2.json +++ b/tests/baselines/albert_large_v2.json @@ -7,9 +7,9 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 32, - "eval_f1": 92.0109, - "train_runtime": 3246.7928, - "train_samples_per_second": 55.517, + "eval_f1": 91.8679, + "train_runtime": 2900.5518, + "train_samples_per_second": 62.298, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +18,9 @@ "multi_card": { "learning_rate": 6e-5, "train_batch_size": 32, - "eval_f1": 92.8155, - "train_runtime": 497.1048, - "train_samples_per_second": 449.321, + "eval_f1": 92.7647, + "train_runtime": 464.9893, + "train_samples_per_second": 494.936, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -37,9 +37,9 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 128, - "eval_f1": 92.6585, - "train_runtime": 659.795, - "train_samples_per_second": 277.916, + "eval_f1": 92.4235, + "train_runtime": 571.138, + "train_samples_per_second": 321.635, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -48,9 +48,9 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 128, - "eval_f1": 91.9053, - "train_runtime": 126.0638, - "train_samples_per_second": 2271.729, + "eval_f1": 92.2111, + "train_runtime": 115.15, + "train_samples_per_second": 2464.403, "extra_arguments": [ "--max_seq_length 384", 
"--use_hpu_graphs_for_inference" diff --git a/tests/baselines/albert_xxlarge_v1.json b/tests/baselines/albert_xxlarge_v1.json index 511344bf52..8efe5d729d 100644 --- a/tests/baselines/albert_xxlarge_v1.json +++ b/tests/baselines/albert_xxlarge_v1.json @@ -18,9 +18,9 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 12, - "eval_f1": 95.1629, - "train_runtime": 1308.2465, - "train_samples_per_second": 75.506, + "eval_f1": 95.1221, + "train_runtime": 1312.9496, + "train_samples_per_second": 75.51, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -48,9 +48,9 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 16, - "eval_f1": 95.0743, - "train_runtime": 218.7903, - "train_samples_per_second": 442.758, + "eval_f1": 95.1227, + "train_runtime": 221.2125, + "train_samples_per_second": 439.114, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" diff --git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/baselines/bert_large_uncased_whole_word_masking.json index 62ea2558b7..d153328e4a 100644 --- a/tests/baselines/bert_large_uncased_whole_word_masking.json +++ b/tests/baselines/bert_large_uncased_whole_word_masking.json @@ -7,9 +7,9 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 24, - "eval_f1": 93.2812, - "train_runtime": 1719.9389, - "train_samples_per_second": 52.696, + "eval_f1": 93.1962, + "train_runtime": 1678.3456, + "train_samples_per_second": 54.101, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +18,9 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 24, - "eval_f1": 93.2092, - "train_runtime": 306.8871, - "train_samples_per_second": 397.041, + "eval_f1": 93.1869, + "train_runtime": 309.9553, + "train_samples_per_second": 398.459, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -35,9 +35,9 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 
0.8968, - "train_runtime": 88.7004, - "train_samples_per_second": 171.252, + "eval_f1": 0.9022, + "train_runtime": 90.3943, + "train_samples_per_second": 172.792, "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -46,9 +46,9 @@ "multi_card": { "learning_rate": 3e-5, "train_batch_size": 16, - "eval_f1": 0.8885, - "train_runtime": 61.8181, - "train_samples_per_second": 893.265, + "eval_f1": 0.8897, + "train_runtime": 64.4986, + "train_samples_per_second": 968.596, "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -65,9 +65,9 @@ "single_card": { "learning_rate": 4e-5, "train_batch_size": 32, - "eval_f1": 93.3512, - "train_runtime": 323.3053, - "train_samples_per_second": 287.096, + "eval_f1": 93.2753, + "train_runtime": 309.9491, + "train_samples_per_second": 302.089, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -76,9 +76,9 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 32, - "eval_f1": 92.9464, - "train_runtime": 77.4588, - "train_samples_per_second": 2178.613, + "eval_f1": 93.0981, + "train_runtime": 78.387, + "train_samples_per_second": 2300.127, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -93,9 +93,9 @@ "single_card": { "learning_rate": 9e-5, "train_batch_size": 256, - "eval_f1": 0.9027, - "train_runtime": 29.8624, - "train_samples_per_second": 1161.008, + "eval_f1": 0.8998, + "train_runtime": 33.2909, + "train_samples_per_second": 1151.598, "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -104,9 +104,9 @@ "multi_card": { "learning_rate": 3e-5, "train_batch_size": 40, - "eval_f1": 0.8601, - "train_runtime": 38.35, - "train_samples_per_second": 2895.6, + "eval_f1": 0.8758, + "train_runtime": 41.4282, + "train_samples_per_second": 2771.405, "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json 
b/tests/baselines/bridgetower_large_itm_mlm_itc.json index c81f437c70..e188228256 100644 --- a/tests/baselines/bridgetower_large_itm_mlm_itc.json +++ b/tests/baselines/bridgetower_large_itm_mlm_itc.json @@ -7,8 +7,8 @@ "multi_card": { "learning_rate": 1e-5, "train_batch_size": 48, - "train_runtime": 300.6945, - "train_samples_per_second": 930.245, + "train_runtime": 314.5877, + "train_samples_per_second": 918.387, "extra_arguments": [ "--dataset_config_name matching", "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6", diff --git a/tests/baselines/clip_roberta.json b/tests/baselines/clip_roberta.json index 50105645f1..b95d98c016 100644 --- a/tests/baselines/clip_roberta.json +++ b/tests/baselines/clip_roberta.json @@ -7,8 +7,8 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 64, - "train_runtime": 304.18, - "train_samples_per_second": 2582.676, + "train_runtime": 314.7726, + "train_samples_per_second": 2560.999, "extra_arguments": [ "--data_dir $PWD/", "--dataset_config_name 2017", diff --git a/tests/baselines/distilbert_base_uncased.json b/tests/baselines/distilbert_base_uncased.json index e9bd14dafd..a85474a073 100644 --- a/tests/baselines/distilbert_base_uncased.json +++ b/tests/baselines/distilbert_base_uncased.json @@ -7,20 +7,20 @@ "single_card": { "learning_rate": 1e-4, "train_batch_size": 48, - "eval_f1": 84.7137, - "train_runtime": 271.2751, - "train_samples_per_second": 334.792, + "eval_f1": 84.5384, + "train_runtime": 264.3669, + "train_samples_per_second": 344.126, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" ] }, "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 4e-4, "train_batch_size": 48, - "eval_f1": 82.8831, - "train_runtime": 54.0269, - "train_samples_per_second": 2500.721, + "eval_f1": 83.0667, + "train_runtime": 54.5344, + "train_samples_per_second": 2503.657, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -37,9 +37,9 @@ "single_card": { 
"learning_rate": 2e-4, "train_batch_size": 64, - "eval_f1": 84.87642669075069, - "train_runtime": 131.655, - "train_samples_per_second": 1377.209, + "eval_f1": 84.5418, + "train_runtime": 108.8333, + "train_samples_per_second": 1676.689, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -48,9 +48,9 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 64, - "eval_f1": 83.27897440376087, - "train_runtime": 25.7792, - "train_samples_per_second": 9951.533, + "eval_f1": 83.2233, + "train_runtime": 24.0441, + "train_samples_per_second": 11144.651, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" diff --git a/tests/baselines/falcon_40b.json b/tests/baselines/falcon_40b.json index 1b2b761907..cb08dc4ed4 100644 --- a/tests/baselines/falcon_40b.json +++ b/tests/baselines/falcon_40b.json @@ -7,9 +7,9 @@ "multi_card": { "learning_rate": 4e-4, "train_batch_size": 1, - "perplexity": 4.0596, - "train_runtime": 944.9201, - "train_samples_per_second": 27.045, + "perplexity": 4.0893, + "train_runtime": 931.1213, + "train_samples_per_second": 28.162, "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 16", diff --git a/tests/baselines/flan_t5_xxl.json b/tests/baselines/flan_t5_xxl.json index 6b3f293f8f..779bc9fd83 100644 --- a/tests/baselines/flan_t5_xxl.json +++ b/tests/baselines/flan_t5_xxl.json @@ -7,9 +7,9 @@ "deepspeed": { "learning_rate": 1e-4, "train_batch_size": 22, - "eval_rougeLsum": 0.0, - "train_runtime": 90.2563, - "train_samples_per_second": 27.175, + "eval_rougeLsum": 0.1429, + "train_runtime": 89.486, + "train_samples_per_second": 27.299, "extra_arguments": [ "--max_steps 10", "--max_eval_samples 880", diff --git a/tests/baselines/gpt2.json b/tests/baselines/gpt2.json index d7f6d8dca6..889bdbd3d4 100644 --- a/tests/baselines/gpt2.json +++ b/tests/baselines/gpt2.json @@ -7,9 +7,9 @@ "single_card": { "learning_rate": 5e-5, "train_batch_size": 4, - "perplexity": 22.2641, - "train_runtime": 
236.4595, - "train_samples_per_second": 20.24, + "perplexity": 22.2751, + "train_runtime": 225.2898, + "train_samples_per_second": 21.308, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -19,9 +19,9 @@ "multi_card": { "learning_rate": 4e-4, "train_batch_size": 4, - "perplexity": 22.2696, - "train_runtime": 72.1582, - "train_samples_per_second": 150.303, + "perplexity": 22.2699, + "train_runtime": 68.9627, + "train_samples_per_second": 156.241, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -39,9 +39,9 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 16, - "perplexity": 21.0687, - "train_runtime": 45.091, - "train_samples_per_second": 118.884, + "perplexity": 21.0729, + "train_runtime": 43.9361, + "train_samples_per_second": 130.785, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" @@ -50,9 +50,9 @@ "multi_card": { "learning_rate": 8e-4, "train_batch_size": 16, - "perplexity": 21.7965, - "train_runtime": 18.9527, - "train_samples_per_second": 847.568, + "perplexity": 21.7858, + "train_runtime": 23.8993, + "train_samples_per_second": 939.24, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" diff --git a/tests/baselines/gpt2_xl.json b/tests/baselines/gpt2_xl.json index 2a5bd96ecf..ffd92331cb 100644 --- a/tests/baselines/gpt2_xl.json +++ b/tests/baselines/gpt2_xl.json @@ -7,9 +7,9 @@ "deepspeed": { "learning_rate": 5e-5, "train_batch_size": 2, - "perplexity": 12.6711, - "train_runtime": 380.1311, - "train_samples_per_second": 16.045, + "perplexity": 12.6744, + "train_runtime": 366.8694, + "train_samples_per_second": 16.464, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -27,9 +27,9 @@ "deepspeed": { "learning_rate": 4e-4, "train_batch_size": 16, - "perplexity": 13.0563, - "train_runtime": 196.3264, - 
"train_samples_per_second": 86.855, + "perplexity": 13.0461, + "train_runtime": 190.696, + "train_samples_per_second": 89.877, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", diff --git a/tests/baselines/gpt_neox_20b.json b/tests/baselines/gpt_neox_20b.json index 165debd4ca..61b27156bf 100644 --- a/tests/baselines/gpt_neox_20b.json +++ b/tests/baselines/gpt_neox_20b.json @@ -8,8 +8,8 @@ "learning_rate": 5e-5, "train_batch_size": 2, "perplexity": 8.0545, - "train_runtime": 745.7237, - "train_samples_per_second": 7.242, + "train_runtime": 721.5428, + "train_samples_per_second": 7.571, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", diff --git a/tests/baselines/llama_7b.json b/tests/baselines/llama_7b.json index 7f05fde3b7..9bff377dbe 100644 --- a/tests/baselines/llama_7b.json +++ b/tests/baselines/llama_7b.json @@ -7,9 +7,9 @@ "multi_card": { "learning_rate": 1e-4, "train_batch_size": 2, - "perplexity": 2.7362, - "train_runtime": 538.1199, - "train_samples_per_second": 20.37, + "perplexity": 2.7542, + "train_runtime": 538.0159, + "train_samples_per_second": 20.397, "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 4", @@ -32,9 +32,9 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 8, - "perplexity": 2.3666, - "train_runtime": 303.8345, - "train_samples_per_second": 144.392, + "perplexity": 2.3665, + "train_runtime": 294.5707, + "train_samples_per_second": 148.093, "extra_arguments": [ "--bf16", "--gradient_accumulation_steps 2", @@ -68,8 +68,8 @@ "learning_rate": 3e-4, "train_batch_size": 8, "perplexity": 2.4259, - "train_runtime": 199.94, - "train_samples_per_second": 88.664, + "train_runtime": 186.2483, + "train_samples_per_second": 93.5, "extra_arguments": [ "--bf16 True", "--gradient_accumulation_steps 2", @@ -89,7 +89,7 @@ "--adam_epsilon 1e-08", "--ddp_bucket_cap_mb 50", "--validation_split_percentage 10", - "--attn_softmax_bf16 True", + 
"--attn_softmax_bf16", "--pipelining_fwd_bwd False", "--fsdp auto_wrap", "--torch_compile_backend hpu_backend", @@ -100,5 +100,4 @@ } } } -} - +} \ No newline at end of file diff --git a/tests/baselines/roberta_base.json b/tests/baselines/roberta_base.json index c6dc95babc..210f608d27 100644 --- a/tests/baselines/roberta_base.json +++ b/tests/baselines/roberta_base.json @@ -7,9 +7,9 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 12, - "eval_f1": 91.8466, - "train_runtime": 610.6291, - "train_samples_per_second": 147.028, + "eval_f1": 91.9903, + "train_runtime": 599.9343, + "train_samples_per_second": 149.781, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +18,9 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 12, - "eval_f1": 91.7635, - "train_runtime": 102.8332, - "train_samples_per_second": 1081.823, + "eval_f1": 91.624, + "train_runtime": 103.5987, + "train_samples_per_second": 1083.304, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -35,9 +35,9 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 24, - "perplexity": 3.5988, - "train_runtime": 41.6183, - "train_samples_per_second": 553.572, + "perplexity": 3.6338, + "train_runtime": 43.1541, + "train_samples_per_second": 554.787, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -55,9 +55,9 @@ "single_card": { "learning_rate": 7e-5, "train_batch_size": 64, - "eval_f1": 91.5167, - "train_runtime": 111.4348, - "train_samples_per_second": 851.971, + "eval_f1": 91.5253, + "train_runtime": 105.6042, + "train_samples_per_second": 907.395, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -66,9 +66,9 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_f1": 90.7807, - "train_runtime": 31.8781, - "train_samples_per_second": 6634.081, + "eval_f1": 90.8766, + "train_runtime": 32.2213, + 
"train_samples_per_second": 6568.625, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -83,9 +83,9 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 32, - "perplexity": 3.6515, - "train_runtime": 12.0388, - "train_samples_per_second": 2754.437, + "perplexity": 3.6691, + "train_runtime": 12.3633, + "train_samples_per_second": 2758.371, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", diff --git a/tests/baselines/roberta_large.json b/tests/baselines/roberta_large.json index 0e82fae0d8..4f1ba4c89d 100644 --- a/tests/baselines/roberta_large.json +++ b/tests/baselines/roberta_large.json @@ -7,9 +7,9 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 12, - "eval_f1": 94.3634, - "train_runtime": 1801.8127, - "train_samples_per_second": 49.895, + "eval_f1": 94.2959, + "train_runtime": 1771.3319, + "train_samples_per_second": 50.815, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -18,9 +18,9 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 12, - "eval_f1": 94.0942, - "train_runtime": 299.6718, - "train_samples_per_second": 364.947, + "eval_f1": 94.2867, + "train_runtime": 304.9084, + "train_samples_per_second": 366.177, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -36,8 +36,8 @@ "learning_rate": 5e-5, "train_batch_size": 8, "perplexity": 2.7851, - "train_runtime": 72.0278, - "train_samples_per_second": 217.107, + "train_runtime": 75.0033, + "train_samples_per_second": 217.752, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -55,9 +55,9 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 94.5763, - "train_runtime": 325.6019, - "train_samples_per_second": 286.78, + "eval_f1": 94.5886, + "train_runtime": 314.4407, + "train_samples_per_second": 300.578, "extra_arguments": [ "--max_seq_length 384", 
"--use_hpu_graphs_for_inference" @@ -66,9 +66,9 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 32, - "eval_f1": 94.0626, - "train_runtime": 76.6936, - "train_samples_per_second": 2242.639, + "eval_f1": 94.4348, + "train_runtime": 79.1007, + "train_samples_per_second": 2280.328, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -83,9 +83,9 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 16, - "perplexity": 2.8312, - "train_runtime": 25.2018, - "train_samples_per_second": 1075.842, + "perplexity": 2.829, + "train_runtime": 25.6323, + "train_samples_per_second": 1183.796, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", diff --git a/tests/baselines/swin_base_patch4_window7_224_in22k.json b/tests/baselines/swin_base_patch4_window7_224_in22k.json index f8f5576d42..b6c09b6dec 100644 --- a/tests/baselines/swin_base_patch4_window7_224_in22k.json +++ b/tests/baselines/swin_base_patch4_window7_224_in22k.json @@ -7,9 +7,9 @@ "single_card": { "learning_rate": 3e-5, "train_batch_size": 64, - "eval_accuracy": 0.9851, - "train_runtime": 249.7865, - "train_samples_per_second": 203.94, + "eval_accuracy": 0.9871, + "train_runtime": 246.4134, + "train_samples_per_second": 212.722, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -24,9 +24,9 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_accuracy": 0.9836, - "train_runtime": 113.9324, - "train_samples_per_second": 1691.705, + "eval_accuracy": 0.9819, + "train_runtime": 117.6424, + "train_samples_per_second": 1683.344, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -49,9 +49,9 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 160, - "eval_accuracy": 0.9845, - "train_runtime": 77.0917, - "train_samples_per_second": 862.671, + "eval_accuracy": 0.9852, + "train_runtime": 73.5918, + "train_samples_per_second": 957.491, 
"extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -66,9 +66,9 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 160, - "eval_accuracy": 0.9824, - "train_runtime": 61.0788, - "train_samples_per_second": 6170.79, + "eval_accuracy": 0.9821, + "train_runtime": 62.9986, + "train_samples_per_second": 6202.525, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", diff --git a/tests/baselines/t5_small.json b/tests/baselines/t5_small.json index ce1dcc588b..ebbb774f89 100644 --- a/tests/baselines/t5_small.json +++ b/tests/baselines/t5_small.json @@ -7,10 +7,10 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 4, - "eval_rougeLsum": 38.6197, - "train_runtime": 1087.1076, - "train_samples_per_second": 268.231, - "eval_samples_per_second": 68.222, + "eval_rougeLsum": 38.5895, + "train_runtime": 1089.366, + "train_samples_per_second": 267.843, + "eval_samples_per_second": 71.913, "extra_arguments": [ "--dataset_config \"3.0.0\"", "--source_prefix \"summarize: \"", @@ -30,9 +30,9 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 16, - "eval_f1": 64.8034, - "train_runtime": 228.8655, - "train_samples_per_second": 1246.527, + "eval_f1": 64.8769, + "train_runtime": 230.6405, + "train_samples_per_second": 1235.893, "extra_arguments": [ "--context_column context", "--question_column question", @@ -57,10 +57,10 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 32, - "eval_rougeLsum": 38.5749, - "train_runtime": 162.5389, - "train_samples_per_second": 1870.707, - "eval_samples_per_second": 78.586, + "eval_rougeLsum": 38.5648, + "train_runtime": 164.962, + "train_samples_per_second": 1912.578, + "eval_samples_per_second": 116.48, "extra_arguments": [ "--dataset_config \"3.0.0\"", "--source_prefix \"summarize: \"", @@ -80,9 +80,9 @@ "multi_card": { "learning_rate": 2e-3, "train_batch_size": 64, - "eval_f1": 66.4991, - "train_runtime": 53.9037, - "train_samples_per_second": 
5710.614, + "eval_f1": 65.7157, + "train_runtime": 49.5816, + "train_samples_per_second": 6353.351, "extra_arguments": [ "--context_column context", "--question_column question", diff --git a/tests/baselines/vit_base_patch16_224_in21k.json b/tests/baselines/vit_base_patch16_224_in21k.json index 3762a6f06c..03cd9f6131 100644 --- a/tests/baselines/vit_base_patch16_224_in21k.json +++ b/tests/baselines/vit_base_patch16_224_in21k.json @@ -7,9 +7,9 @@ "single_card": { "learning_rate": 5e-5, "train_batch_size": 64, - "eval_accuracy": 0.9828, - "train_runtime": 139.4456, - "train_samples_per_second": 349.624, + "eval_accuracy": 0.9812, + "train_runtime": 136.9418, + "train_samples_per_second": 359.584, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -23,9 +23,9 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_accuracy": 0.98, - "train_runtime": 58.345, - "train_samples_per_second": 2509.51, + "eval_accuracy": 0.9803, + "train_runtime": 59.972, + "train_samples_per_second": 2508.955, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -48,9 +48,9 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 96, - "eval_accuracy": 0.9819, - "train_runtime": 53.7091, - "train_samples_per_second": 916.872, + "eval_accuracy": 0.9813, + "train_runtime": 53.4501, + "train_samples_per_second": 931.955, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", @@ -64,9 +64,9 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 96, - "eval_accuracy": 0.9811, - "train_runtime": 23.1594, - "train_samples_per_second": 6528.949, + "eval_accuracy": 0.9775, + "train_runtime": 22.8292, + "train_samples_per_second": 7337.003, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", diff --git a/tests/baselines/wav2vec2_base.json b/tests/baselines/wav2vec2_base.json index 2778c1c036..3927ec4a5b 100644 --- a/tests/baselines/wav2vec2_base.json 
+++ b/tests/baselines/wav2vec2_base.json @@ -7,10 +7,10 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 32, - "eval_accuracy": 0.8045, - "train_runtime": 363.7165, - "train_samples_per_second": 715.004, - "eval_samples_per_second": 312.719, + "eval_accuracy": 0.8013, + "train_runtime": 366.8081, + "train_samples_per_second": 716.385, + "eval_samples_per_second": 329.12, "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", @@ -35,10 +35,10 @@ "multi_card": { "learning_rate": 5e-4, "train_batch_size": 32, - "eval_accuracy": 0.795, - "train_runtime": 109.4142, - "train_samples_per_second": 2962.248, - "eval_samples_per_second": 580.266, + "eval_accuracy": 0.8006, + "train_runtime": 109.2047, + "train_samples_per_second": 3048.207, + "eval_samples_per_second": 631.601, "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", diff --git a/tests/baselines/wav2vec2_large_lv60.json b/tests/baselines/wav2vec2_large_lv60.json index b1071302fa..d645ced656 100644 --- a/tests/baselines/wav2vec2_large_lv60.json +++ b/tests/baselines/wav2vec2_large_lv60.json @@ -7,10 +7,10 @@ "multi_card": { "learning_rate": 6e-4, "train_batch_size": 8, - "eval_wer": 0.0555, - "train_runtime": 889.0079, - "train_samples_per_second": 70.036, - "eval_samples_per_second": 57.302, + "eval_wer": 0.0496, + "train_runtime": 984.3022, + "train_samples_per_second": 63.043, + "eval_samples_per_second": 54.189, "extra_arguments": [ "--dataset_config_name clean", "--train_split_name train.100", @@ -33,12 +33,12 @@ "eval_batch_size": 8, "distribution": { "multi_card": { - "learning_rate": 3e-4, + "learning_rate": 4e-4, "train_batch_size": 8, - "eval_wer": 0.0531535105117017, - "train_runtime": 356.4723, - "train_samples_per_second": 183.245, - "eval_samples_per_second": 158.985, + "eval_wer": 0.06120587068623562, + "train_runtime": 308.8036, + "train_samples_per_second": 225.572, + "eval_samples_per_second": 196.665, "extra_arguments": [ 
"--dataset_config_name clean", "--train_split_name train.100", @@ -49,10 +49,12 @@ "--layerdrop 0.0", "--freeze_feature_encoder", "--dataloader_num_workers 8", - "--chars_to_ignore ',?.!-;:\"“%‘”'" + "--chars_to_ignore ',?.!-;:\"“%‘”'", + "--use_hpu_graphs_for_training", + "--use_hpu_graphs_for_inference" ] } } } } -} +} \ No newline at end of file diff --git a/tests/baselines/whisper_small.json b/tests/baselines/whisper_small.json index 513aeac2fa..42a18efe79 100644 --- a/tests/baselines/whisper_small.json +++ b/tests/baselines/whisper_small.json @@ -7,10 +7,10 @@ "multi_card": { "learning_rate": 1e-4, "train_batch_size": 8, - "eval_wer": 1.352786940708788, - "train_runtime": 532.0875, - "train_samples_per_second": 147.56, - "eval_samples_per_second": 7.683, + "eval_wer": 2.1133, + "train_runtime": 551.3249, + "train_samples_per_second": 145.59, + "eval_samples_per_second": 6.851, "extra_arguments": [ "--dataset_config_name hi", "--language hindi", @@ -41,10 +41,10 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 32, - "eval_wer": 1.2335690515806987, - "train_runtime": 298.2158, - "train_samples_per_second": 284.875, - "eval_samples_per_second": 15.679, + "eval_wer": 0.8477, + "train_runtime": 287.0947, + "train_samples_per_second": 307.526, + "eval_samples_per_second": 12.069, "extra_arguments": [ "--dataset_config_name hi", "--language hindi", diff --git a/tests/example_diff/run_generation.txt b/tests/example_diff/run_generation.txt index 4bc6a80807..d68c09a4c9 100644 --- a/tests/example_diff/run_generation.txt +++ b/tests/example_diff/run_generation.txt @@ -48,7 +48,7 @@ < from transformers.modeling_outputs import CausalLMOutputWithPast --- > from optimum.habana.utils import get_hpu_memory_stats -62,263d42 +62,284d42 < MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop < < MODEL_CLASSES = { @@ -251,7 +251,7 @@ < attentions=None, < ) < return fixed_output -265,287c44,46 +< < def __getattr__(self, item): < return 
getattr(self._default, item) < @@ -272,7 +272,7 @@ < """ < return self._default._reorder_cache(past_key_values, beam_idx) < -< +286,287c44,46 < def main(): < parser = argparse.ArgumentParser() --- @@ -477,56 +477,58 @@ < parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.") < parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.") < parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.") -323d223 +323c224 < parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") -325c225 +--- +> parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8") +325c226 < "--use_cpu", --- -> "--kv_cache_fp8", -327c227 +> "--use_flash_attention", +327c228 < help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available", --- -> help="Store kv-cache in float8 when kv-cache is used. 
Can't use this argument together with QUANT_CONFIG env var", -329c229 +> help="Whether to enable Habana Flash Attention, provided that the model supports it.", +329d229 < parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.") ---- -> parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8") 331c231 < "--fp16", --- -> "--use_flash_attention", -333c233 +> "--torch_compile", +333c233,246 < help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", --- -> help="Whether to enable Habana Flash Attention, provided that the model supports it.", -335,336c235,241 -< parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference") -< args = parser.parse_args() ---- -> parser.add_argument( -> "--torch_compile", -> action="store_true", > help="Whether to use torch compiled model or not.", > ) > parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") > parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") -338,339c243 +> parser.add_argument( +> "--const_serialization_path", +> "--csp", +> type=str, +> help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", +> ) +> parser.add_argument( +> "--disk_offload", +> action="store_true", +> help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", +335d247 +< parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference") +338,341c250,251 < # Initialize the distributed state. 
< distributed_state = PartialState(cpu=args.use_cpu) ---- -> args = parser.parse_args() -341c245,246 +< < logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}") --- > if args.torch_compile: > args.use_hpu_graphs = False -343,344c248,249 +343,344c253,254 < if args.seed is not None: < set_seed(args.seed) --- > if not args.use_hpu_graphs: > args.limit_hpu_graphs = False -346,373c251,256 +346,373c256,257 < # Initialize the model and tokenizer < try: < args.model_type = args.model_type.lower() @@ -557,17 +559,13 @@ < preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text) --- > args.quant_config = os.getenv("QUANT_CONFIG", "") -> if args.quant_config and args.kv_cache_fp8: -> # can't use both quant_config and kv_cache_fp8, since quant_config may trigger kv cache quantization -> # with habana quantization toolkit -> raise parser.error("Can't use QUANT_CONFIG env var with kv_cache_fp8 argument") > return args -375,378d257 +375,378d258 < if model.__class__.__name__ in ["TransfoXLLMHeadModel"]: < tokenizer_kwargs = {"add_space_before_punct_symbol": True} < else: < tokenizer_kwargs = {} -380,386c259,262 +380,386c260,263 < encoded_prompt = tokenizer.encode( < preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs < ) @@ -580,7 +578,7 @@ > parser = argparse.ArgumentParser() > args = setup_parser(parser) > model, tokenizer, generation_config = initialize_model(args, logger) -388,389c264,413 +388,389c265,414 < if encoded_prompt.size()[-1] == 0: < input_ids = None --- @@ -734,7 +732,7 @@ > print(f"Graph compilation duration = {compilation_duration} seconds") > print(separator) > print() -391c415,432 +391c416,433 < input_ids = encoded_prompt --- > # Downloading and loading a dataset from the hub. 
@@ -755,7 +753,7 @@ > .shuffle() > .select(range(args.dataset_max_samples if args.dataset_max_samples > 0 else (raw_dataset[split]).num_rows)) > ) -393,399c434,441 +393,399c435,442 < if args.jit: < jit_input_texts = ["enable jit"] < jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer) @@ -772,7 +770,7 @@ > logger.info( > f"No column name was given so automatically choosing '{column_name}' for prompts. If you would like to use another column of the dataset, you can set the argument `--column_name`." > ) -401,439c443,463 +401,439c444,464 < sig = inspect.signature(model.__call__) < jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None) < traced_model = torch.jit.trace(model, jit_inputs, strict=False) @@ -834,7 +832,7 @@ > preprocess_function, > batched=True, > desc="Running tokenizer on dataset", -440a465,545 +440a466,546 > # After tokenization, we can remove the column of interest > raw_dataset = raw_dataset.remove_columns([column_name]) > raw_dataset.set_format(type="torch") @@ -916,7 +914,7 @@ > > throughput = total_new_tokens_generated / duration > # Print Stats -442,443c547,561 +442,443c548,566 < generated_sequences.append(total_sequence) < print(total_sequence) --- @@ -935,7 +933,11 @@ > print(separator) > if args.quant_config: > import habana_quantization_toolkit -445c563 +> +> habana_quantization_toolkit.finish_measurements(model) +> if args.const_serialization_path and os.path.isdir(args.const_serialization_path): +> import shutil +445c568 < return generated_sequences --- -> habana_quantization_toolkit.finish_measurements(model) +> shutil.rmtree(args.const_serialization_path) diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index c4b5e07a69..25f9b6be7a 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -55,7 +55,7 @@ if os.environ.get("GAUDI2_CI", "0") == "1": - THROUGHPUT_BASELINE_BF16 = 1.016 + THROUGHPUT_BASELINE_BF16 = 1.086 THROUGHPUT_BASELINE_AUTOCAST = 0.394 
TEXTUAL_INVERSION_THROUGHPUT = 104.29806 TEXTUAL_INVERSION_RUNTIME = 114.1344320399221 @@ -64,10 +64,10 @@ else: THROUGHPUT_BASELINE_BF16 = 0.309 THROUGHPUT_BASELINE_AUTOCAST = 0.114 - TEXTUAL_INVERSION_THROUGHPUT = 58.17508958300077 - TEXTUAL_INVERSION_RUNTIME = 202.94231038199996 - CONTROLNET_THROUGHPUT = 44.412012818816905 - CONTROLNET_RUNTIME = 1124.0202105600001 + TEXTUAL_INVERSION_THROUGHPUT = 60.5991479573174 + TEXTUAL_INVERSION_RUNTIME = 196.43840550999994 + CONTROLNET_THROUGHPUT = 44.7278034963213 + CONTROLNET_RUNTIME = 1116.084316640001 _run_custom_bf16_ops_test_ = parse_flag_from_env("CUSTOM_BF16_OPS", default=False) diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py index 532b24da12..5608679470 100644 --- a/tests/test_encoder_decoder.py +++ b/tests/test_encoder_decoder.py @@ -16,15 +16,15 @@ MODELS_TO_TEST = { "summarization": { "bf16": [ - ("facebook/bart-large-cnn", "Habana/bart", 4.691, 26.0688, 2, 1), - ("t5-3b", "Habana/t5", 2.88, 21.56, 2, 1), + ("facebook/bart-large-cnn", "Habana/bart", 5.233, 26.6928, 2, 1), + ("t5-3b", "Habana/t5", 2.955, 21.8877, 2, 1), ], }, "translation": { "bf16": [ - ("Babelscape/mrebel-large", "Habana/t5", 1.41, 0.162, 2, 1), - ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.8, 0.813, 2, 1), - ("facebook/nllb-200-distilled-600M", "Habana/t5", 1.496, 1.2531, 2, 1), + ("Babelscape/mrebel-large", "Habana/t5", 1.323, 0.1618, 2, 1), + ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.815, 0.8132, 2, 1), + ("facebook/nllb-200-distilled-600M", "Habana/t5", 1.401, 1.2599, 2, 1), ], }, } @@ -33,15 +33,15 @@ MODELS_TO_TEST = { "summarization": { "bf16": [ - ("facebook/bart-large-cnn", "Habana/bart", 2.574, 26.5069, 2, 1), - ("t5-3b", "Habana/t5", 0.987, 21.3831, 2, 1), + ("facebook/bart-large-cnn", "Habana/bart", 2.628, 26.7494, 2, 1), + ("t5-3b", "Habana/t5", 1.005, 21.7286, 2, 1), ], }, "translation": { "bf16": [ - ("Babelscape/mrebel-large", "Habana/t5", 1.015, 0.162, 2, 1), - ("Helsinki-NLP/opus-mt-zh-en", 
"Habana/t5", 2.421, 0.7995, 2, 1), - ("facebook/nllb-200-distilled-600M", "Habana/t5", 1.03, 1.2531, 2, 1), + ("Babelscape/mrebel-large", "Habana/t5", 0.995, 0.1784, 2, 1), + ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.409, 0.7995, 2, 1), + ("facebook/nllb-200-distilled-600M", "Habana/t5", 0.998, 1.2457, 2, 1), ], }, } diff --git a/tests/test_examples.py b/tests/test_examples.py old mode 100644 new mode 100755 index 961f8a1ab5..5cf2559f5f --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -276,9 +276,12 @@ def test(self): self.assertEqual(return_code, 0) return elif self.EXAMPLE_NAME == "run_clip": - from .clip_coco_utils import COCO_URLS, create_clip_roberta_model, download_files + if not os.environ.get("DATA_CACHE", "0"): + from .clip_coco_utils import COCO_URLS, download_files + + download_files(COCO_URLS) + from .clip_coco_utils import create_clip_roberta_model - download_files(COCO_URLS) create_clip_roberta_model() self._install_requirements(example_script.parent / "requirements.txt") @@ -322,6 +325,11 @@ def test(self): env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt") env_variables["PT_HPU_LAZY_MODE"] = "0" + extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", []) + + if os.environ.get("DATA_CACHE", "0") and self.EXAMPLE_NAME == "run_clip": + extra_command_line_arguments[0] = "--data_dir {}".format(os.environ.get("DATA_CACHE", "$PWD")) + with TemporaryDirectory() as tmp_dir: cmd_line = self._create_command_line( multi_card, @@ -336,9 +344,7 @@ def test(self): train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"), eval_batch_size=baseline.get("eval_batch_size"), num_epochs=baseline.get("num_train_epochs"), - extra_command_line_arguments=baseline.get("distribution") - .get(distribution) - .get("extra_arguments", []), + extra_command_line_arguments=extra_command_line_arguments, ) p = subprocess.Popen(cmd_line, env=env_variables) @@ -577,6 
+583,7 @@ class MultiCardSpeechRecognitionExampleTester( ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_speech_recognition_ctc", multi_card=True ): TASK_NAME = "regisss/librispeech_asr_for_optimum_habana_ci" + DATASET_NAME = os.environ.get("DATA_CACHE", 0) class MultiCardSummarizationExampleTester( diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py index 29198ae7bd..13de801832 100644 --- a/tests/test_fsdp_examples.py +++ b/tests/test_fsdp_examples.py @@ -10,34 +10,37 @@ from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR -# Gaudi2 CI baselines -# FSDP is not supported on Gaudi1 -MODELS_TO_TEST = { - "bf16": [ - ( - "bert-base-uncased", - "Habana/bert-base-uncased", - 2807, - 85.4688, - "question-answering", - 24, - 8, - "run_qa.py", - "full_shard", - ), - ( - "meta-llama/Llama-2-7b-hf", - "", - 54, - 0.92, - "language-modeling", - 8, - 8, - "run_lora_clm.py", - "auto_wrap", - ), - ], -} +if os.environ.get("GAUDI2_CI", "0") == "1": + # Gaudi2 CI baselines + MODELS_TO_TEST = { + "bf16": [ + ( + "bert-base-uncased", + "Habana/bert-base-uncased", + 3516.322, + 85.5503, + "question-answering", + 24, + 8, + "run_qa.py", + "full_shard", + ), + ( + "meta-llama/Llama-2-7b-hf", + "", + 87.016, + 0.9093, + "language-modeling", + 8, + 8, + "run_lora_clm.py", + "auto_wrap", + ), + ], + } +else: + # FSDP is not supported on Gaudi1 + MODELS_TO_TEST = {"bf16": []} def _test_fsdp( @@ -54,8 +57,6 @@ def _test_fsdp( world_size: int = 8, ): os.environ["PT_HPU_LAZY_MODE"] = "0" - os.environ["PT_HPU_EAGER_4_STAGE_PIPELINE_ENABLE"] = "0" # To be removed later - os.environ["PT_HPU_EAGER_PIPELINE_ENABLE"] = "0" # To be removed later path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" # Install question-answering example requirements diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index af7e9cfab3..8f3da77526 100644 --- a/tests/test_text_generation_example.py +++ 
b/tests/test_text_generation_example.py @@ -14,50 +14,54 @@ # Gaudi2 CI baselines MODELS_TO_TEST = { "bf16": [ - ("bigscience/bloomz-7b1", 130.10463607610703), - ("gpt2-xl", 293.2967921508155), - ("EleutherAI/gpt-j-6b", 157.39646612198123), - ("EleutherAI/gpt-neox-20b", 49.65827341338015), - ("meta-llama/Llama-2-7b-hf", 142.00624811267403), - ("tiiuae/falcon-40b", 25.065388035178792), - ("bigcode/starcoder", 65.50236665863024), - ("Salesforce/codegen2-1B", 456.7740998156863), - ("mosaicml/mpt-30b", 35.64501131267502), - ("mistralai/Mistral-7B-v0.1", 125.26115369093216), - ("mistralai/Mixtral-8x7B-v0.1", 23.78652574031883), - ("microsoft/phi-2", 218.08752713569007), + ("bigscience/bloomz-7b1", 130.0472971205316), + ("gpt2-xl", 281.8734689674413), + ("EleutherAI/gpt-j-6b", 160.5823842101192), + ("EleutherAI/gpt-neox-20b", 50.67672679310354), + ("meta-llama/Llama-2-7b-hf", 141.25776956002076), + ("tiiuae/falcon-40b", 25.202450111088346), + ("bigcode/starcoder", 65.58632640700114), + ("Salesforce/codegen2-1B", 446.4029486883532), + ("mosaicml/mpt-30b", 36.06464336116623), + ("mistralai/Mistral-7B-v0.1", 130.2172236767782), + ("mistralai/Mixtral-8x7B-v0.1", 23.7931001677926), + ("microsoft/phi-2", 224.72307766211117), + ], + "fp8": [ + ("tiiuae/falcon-180B", 52.85086442722326), ], "deepspeed": [ - ("bigscience/bloomz", 36.34664210641816), - ("meta-llama/Llama-2-70b-hf", 61.973950428647164), - ("facebook/opt-66b", 28.16154122335556), + ("bigscience/bloomz", 36.77314954096159), + ("meta-llama/Llama-2-70b-hf", 64.10514998902435), + ("facebook/opt-66b", 28.48069266504111), ], "torch_compile": [ - ("meta-llama/Llama-2-7b-hf", 12.468247401430999), + ("meta-llama/Llama-2-7b-hf", 102.27823420713148), ], "torch_compile_distributed": [ - ("meta-llama/Llama-2-7b-hf", 20.178927030275947), + ("meta-llama/Llama-2-7b-hf", 39.72973199515235), ], } else: # Gaudi1 CI baselines MODELS_TO_TEST = { "bf16": [ - ("bigscience/bloomz-7b1", 41.51855420676164), - ("gpt2-xl", 137.159223188195), + 
("bigscience/bloomz-7b1", 41.7555095197846), + ("gpt2-xl", 142.11481820425706), # TODO: fix OPT 6.7B # ("facebook/opt-6.7b", 0.0), - ("EleutherAI/gpt-j-6b", 50.66146537939035), - ("meta-llama/Llama-2-7b-hf", 44.29688546702468), - ("tiiuae/falcon-7b", 44.217408724737744), - ("bigcode/starcoder", 15.948143541091655), - ("Salesforce/codegen2-1B", 153.79670508220687), - ("mosaicml/mpt-7b", 44.80241777760578), - ("mistralai/Mistral-7B-v0.1", 40.00435417311187), - ("microsoft/phi-2", 90.10751623430603), + ("EleutherAI/gpt-j-6b", 50.79545107991805), + ("meta-llama/Llama-2-7b-hf", 44.39616259946937), + ("tiiuae/falcon-7b", 44.82870145718665), + ("bigcode/starcoder", 15.945023767901013), + ("Salesforce/codegen2-1B", 155.32071248826423), + ("mosaicml/mpt-7b", 45.45168927038262), + ("mistralai/Mistral-7B-v0.1", 41.21906841459711), + ("microsoft/phi-2", 92.53083167241344), ], + "fp8": [], "deepspeed": [ - ("bigscience/bloomz-7b1", 31.044523676681507), + ("bigscience/bloomz-7b1", 31.994268212011505), ], "torch_compile": [], "torch_compile_distributed": [], @@ -71,6 +75,7 @@ def _test_text_generation( deepspeed: bool = False, world_size: int = 8, torch_compile: bool = False, + fp8: bool = False, ): command = ["python3"] path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" @@ -108,6 +113,12 @@ def _test_text_generation( if not deepspeed: command.append("--bf16") + if fp8: + command += [ + "--reuse_cache", + "--trim_logits", + ] + with TemporaryDirectory() as tmp_dir: command.append(f"--output_dir {tmp_dir}") print(f"\n\nCommand to test: {' '.join(command)}\n") @@ -117,6 +128,16 @@ def _test_text_generation( pattern = re.compile(r"([\"\'].+?[\"\'])|\s") command = [x for y in command for x in re.split(pattern, y) if x] + if fp8: + env_variables["QUANT_CONFIG"] = os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_measure_include_outputs.json" + ) + subprocess.run(command, env=env_variables) + env_variables["QUANT_CONFIG"] = 
os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" + ) + command.insert(-1, "--fp8") + proc = subprocess.run(command, env=env_variables) # Ensure the run finished without any issue @@ -140,6 +161,13 @@ def test_text_generation_bf16(model_name: str, baseline: float, token: str): _test_text_generation(model_name, baseline, token) +@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["fp8"]) +def test_text_generation_fp8(model_name: str, baseline: float, token: str): + deepspeed = True if "falcon-180B" in model_name else False + world_size = 8 if "falcon-180B" in model_name else None + _test_text_generation(model_name, baseline, token, deepspeed=deepspeed, world_size=world_size, fp8=True) + + @pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["deepspeed"]) def test_text_generation_deepspeed(model_name: str, baseline: float, token: str): world_size = 2 if "opt-66b" in model_name else 8 diff --git a/tests/transformers/tests/models/falcon/test_modeling_falcon.py b/tests/transformers/tests/models/falcon/test_modeling_falcon.py index 1ab9f84cf9..20e9067f31 100644 --- a/tests/transformers/tests/models/falcon/test_modeling_falcon.py +++ b/tests/transformers/tests/models/falcon/test_modeling_falcon.py @@ -353,7 +353,7 @@ def test_past_key_values_format(self): outputs = model(**inputs) # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format) - if "past_key_values" not in outputs: + if "past_key_values" not in outputs or all(ele is None for ele in outputs["past_key_values"]): return num_hidden_layers = (