diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index d74d308723..3e83584322 100644
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -254,7 +254,10 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py
 --use_hpu_graphs \
 --trim_logits \
 --use_kv_cache \
---reuse_cache \
+--bucket_size=128 \
+--bucket_internal \
+--use_flash_attention \
+--flash_attention_recompute \
 --bf16 \
 --batch_size 1
 ```
@@ -269,7 +272,10 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
 --use_hpu_graphs \
 --trim_logits \
 --use_kv_cache \
---reuse_cache \
+--bucket_size=128 \
+--bucket_internal \
+--use_flash_attention \
+--flash_attention_recompute \
 --bf16 \
 --batch_size 1 \
 ```
@@ -284,8 +290,10 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
 --trim_logits \
 --use_kv_cache \
 --reuse_cache \
+--use_flash_attention \
+--flash_attention_recompute \
 --bf16 \
---batch_size 277 \
+--batch_size 350 \
 --max_new_tokens 2048 \
 --max_input_tokens 2048 \
 --limit_hpu_graphs \