opea-project · lkk12014402 · Oct 9, 2024
@@ -63,6 +63,9 @@ To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi-
 # please set your llm_port and hf_token
 
 docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2
+
+# for better performance, set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `max-batch-total-tokens`, `max-batch-prefill-tokens`
+docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048
 ```
 
 ### Prepare Dataset
@@ -138,6 +141,9 @@ If you are using docker compose to deploy RAG system, you can simply run the eva
 ```bash
 cd examples
 python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --ingest_docs
+
+# if you want to get ragas metrics
+python eval_crud.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs  --contain_original_data --llm_endpoint "http://{llm_as_judge_ip}:{llm_as_judge_port}"  --ragas_metrics
 ```
 
 If you are using Kubernetes manifest/helm to deploy RAG system, you must specify more arguments as following: