diff --git a/examples/scripts/evals/generate_tldr.py b/examples/scripts/evals/generate_tldr.py
index 5bb44e23797..41bf5dbc1b8 100644
--- a/examples/scripts/evals/generate_tldr.py
+++ b/examples/scripts/evals/generate_tldr.py
@@ -8,7 +8,7 @@
 from datasets import load_dataset
 from gpt_tldr_judge import LLMJudgeConfig, llm_judge
 from transformers import AutoTokenizer, HfArgumentParser
-from vllm import SamplingParams, SingleGPULLM
+from vllm import LLM, SamplingParams
 
 
 """
@@ -28,6 +28,7 @@ class Args:
     output_path: str
     model_name_or_path: str
     model_revision: str = "main"
+    judge_model: str = "gpt-3.5-turbo-0125"
     n: int = 1000
@@ -50,11 +51,11 @@ def run_command(command: str):
 prompts = prompts[: args.n]
 reference_summaries = [message[-1]["content"] for message in raw_datasets["test"]["messages"]]
 sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=MAX_TOKENS)
-llm = SingleGPULLM(
+llm = LLM(
     model=args.model_name_or_path,
     revision=args.model_revision,
+    tokenizer_revision=args.model_revision,
     tensor_parallel_size=1,
-    device="cuda:0",
 )
 outputs = llm.generate(prompts, sampling_params)
 table = defaultdict(list)
@@ -82,8 +83,13 @@ def run_command(command: str):
 judged_df = llm_judge(
     LLMJudgeConfig(
         n=args.n,
-        model="gpt-3.5-turbo-0125",
+        model=args.judge_model,
     ),
     df,
 )
+judged_df.rename(columns={"response0": "model_response", "response1": "reference_response"}, inplace=True)
+print(judged_df["preferred"].value_counts())
+# print percentage
+print(judged_df["preferred"].value_counts(normalize=True))
+
 judged_df.to_csv(args.output_path.replace(".csv", "_judged.csv"))
diff --git a/examples/scripts/evals/gpt_tldr_judge.py b/examples/scripts/evals/gpt_tldr_judge.py
index 8e153fe25db..4d1f1fa0bbc 100644
--- a/examples/scripts/evals/gpt_tldr_judge.py
+++ b/examples/scripts/evals/gpt_tldr_judge.py
@@ -124,7 +124,6 @@ async def main(ljc: LLMJudgeConfig, df: pd.DataFrame):
                 else "response1"
             )
             df.at[i, "preferred"] = preferred_label
-            print(df["preferred"].value_counts())
         return df
 
     return asyncio.run(main(ljc, df))
@@ -138,4 +137,5 @@ async def main(ljc: LLMJudgeConfig, df: pd.DataFrame):
     df["response0"] = df["model_response"].map(lambda x: x.strip())
     df["response1"] = df["reference_response"].map(lambda x: x.strip())
     judge_df = llm_judge(ljc, df)
+    print(judge_df["preferred"].value_counts())
     judge_df.to_csv(args.output_path)
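
For reference, a minimal sketch (not part of the patch) of how the updated pieces are meant to fit together: generation through vLLM's stock LLM class pinned to a single revision, then the GPT judge from gpt_tldr_judge.py, then the new win-rate printout. Model name, prompts, references, and the prompt column name are placeholders; LLM/SamplingParams come from vLLM, and LLMJudgeConfig/llm_judge come from gpt_tldr_judge.py in this PR.

# Illustrative only -- mirrors the post-patch flow of generate_tldr.py with toy inputs.
import pandas as pd

from gpt_tldr_judge import LLMJudgeConfig, llm_judge
from vllm import LLM, SamplingParams

prompts = ["SUBREDDIT: r/test\nPOST: ...\nTL;DR:"]            # placeholder prompts
reference_summaries = ["a human-written reference summary"]   # placeholder references

sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=200)
llm = LLM(
    model="your-org/your-sft-model",   # placeholder; args.model_name_or_path in the script
    revision="main",
    tokenizer_revision="main",         # keep the tokenizer pinned to the same revision
    tensor_parallel_size=1,
)
outputs = llm.generate(prompts, sampling_params)
model_responses = [output.outputs[0].text for output in outputs]

# Column names other than response0/response1 are illustrative; the script builds its
# dataframe from the generated table before calling llm_judge.
df = pd.DataFrame(
    {
        "prompt": prompts,
        "response0": [r.strip() for r in model_responses],
        "response1": [r.strip() for r in reference_summaries],
    }
)
judged_df = llm_judge(LLMJudgeConfig(n=len(df), model="gpt-3.5-turbo-0125"), df)
print(judged_df["preferred"].value_counts(normalize=True))  # share of rows preferring each side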