From baf3b492f02efa21ae2495751a217a24aaafa060 Mon Sep 17 00:00:00 2001 From: Costa Huang Date: Tue, 25 Jun 2024 19:55:05 -0400 Subject: [PATCH] Remove the leading space in the tldr preference dataset --- examples/datasets/tldr_preference.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/datasets/tldr_preference.py b/examples/datasets/tldr_preference.py index f7c02a8a97b..7c1af328a51 100644 --- a/examples/datasets/tldr_preference.py +++ b/examples/datasets/tldr_preference.py @@ -63,8 +63,9 @@ def process(row): format_str = cnndm_format_str if row["batch"] in cnndm_batches else tldr_format_str row["prompt"] = format_str.format(**row["info"]) choice = row["choice"] - chosen = row["summaries"][choice]["text"] - rejected = row["summaries"][1 - choice]["text"] + # need to remove the leading space + chosen = row["summaries"][choice]["text"].strip() + rejected = row["summaries"][1 - choice]["text"].strip() row["chosen"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": chosen}] row["rejected"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": rejected}] return row