src/lighteval/tasks/default_prompts.py (25 changes: 6 additions & 19 deletions)
@@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None):
         query=query,
         choices=choices,
         gold_index=correct_index,
-        target_for_fewshot_sorting=choices,
         instruction=line.get("task_prefix", None),
     )

@@ -196,18 +195,17 @@ def bbh_lighteval(line, task_name: str = None):
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["target_idx"],
-        target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])],
         instruction=line.get("task_prefix", None),
     )


 def bbh(line, instruction, choices, task_name: str = None):
+    is_few_shots = line.get("__few_shots", False)
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",
-        choices=choices,
+        choices=[(" " if is_few_shots else "") + c for c in choices],
         gold_index=choices.index(line["target"]),
-        target_for_fewshot_sorting=[f" {c}" for c in choices],
         instruction=instruction,
     )
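A note on the space prefix added above: when a choice is scored as the continuation of a prompt ending in "A:", it needs a leading space so it tokenizes the way it would inside running text. A minimal sketch of the pattern (illustrative only, not the lighteval code itself; note that gold_index keeps using the original choices, so indices stay valid):

```python
# Sketch: few-shot choices get a leading space so that "A:" + " True"
# renders as "A: True"; evaluation-time choices are left untouched.
def format_choices(choices: list[str], is_few_shots: bool) -> list[str]:
    return [(" " if is_few_shots else "") + c for c in choices]

assert format_choices(["True", "False"], is_few_shots=True) == [" True", " False"]
assert format_choices(["True", "False"], is_few_shots=False) == ["True", "False"]
```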

@@ -799,7 +797,6 @@ def hellaswag_generative(line, task_name: str = None):
         choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
         gold_index=gold_ix,  # -1 for test
         instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
-        target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "",
     )


@@ -1352,7 +1349,6 @@ def mmlu(line, topic, task_name: str = None):
         choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )


@@ -1373,7 +1369,6 @@ def custom_mmlu_thom(line, task_name: str = None):
         choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )


@@ -1613,15 +1608,13 @@ def mmlu_harness(line, task_name: str = None):
     query += "Answer:"

     gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
-    "__few_shots" in line and line["__few_shots"] is True  # We are adding few shots

     return Doc(
         task_name=task_name,
         query=query,
         choices=[" A", " B", " C", " D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )


@@ -1632,14 +1625,14 @@ def mmlu_helm(line, task_name: str = None):
     query += "\nAnswer:"

     gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
+    is_few_shots = line.get("__few_shots", False)  # We are adding few shots

     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" A", " B", " C", " D"],
+        choices=[" A", " B", " C", " D"] if not is_few_shots else line["choices"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=line["choices"][gold_ix],  # specific to HELM evals
Member:
The conversion is incorrect here.
HELM evals actually do something quite odd: the full choice text is used for the few-shot examples, and the letter key is used for the evaluation. (I think we should change it anyway, since it makes no sense, so you'll just need to remove the comment about HELM.)

Contributor Author:
Is it correct now? Choice contents for the few-shot examples, and labels (A, B, C, D) for evaluation.

Contributor Author:
I have now set target_for_fewshot_sorting to the label (A, B, C, D), so few-shot sampling becomes balanced on the labels. Should I keep this, or revert it to the choice content?

     )
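To make the behavior under discussion concrete, here is a hedged sketch of the branch added above (illustrative only; the field names follow the MMLU line format used in the diff):

```python
# Sketch: few-shot documents expose the full choice texts as targets,
# while scored (evaluation) documents expose the letter labels.
def mmlu_helm_choices(line: dict) -> list[str]:
    is_few_shots = line.get("__few_shots", False)
    return [" A", " B", " C", " D"] if not is_few_shots else line["choices"]

fewshot_line = {"__few_shots": True, "choices": ["Paris", "Rome", "Oslo", "Bern"]}
eval_line = {"choices": ["Paris", "Rome", "Oslo", "Bern"]}
assert mmlu_helm_choices(fewshot_line) == ["Paris", "Rome", "Oslo", "Bern"]
assert mmlu_helm_choices(eval_line) == [" A", " B", " C", " D"]
```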


@@ -1816,7 +1809,6 @@ def openbookqa_helm(line, task_name: str = None):
         choices=["A", "B", "C", "D", "E"],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
-        target_for_fewshot_sorting=line["choices"]["text"][gold_ix],  # specific to HELM evals
     )


@@ -1837,14 +1829,13 @@ def piqa_helm(line, task_name: str = None):
     query += "Answer: "

     gold_ix = int(line["label"])
-
+    is_few_shots = line.get("__few_shots", False)
     return Doc(
         task_name=task_name,
         query=query,
-        choices=["A", "B"],
+        choices=["A", "B"] if not is_few_shots else [line["sol1"], line["sol2"]],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
-        target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix],
Member:
It's the same here: you changed the logic.

Contributor Author:
I changed this accordingly.
     )


@@ -1877,13 +1868,11 @@ def pubmed_qa_helm(line, task_name: str = None):
     )
     query += f"\n\nQuestion: {line['question']}\nAnswer: "
     gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])
-
     return Doc(
         task_name=task_name,
         query=query,
         choices=["A", "B", "C"],
         gold_index=gold_ix,
-        target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix],
     )


@@ -2263,13 +2252,11 @@ def truthful_qa_helm(line, task_name: str = None):
     query = f"Question: {line['question']}\n"
     query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
     query += "Answer:"
-
     return Doc(
         task_name=task_name,
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["gold_index"],
-        target_for_fewshot_sorting=line["choices"][line["gold_index"]],
     )


src/lighteval/tasks/lighteval_task.py (15 changes: 0 additions & 15 deletions)
@@ -340,21 +340,6 @@ def eval_docs(self) -> list[Doc]:
         self._docs = self.remove_duplicate_docs(self._docs)
         return self._docs

-    def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
-        """
-        Returns the target of the given document.
-
-        Args:
-            formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.
-
-        Returns:
-            str: Target of the document, which is the correct answer for a document.
-        """
-        # likely we mostly need one example not all
-        return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
-
     def construct_requests(
         self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
     ) -> Dict[RequestType, List[Request]]:
src/lighteval/tasks/prompt_manager.py (12 changes: 4 additions & 8 deletions)
@@ -65,20 +65,18 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple
         )

     @staticmethod
-    def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str:
+    def doc_to_target(formatted_doc: Doc) -> str:
         """
         Returns the target of the given document.

         Args:
             formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.

         Returns:
             str: Target of the document, which is the correct answer for a document.
         """
         # likely we mostly need one example not all
-        return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
+        return as_list(formatted_doc.get_golds())[0]

     def add_context_to_doc(
         self,
@@ -255,9 +253,7 @@ def get_examples(
 class FewShotSelectionMethod:
     sorting: str  # sorting method for the overall few shot pool (balanced, random, sequential)
     with_sampling: bool  # samples item randomly from the few shot pool
-    fewshotpool_unique: (
-        bool
-    )  # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
+    fewshotpool_unique: bool  # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set


 class FewShotSelection(Enum):
@@ -363,7 +359,7 @@ def _init_fewshot_sampling_balanced(
     # Sort by counts of labels
     label_to_instances = defaultdict(list)
     for instance in fewshotpool:
-        target = PromptManager.doc_to_target(instance, few_shot=True)
+        target = PromptManager.doc_to_target(instance)
         label_to_instances[target].append(instance)

     counts_to_labels = defaultdict(list)
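For context on the call site above: _init_fewshot_sampling_balanced buckets the few-shot pool by gold target so that no single label dominates the sampled examples. A simplified, hedged sketch of the balancing idea (the real implementation sorts by label counts; this round-robin version is only an illustration):

```python
from collections import defaultdict

# Simplified sketch of balanced few-shot selection: group the pool by
# gold target, then draw examples round-robin across the labels.
def balanced_fewshot(pool: list[dict], num_shots: int) -> list[dict]:
    label_to_instances = defaultdict(list)
    for instance in pool:
        label_to_instances[instance["target"]].append(instance)
    labels = list(label_to_instances)
    selected, i = [], 0
    while len(selected) < num_shots and any(label_to_instances.values()):
        bucket = label_to_instances[labels[i % len(labels)]]
        if bucket:
            selected.append(bucket.pop(0))
        i += 1
    return selected

pool = [{"target": "A"}] * 5 + [{"target": "B"}] * 5
assert [d["target"] for d in balanced_fewshot(pool, 4)] == ["A", "B", "A", "B"]
```

In the real code the bucketing key comes from PromptManager.doc_to_target(instance), which after this PR is always the first gold string of the pool document.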
src/lighteval/tasks/requests.py (11 changes: 2 additions & 9 deletions)
@@ -178,7 +178,6 @@ class Doc:

     # For few-shot
     instruction: Optional[str] = ""
-    target_for_fewshot_sorting: Optional[str] = None  # will probably have to be removed in the future

     # Filled when parsing and adding the few-shot context
     ctx: Optional[str] = ""
@@ -194,18 +193,12 @@ def __post_init__(self):
         if self.instruction is None:
             self.instruction = ""

-    def get_golds(self, few_shot: bool = False):
+    def get_golds(self):
         """Return gold targets extracted from the target dict"""
         gold_indices = as_list(self.gold_index)
-        if few_shot and self.target_for_fewshot_sorting is not None:
-            choices = self.target_for_fewshot_sorting
-            if isinstance(choices, str):  # correct choice is already selected
-                return choices
-        else:
-            choices = self.choices
         golds = []
         for gold_ix in gold_indices:
-            golds.extend(as_list(choices[gold_ix]))
+            golds.extend(as_list(self.choices[gold_ix]))
         return golds

     def __repr__(self):
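To illustrate the simplified Doc behavior, a minimal self-contained sketch (MiniDoc is a stand-in carrying only the fields get_golds touches, not the real dataclass):

```python
from dataclasses import dataclass
from typing import Union

def as_list(x):
    # Simplified stand-in for lighteval's as_list helper.
    return x if isinstance(x, list) else [x]

@dataclass
class MiniDoc:
    choices: list
    gold_index: Union[int, list]

    def get_golds(self):
        # Single code path: golds always come from `choices`.
        golds = []
        for gold_ix in as_list(self.gold_index):
            golds.extend(as_list(self.choices[gold_ix]))
        return golds

assert MiniDoc(choices=[" A", " B", " C", " D"], gold_index=2).get_golds() == [" C"]
assert MiniDoc(choices=["yes", "no"], gold_index=[0, 1]).get_golds() == ["yes", "no"]
```

PromptManager.doc_to_target then reduces to as_list(doc.get_golds())[0]: the first gold answer, with no few-shot special case.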