src/lighteval/tasks/default_prompts.py (25 changes: 6 additions & 19 deletions)
@@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None):
         query=query,
         choices=choices,
         gold_index=correct_index,
-        target_for_fewshot_sorting=choices,
         instruction=line.get("task_prefix", None),
     )

@@ -196,18 +195,17 @@ def bbh_lighteval(line, task_name: str = None):
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["target_idx"],
-        target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])],
         instruction=line.get("task_prefix", None),
     )


 def bbh(line, instruction, choices, task_name: str = None):
+    is_few_shots = line.get("__few_shots", False)
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",
-        choices=choices,
+        choices=[(" " if is_few_shots else "") + c for c in choices],
         gold_index=choices.index(line["target"]),
-        target_for_fewshot_sorting=[f" {c}" for c in choices],
         instruction=instruction,
     )
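A note on the space prefix added above: when a choice is scored as the continuation of a prompt ending in "A:", it needs a leading space so it tokenizes the way it would inside running text. A minimal sketch of the pattern (illustrative only, not the lighteval code itself; note that gold_index keeps using the original choices, so indices stay valid):

```python
# Sketch: few-shot choices get a leading space so that "A:" + " True"
# renders as "A: True"; evaluation-time choices are left untouched.
def format_choices(choices: list[str], is_few_shots: bool) -> list[str]:
    return [(" " if is_few_shots else "") + c for c in choices]

assert format_choices(["True", "False"], is_few_shots=True) == [" True", " False"]
assert format_choices(["True", "False"], is_few_shots=False) == ["True", "False"]
```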

@@ -799,7 +797,6 @@ def hellaswag_generative(line, task_name: str = None):
         choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
         gold_index=gold_ix,  # -1 for test
         instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
-        target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "",
     )


@@ -1352,7 +1349,6 @@ def mmlu(line, topic, task_name: str = None):
         choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )


@@ -1373,7 +1369,6 @@ def custom_mmlu_thom(line, task_name: str = None):
         choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )


@@ -1613,15 +1608,13 @@ def mmlu_harness(line, task_name: str = None):
     query += "Answer:"

     gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
-    "__few_shots" in line and line["__few_shots"] is True  # We are adding few shots

     return Doc(
         task_name=task_name,
         query=query,
         choices=[" A", " B", " C", " D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )


@@ -1632,14 +1625,14 @@ def mmlu_helm(line, task_name: str = None):
     query += "\nAnswer:"

     gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
+    is_few_shots = line.get("__few_shots", False)  # We are adding few shots

     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" A", " B", " C", " D"],
+        choices=[" A", " B", " C", " D"] if not is_few_shots else line["choices"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=line["choices"][gold_ix],  # specific to HELM evals
Member:
The conversion is incorrect here.
HELM evals actually do something quite odd: the full choice text is used for the few-shot examples, and the letter key is used for the evaluation. (I think we should change it anyway, since it makes no sense, so you'll just need to remove the comment about HELM.)

Contributor Author:
Is it correct now? Choice contents for the few-shot examples, and labels (A, B, C, D) for evaluation.

Contributor Author:
I have now set target_for_fewshot_sorting to the label (A, B, C, D), so few-shot sampling becomes balanced on the labels. Should I keep this, or revert it to the choice content?

     )
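To make the behavior under discussion concrete, here is a hedged sketch of the branch added above (illustrative only; the field names follow the MMLU line format used in the diff):

```python
# Sketch: few-shot documents expose the full choice texts as targets,
# while scored (evaluation) documents expose the letter labels.
def mmlu_helm_choices(line: dict) -> list[str]:
    is_few_shots = line.get("__few_shots", False)
    return [" A", " B", " C", " D"] if not is_few_shots else line["choices"]

fewshot_line = {"__few_shots": True, "choices": ["Paris", "Rome", "Oslo", "Bern"]}
eval_line = {"choices": ["Paris", "Rome", "Oslo", "Bern"]}
assert mmlu_helm_choices(fewshot_line) == ["Paris", "Rome", "Oslo", "Bern"]
assert mmlu_helm_choices(eval_line) == [" A", " B", " C", " D"]
```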


@@ -1816,7 +1809,6 @@ def openbookqa_helm(line, task_name: str = None):
         choices=["A", "B", "C", "D", "E"],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
-        target_for_fewshot_sorting=line["choices"]["text"][gold_ix],  # specific to HELM evals
     )


@@ -1837,14 +1829,13 @@ def piqa_helm(line, task_name: str = None):
     query += "Answer: "

     gold_ix = int(line["label"])
-
+    is_few_shots = line.get("__few_shots", False)
     return Doc(
         task_name=task_name,
         query=query,
-        choices=["A", "B"],
+        choices=["A", "B"] if not is_few_shots else [line["sol1"], line["sol2"]],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
-        target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix],
Member:
It's the same here: you changed the logic.

Contributor Author:
I changed this accordingly.
     )


@@ -1877,13 +1868,11 @@ def pubmed_qa_helm(line, task_name: str = None):
     )
     query += f"\n\nQuestion: {line['question']}\nAnswer: "
     gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])
-
     return Doc(
         task_name=task_name,
         query=query,
         choices=["A", "B", "C"],
         gold_index=gold_ix,
-        target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix],
     )


@@ -2263,13 +2252,11 @@ def truthful_qa_helm(line, task_name: str = None):
     query = f"Question: {line['question']}\n"
     query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
     query += "Answer:"
-
     return Doc(
         task_name=task_name,
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["gold_index"],
-        target_for_fewshot_sorting=line["choices"][line["gold_index"]],
     )


src/lighteval/tasks/lighteval_task.py (15 changes: 0 additions & 15 deletions)
@@ -340,21 +340,6 @@ def eval_docs(self) -> list[Doc]:
         self._docs = self.remove_duplicate_docs(self._docs)
         return self._docs

-    def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
-        """
-        Returns the target of the given document.
-
-        Args:
-            formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.
-
-        Returns:
-            str: Target of the document, which is the correct answer for a document.
-        """
-        # likely we mostly need one example not all
-        return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
-
     def construct_requests(
         self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
     ) -> Dict[RequestType, List[Request]]:
src/lighteval/tasks/prompt_manager.py (12 changes: 4 additions & 8 deletions)
@@ -65,20 +65,18 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple
         )

     @staticmethod
-    def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str:
+    def doc_to_target(formatted_doc: Doc) -> str:
         """
         Returns the target of the given document.

         Args:
             formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.

         Returns:
             str: Target of the document, which is the correct answer for a document.
         """
         # likely we mostly need one example not all
-        return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
+        return as_list(formatted_doc.get_golds())[0]

     def add_context_to_doc(
         self,
@@ -255,9 +253,7 @@ def get_examples(
 class FewShotSelectionMethod:
     sorting: str  # sorting method for the overall few shot pool (balanced, random, sequential)
     with_sampling: bool  # samples item randomly from the few shot pool
-    fewshotpool_unique: (
-        bool
-    )  # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
+    fewshotpool_unique: bool  # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set


 class FewShotSelection(Enum):
@@ -363,7 +359,7 @@ def _init_fewshot_sampling_balanced(
     # Sort by counts of labels
     label_to_instances = defaultdict(list)
     for instance in fewshotpool:
-        target = PromptManager.doc_to_target(instance, few_shot=True)
+        target = PromptManager.doc_to_target(instance)
         label_to_instances[target].append(instance)

     counts_to_labels = defaultdict(list)
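For context on the call site above: _init_fewshot_sampling_balanced buckets the few-shot pool by gold target so that no single label dominates the sampled examples. A simplified, hedged sketch of the balancing idea (the real implementation sorts by label counts; this round-robin version is only an illustration):

```python
from collections import defaultdict

# Simplified sketch of balanced few-shot selection: group the pool by
# gold target, then draw examples round-robin across the labels.
def balanced_fewshot(pool: list[dict], num_shots: int) -> list[dict]:
    label_to_instances = defaultdict(list)
    for instance in pool:
        label_to_instances[instance["target"]].append(instance)
    labels = list(label_to_instances)
    selected, i = [], 0
    while len(selected) < num_shots and any(label_to_instances.values()):
        bucket = label_to_instances[labels[i % len(labels)]]
        if bucket:
            selected.append(bucket.pop(0))
        i += 1
    return selected

pool = [{"target": "A"}] * 5 + [{"target": "B"}] * 5
assert [d["target"] for d in balanced_fewshot(pool, 4)] == ["A", "B", "A", "B"]
```

In the real code the bucketing key comes from PromptManager.doc_to_target(instance), which after this PR is always the first gold string of the pool document.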
src/lighteval/tasks/requests.py (11 changes: 2 additions & 9 deletions)
@@ -178,7 +178,6 @@ class Doc:

     # For few-shot
     instruction: Optional[str] = ""
-    target_for_fewshot_sorting: Optional[str] = None  # will probably have to be removed in the future

     # Filled when parsing and adding the few-shot context
     ctx: Optional[str] = ""
@@ -194,18 +193,12 @@ def __post_init__(self):
         if self.instruction is None:
             self.instruction = ""

-    def get_golds(self, few_shot: bool = False):
+    def get_golds(self):
         """Return gold targets extracted from the target dict"""
         gold_indices = as_list(self.gold_index)
-        if few_shot and self.target_for_fewshot_sorting is not None:
-            choices = self.target_for_fewshot_sorting
-            if isinstance(choices, str):  # correct choice is already selected
-                return choices
-        else:
-            choices = self.choices
         golds = []
         for gold_ix in gold_indices:
-            golds.extend(as_list(choices[gold_ix]))
+            golds.extend(as_list(self.choices[gold_ix]))
         return golds

     def __repr__(self):
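To illustrate the simplified Doc behavior, a minimal self-contained sketch (MiniDoc is a stand-in carrying only the fields get_golds touches, not the real dataclass):

```python
from dataclasses import dataclass
from typing import Union

def as_list(x):
    # Simplified stand-in for lighteval's as_list helper.
    return x if isinstance(x, list) else [x]

@dataclass
class MiniDoc:
    choices: list
    gold_index: Union[int, list]

    def get_golds(self):
        # Single code path: golds always come from `choices`.
        golds = []
        for gold_ix in as_list(self.gold_index):
            golds.extend(as_list(self.choices[gold_ix]))
        return golds

assert MiniDoc(choices=[" A", " B", " C", " D"], gold_index=2).get_golds() == [" C"]
assert MiniDoc(choices=["yes", "no"], gold_index=[0, 1]).get_golds() == ["yes", "no"]
```

PromptManager.doc_to_target then reduces to as_list(doc.get_golds())[0]: the first gold answer, with no few-shot special case.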