Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/lighteval/metrics/utils/extractive_match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def lazy_expr_regex(expr_config: ExprExtractionConfig, language: Language) -> li

currency_units = re.escape("$€£¥₹₽₪₩₫฿₡₢₣₤₥₦₧₨₩₪₫₭₮₯₰₱₲₳₴₵₶₷₸₹₺₻₼₽₾₿")
expr_prefix_re = rf"(?:^|{space_re}|\=)(?:\*\*)?"
expr_suffix_re = rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|{colon_re}|{space_re}|\)|\$|$)"
expr_suffix_re = rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|\s?{colon_re}|{space_re}|\)|\$|$)"
# Expressions must be both prefixed and suffixed, while digits don't need a suffix and can be preceded by currency units. This ensures
# that we can extract things like $100 or 100m2, while not extracting XDY2K as 2.
expr_with_anchors = rf"(?:{expr_prefix_re}{expr_re}{expr_suffix_re})"
Expand All @@ -147,7 +147,7 @@ def lazy_expr_regex(expr_config: ExprExtractionConfig, language: Language) -> li
answer_prefix_re = rf"(?i:{translation_literal.answer})"

# Match after the last equals sign following the answer word - requires the number pattern.
equals_re_colon = rf"{answer_prefix_re}{colon_re}(?:.{{0,100}}=\s*|.{{0,50}}?){expr_or_number}(?!\s*=)"
equals_re_colon = rf"{answer_prefix_re}\s?{colon_re}(?:.{{0,100}}=\s*|.{{0,50}}?){expr_or_number}(?!\s*=)"
equals_re = rf"{answer_prefix_re}(?:.{{0,100}}=\s*|.{{0,50}}?){expr_or_number}(?!\s*=)"
regexes.extend([(equals_re_colon, 100), (equals_re, 200)])

Expand Down Expand Up @@ -252,7 +252,7 @@ def lazy_latex_regex(latex_config: LatexExtractionConfig, language: Language) ->
regexes.append((final_answer_prefixed_just_is, 50))

# Match with answer word - higher priority than plain latex
answer_re_colon = f"{answer_prefix_re}{colon_re}.{{0,50}}?{latex_re}"
answer_re_colon = rf"{answer_prefix_re}\s?{colon_re}.{{0,50}}?{latex_re}"
answer_re = f"{answer_prefix_re}.{{0,50}}?{latex_re}"

regexes.extend([(answer_re_colon, 100), (answer_re, 200)])
Expand Down Expand Up @@ -298,7 +298,7 @@ def lazy_indices_regex(
space_re = re.escape(translation_literal.sentence_space)

answer_prefix_re = rf"(?:^|{space_re})(?:\*\*)?"
answer_suffix_re = rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|{colon_re}|{space_re}|$)"
answer_suffix_re = rf"(?:\*\*)?(?:{full_stop_re}|{comma_re}|\s?{colon_re}|{space_re}|$)"
answer_re = f"{answer_prefix_re}{indice_str_re}{answer_suffix_re}"
answer_re_start = rf"^(?:\*\*)?{indice_str_re}{answer_suffix_re}"
answer_re_line_start = rf"\n(?:\*\*)?{indice_str_re}{answer_suffix_re}"
Expand All @@ -321,7 +321,7 @@ def lazy_indices_regex(
regexes.extend(
[
# Most specific patterns first
(f"{answer_word}{colon_re}.{{0,50}}?{answer_re}", 100),
(rf"{answer_word}\s?{colon_re}.{{0,50}}?{answer_re}", 100),
# Answer word patterns
(f"{answer_word}.{{0,50}}?{answer_re}", 150),
# Start of the string
Expand Down
Loading