from lighteval.metrics.normalizations import LogProbTokenNorm
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.templates.copa import get_copa_prompt_function
+from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
from lighteval.tasks.templates.nli import get_nli_prompt_function
from lighteval.tasks.templates.utils.formulation import (
    CFFormulation,

...

        ),
        hf_repo="ai4bharat/IndicCOPA",
        hf_subset=f"translation-{standardize_tag(language.value)}",
+        # Since we use trust_dataset, we have to be careful about what is inside the dataset
+        # script. We thus lock the revision to ensure that the script doesn't change.
+        hf_revision="d356ef19a4eb287e88a51d07a56b73ba88c7f188",
        evaluation_splits=["test"],
        metric=[
            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
...

]


+# ------------------------------- Hellaswag Tasks ------------------------------- #
+# Hellaswag is a commonsense reasoning task that requires models to complete a given scenario
+# with the most plausible ending. It tests the model's ability to understand and reason about
+# everyday situations and human behavior.
+
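+# A note on the three formulations used below (our gloss of lighteval's formulation helpers,
+# not stated in this file): MCF presents the endings as lettered multiple-choice options,
+# CF scores each ending directly as a continuation of the context, and Hybrid lists the
+# options while still scoring the continuation text.
+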
+# MLMM-Hellaswag: Multilingual adaptation of Hellaswag
+# Paper: https://arxiv.org/abs/2306.07610
+# This is a multilingual version of Hellaswag, part of the MLMM (Massive Language Model Meta-Evaluation) benchmark.
+# It evaluates commonsense reasoning abilities across multiple languages.
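+# Illustrative note (field values are hypothetical): each dataset row carries the fields read by
+# the adapter, roughly {"ctx_a": "...", "ctx_b": "...", "endings": ["...", ...], "label": "2"};
+# the adapter renames them to the template's keys and casts the string label to an integer gold index.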
+mlmm_hellaswag_tasks = [
+    LightevalTaskConfig(
+        name=f"hellaswag_{lang.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=lang,
+            adapter=lambda line: {
+                # We don't use activity_label as it is not available
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+        ),
+        hf_repo="jon-tow/okapi_hellaswag",
+        hf_subset=standardize_tag(lang.value),
+        # Since we use trust_dataset, we have to be careful about what is inside the dataset
+        # script. We thus lock the revision to ensure that the script doesn't change.
+        hf_revision="96ed8e0dfc6172dad1d3df338d7b8ba6c1ff9d83",
+        evaluation_splits=["validation"],
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+        trust_dataset=True,
+    )
+    for lang in [
+        Language.ARABIC,
+        Language.BENGALI,
+        Language.CATALAN,
+        Language.DANISH,
+        Language.GERMAN,
+        Language.SPANISH,
+        Language.BASQUE,
+        Language.FRENCH,
+        Language.GUJARATI,
+        Language.HINDI,
+        Language.CROATIAN,
+        Language.HUNGARIAN,
+        Language.ARMENIAN,
+        Language.INDONESIAN,
+        Language.ICELANDIC,
+        Language.ITALIAN,
+        Language.KANNADA,
+        Language.MALAYALAM,
+        Language.MARATHI,
+        Language.NORWEGIAN,
+        Language.NEPALI,
+        Language.DUTCH,
+        Language.PORTUGUESE,
+        Language.ROMANIAN,
+        Language.RUSSIAN,
+        Language.SLOVAK,
+        Language.SERBIAN,
+        Language.SWEDISH,
+        Language.TAMIL,
+        Language.TELUGU,
+        Language.UKRAINIAN,
+        Language.VIETNAMESE,
+        Language.CHINESE,
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Hellaswag Turkish
+# This is a Turkish adaptation of the Hellaswag task.
+# While there's no specific paper for this version, it has been found to work well for evaluating
+# Turkish language models on commonsense reasoning tasks.
+
+# We don't handle these in a single task, as there are quite a lot of differences
+# (dataset/subset, dot replacement, etc.) that would make the code hard to read.
+hellaswag_tur_tasks = [
+    LightevalTaskConfig(
+        name=f"community_hellaswag_{Language.TURKISH.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=Language.TURKISH,
+            adapter=lambda line: {
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+            # https://github.com/malhajar17/lm-evaluation-harness_turkish/blob/main/lm_eval/tasks/hellaswag_tr-v0.2/utils.py
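+            # These strings are presumably leftover WikiHow section tags in the source data
+            # ("[başlık]" and "[adım]" are Turkish for "[title]" and "[step]"); listing them here
+            # lets the prompt template normalize them out of the context before rendering.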
+            wikihow_artifacts=[" [title]", " [başlık]", " [adım]", " [header]"],
+        ),
+        hf_repo="malhajar/hellaswag_tr-v0.2",
+        hf_subset="default",
+        evaluation_splits=["validation"],
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
+# Hellaswag Thai
+# This is a Thai adaptation of the Hellaswag task.
+# As with the Turkish version, there's no specific paper, but it has been found to be effective
+# for evaluating Thai language models on commonsense reasoning tasks.
+hellaswag_tha_tasks = [
+    LightevalTaskConfig(
+        name=f"community_hellaswag_{Language.THAI.value}_{formulation.name.lower()}",
+        suite=["lighteval"],
+        prompt_function=get_hellaswag_prompt_function(
+            language=Language.THAI,
+            adapter=lambda line: {
+                "ctx_a": line["ctx_a"],
+                "ctx_b": line["ctx_b"],
+                "continuations": line["endings"],
+                "gold_idx": int(line["label"]),
+            },
+            formulation=formulation,
+        ),
+        hf_repo="HuggingFaceFW-Dev/hellaswag_thai",
+        hf_subset="default",
+        evaluation_splits=["validation"],
+        few_shots_split="train",
+        metric=[
+            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+        ],
+    )
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+
TASKS_TABLE = [
    *xnli_tasks,
    *xnli2_tasks,

...

    *xcopa_tasks,
    *copa_indic_tasks,
    *parus_tasks,
+    *mlmm_hellaswag_tasks,
+    *hellaswag_tur_tasks,
+    *hellaswag_tha_tasks,
]