From 79e38a792feec599d2dfa24a0ae8940f4da424a8 Mon Sep 17 00:00:00 2001
From: Matt Post <post@cs.jhu.edu>
Date: Fri, 3 Jan 2025 15:07:05 -0500
Subject: [PATCH] Add WMT 2024 test sets (#276)

---
 CHANGELOG.md                  | 136 ++++++++++++++++++----------------
 mypy.ini                      |   2 +-
 sacrebleu/dataset/__init__.py |  20 +++++
 3 files changed, 95 insertions(+), 63 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d08d9e3..a6c78e7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,23 +1,35 @@
 # Release Notes
 
-- 2.4.2 (2024-04-12)
+## 2.5.0 (2025-01-03)
+
+### Added
+
+- WMT24 test sets (test set `wmt24`)
+
+### Fixed
+
+- Convert changelog to Markdown format
+- Add optimization for `compute_bleu` precision initialization (#257)
+  Thanks to Ernests Lavrinovics for this contribution.
+
+## 2.4.2 (2024-04-12)
   Added:
   - Add printing of domain if present (via --echo)
 
-- 2.4.1 (2024-03-12)
+## 2.4.1 (2024-03-12)
   Fixed:
   - Add exports to package __init__.py
 
-- 2.4.0 (2023-12-11)
+## 2.4.0 (2023-12-11)
   Added:
   - WMT23 test sets (test set `wmt23`)
 
-- 2.3.3 (2023-11-28)
+## 2.3.3 (2023-11-28)
   Fixed:
   - Typing issues (#249, #250)
   - Improved builds (#252)
 
-- 2.3.2 (2023-11-06)
+## 2.3.2 (2023-11-06)
   Fixed:
   - Special treatment of empty references in TER (#232)
   - Bump in mecab version for JA (#234)
@@ -25,11 +37,11 @@
   Added:
   - Warning if `-tok spm` is used (use explicit `flores101` instead) (#238)
 
-- 2.3.1 (2022-10-18)
+## 2.3.1 (2022-10-18)
   Bugfix:
   - Set lru_cache to 2^16 for SPM tokenizer (was set to infinite)
 
-- 2.3.0 (2022-10-18)
+## 2.3.0 (2022-10-18)
   Features:
   - (#203) Added `-tok flores101` and `-tok flores200`, a.k.a. `spbleu`.
     These are multilingual tokenizations that make use of the
@@ -44,10 +56,10 @@
   - System outputs: include with wmt22. Also added wmt21/systems which will produce WMT21 submitted systems.
     To see available systems, give a dummy system to `--echo`, e.g., `sacrebleu -t wmt22 -l en-de --echo ?`
 
-- 2.2.1 (2022-09-13)
+## 2.2.1 (2022-09-13)
   Bugfix: Standard usage was returning (and using) each reference twice.
 
-- 2.2.0 (2022-07-25)
+## 2.2.0 (2022-07-25)
   Features:
   - Added WMT21 datasets (thanks to @BrighXiaoHan)
   - `--echo` now exposes document metadata where available (e.g., docid, genre, origlang)
@@ -65,7 +77,7 @@
   Many thanks to @BrightXiaoHan (https://github.com/BrightXiaoHan) for the bulk of
   the code contributions in this release.
 
-- 2.1.0 (2022-05-19)
+## 2.1.0 (2022-05-19)
   Features:
   - Added `-tok spm` for multilingual SPM tokenization (#168)
     (thanks to Naman Goyal and James Cross at Facebook)
@@ -75,7 +87,7 @@
   - Bugfix: BLEU.corpus_score() now using max_ngram_order (#173)
   - Upgraded ja-mecab to 1.0.5 (#196)
 
-- 2.0.0 (2021-07-18)
+## 2.0.0 (2021-07-18)
   - Build: Add Windows and OS X testing to Travis CI.
   - Improve documentation and type annotations.
   - Drop `Python < 3.6` support and migrate to f-strings.
@@ -137,11 +149,11 @@
     as well as paired bootstrap resampling (`--paired-bs`) and paired approximate
     randomization tests (`--paired-ar`) when evaluating multiple systems (#40 and #78).
 
-- 1.5.1 (2021-03-05)
+## 1.5.1 (2021-03-05)
   - Fix extraction error for WMT18 extra test sets (test-ts) (#142)
   - Validation and test datasets are added for multilingual TEDx
 
-- 1.5.0 (2021-01-15)
+## 1.5.0 (2021-01-15)
   - Fix an assertion error in chrF (#121)
   - Add missing `__repr__()` methods for BLEU and TER
   - TER: Fix exception when `--short` is used (#131)
@@ -155,7 +167,7 @@
   - Allow variable number of references for BLEU (only via API) (#130).
     Thanks to Ondrej Dusek (@tuetschek)
 
-- 1.4.14 (2020-09-13)
+## 1.4.14 (2020-09-13)
   - Added character-based tokenization (`-tok char`).
     Thanks to Christian Federmann.
   - Added TER (`-m ter`). Thanks to Ales Tamchyna! (fixes #90)
@@ -166,7 +178,7 @@
     - wmt20/robust/set2 (en-ja, ja-en)
     - wmt20/robust/set3 (de-en)
 
-- 1.4.13 (2020-07-30)
+## 1.4.13 (2020-07-30)
   - Added WMT20 newstest test sets (#103)
   - Make mecab3-python an extra dependency, adapt code to new mecab3-python
     This fixes the recent Windows installation issues as well (#104)
@@ -174,10 +186,10 @@
   - Fix return type annotation of corpus_bleu()
   - Improve sentence_score's documentation, do not allow single ref string (#98)
 
-- 1.4.12 (2020-07-03)
+## 1.4.12 (2020-07-03)
   - Fix a deployment bug (#96)
 
-- 1.4.11 (2020-07-03)
+## 1.4.11 (2020-07-03)
   - Added Multi30k multimodal MT test set metadata
   - Refactored all tokenizers into respective classes (fixes #85)
   - Refactored all metrics into respective classes
@@ -193,7 +205,7 @@
   - Added score regression tests for chrF using reference chrF++ implementation
   - Added multi-reference & tokenizer & signature tests
 
-- 1.4.10 (2020-05-30)
+## 1.4.10 (2020-05-30)
   - Fixed bug in signature with mecab tokenizer
   - Cleaned up deprecation warnings (thanks to Karthikeyan Singaravelan @tirkarthi)
   - Now only lists the external [typing](https://pypi.org/project/typing/)
@@ -201,13 +213,13 @@
     library in Python 3.5 (thanks to Erwan de Lépinau @ErwanDL).
   - Added LICENSE to pypi (thanks to Mark Harfouche @hmaarrfk)
 
-- 1.4.9 (2020-04-30)
+## 1.4.9 (2020-04-30)
   - Changed `get_available_testsets()` to return a list
   - Remove Japanese MeCab tokenizer from requirements.
     (Must be installed manually to avoid Windows incompatibility).
     Many thanks to Makoto Morishita (@MorinoseiMorizo).
 
-- 1.4.8 (2020-04-26)
+## 1.4.8 (2020-04-26)
   - Added to API:
     - get_source_file()
     - get_reference_files()
@@ -217,21 +229,21 @@
   - Fixed descriptions of some WMT19/google test sets
   - Added API test case (test/test_apy.py)
 
-- 1.4.7 (2020-04-19)
+## 1.4.7 (2020-04-19)
   - Added Google's extra wmt19/en-de refs (-t wmt19/google/{ar,arp,hqall,hqp,hqr,wmtp})
     (Freitag, Grangier, & Caswell
      BLEU might be Guilty but References are not Innocent
      https://arxiv.org/abs/2004.06063)
   - Restored SACREBLEU_DIR and smart_open to exports (thanks to Thomas Liao @tholiao)
 
-- 1.4.6 (2020-03-28)
+## 1.4.6 (2020-03-28)
   - Large internal reorganization as a module (thanks to Thamme Gowda @thammegowda)
 
-- 1.4.5 (2020-03-28)
+## 1.4.5 (2020-03-28)
   - Added Japanese MeCab tokenizer (`-tok ja-mecab`) (thanks to Makoto Morishita @MorinoseiMorizo)
   - Added wmt20/dev test sets (thanks to Martin Popel @martinpopel)
 
-- 1.4.4 (2020-03-10)
+## 1.4.4 (2020-03-10)
   - Smoothing changes (Sebastian Nickels @sn1c)
     - Fixed bug that only applied smoothing to n-grams for n > 2
     - Added default smoothing values for methods "floor" (0) and "add-k" (1)
@@ -240,20 +252,20 @@
   - added missing languages for IWSLT17
   - Minor code improvements (Thomas Liao @tholiao)
 
-- 1.4.3 (2019-12-02)
+## 1.4.3 (2019-12-02)
   - Bugfix: handling of result object for CHRF
   - Improved API example
 
-- 1.4.2 (2019-10-11)
+## 1.4.2 (2019-10-11)
   - Tokenization variant omitted from the chrF signature; it is relevant only for BLEU (thanks to Martin Popel)
   - Bugfix: call to sentence_bleu (thanks to Rachel Bawden)
   - Documentation example for Python API (thanks to Vlad Lyalin)
   - Calls to corpus_chrf and sentence_chrf now return a an object instead of a float (use result.score)
 
-- 1.4.1 (2019-09-11)
+## 1.4.1 (2019-09-11)
    - Added sentence-level scoring via -sl (--sentence-level)
 
-- 1.4.0 (2019-09-10)
+## 1.4.0 (2019-09-10)
    - Many thanks to Martin Popel for all the changes below!
    - Added evaluation on concatenated test sets (e.g., `-t wmt17,wmt18`).
      Works as long as they all have the same language pair.
@@ -269,102 +281,102 @@
      - Documentation and tests updates
      - Fixed a race condition bug (`os.makedirs(outdir, exist_ok=True)` instead of `if os.path.exists`)
 
-- 1.3.7 (2019-07-12)
+## 1.3.7 (2019-07-12)
    - Lazy loading of regexes cuts import time from ~1s to nearly nothing (thanks, @louismartin!)
    - Added a simple (non-atomic) lock on downloading
    - Can now read multiple refs from a single tab-delimited file.
      You need to pass `--num-refs N` to tell it to run the split.
      Only works with a single reference file passed from the command line.
 
-- 1.3.6 (2019-06-10)
+## 1.3.6 (2019-06-10)
    - Removed another f-string for Python 3.5 compatibility
 
-- 1.3.5 (2019-06-07)
+## 1.3.5 (2019-06-07)
    - Restored Python 3.5 compatibility
 
-- 1.3.4 (2019-05-28)
+## 1.3.4 (2019-05-28)
    - Added MTNT 2019 test sets
    - Added a BLEU object
 
-- 1.3.3 (2019-05-08)
+## 1.3.3 (2019-05-08)
    - Added WMT'19 test sets
 
-- 1.3.2 (2018-04-24)
+## 1.3.2 (2018-04-24)
    - Bugfix in test case (thanks to Adam Roberts, @adarob)
    - Passing smoothing method through `sentence_bleu`
 
-- 1.3.1 (2019-03-20)
+## 1.3.1 (2019-03-20)
    - Added another smoothing approach (add-k) and a command-line option for choosing the smoothing method
      (`--smooth exp|floor|add-n|none`) and the associated value (`--smooth-value`), when relevant.
    - Changed interface to some functions (backwards incompatible)
      - 'smooth' is now 'smooth_method'
      - 'smooth_floor' is now 'smooth_value'
 
-- 1.2.21 (19 March 2019)
+## 1.2.21 (19 March 2019)
    - Ctrl-M characters are now treated as normal characters, previously treated as newline.
 
-- 1.2.20 (28 February 2018)
+## 1.2.20 (28 February 2018)
    - Tokenization now defaults to "zh" when language pair is known
 
-- 1.2.19 (19 February 2019)
+## 1.2.19 (19 February 2019)
    - Updated checksum for wmt19/dev (seems to have changed)
 
-- 1.2.18 (19 February 2019)
+## 1.2.18 (19 February 2019)
    - Fixed checksum for wmt17/dev (copy-paste error)
 
-- 1.2.17 (6 February 2019)
+## 1.2.17 (6 February 2019)
    - Added kk-en and en-kk to wmt19/dev
 
-- 1.2.16 (4 February 2019)
+## 1.2.16 (4 February 2019)
    - Added gu-en and en-gu to wmt19/dev
 
-- 1.2.15 (30 January 2019)
+## 1.2.15 (30 January 2019)
    - Added MD5 checksumming of downloaded files for all datasets.
 
-- 1.2.14 (22 January 2019)
+## 1.2.14 (22 January 2019)
    - Added mtnt1.1/train mtnt1.1/valid mtnt1.1/test data from [MTNT](http://www.cs.cmu.edu/~pmichel1/mtnt/)
 
-- 1.2.13 (22 January 2019)
+## 1.2.13 (22 January 2019)
    - Added 'wmt19/dev' task for 'lt-en' and 'en-lt' (development data for new tasks).
    - Added MD5 checksum for downloaded tarballs.
 
-- 1.2.12 (8 November 2018)
+## 1.2.12 (8 November 2018)
    - Now outputs only only digit after the decimal
 
-- 1.2.11 (29 August 2018)
+## 1.2.11 (29 August 2018)
    - Added a function for sentence-level, smoothed BLEU
 
-- 1.2.10 (23 May 2018)
+## 1.2.10 (23 May 2018)
    - Added wmt18 test set (with references)
 
-- 1.2.9 (15 May 2018)
+## 1.2.9 (15 May 2018)
    - Added zh-en, en-zh, tr-en, and en-tr datasets for wmt18/test-ts
 
-- 1.2.8 (14 May 2018)
+## 1.2.8 (14 May 2018)
    - Added wmt18/test-ts, the test sources (only) for [WMT18](http://statmt.org/wmt18/translation-task.html)
    - Moved README out of `sacrebleu.py` and the CHANGELOG into a separate file
 
-- 1.2.7 (10 April 2018)
+## 1.2.7 (10 April 2018)
    - fixed another locale issue (with --echo)
    - grudgingly enabled `-tok none` from the command line
 
-- 1.2.6 (22 March 2018)
+## 1.2.6 (22 March 2018)
    - added wmt17/ms (Microsoft's [additional ZH-EN references](https://github.com/MicrosoftTranslator/Translator-HumanParityData)).
      Try `sacrebleu -t wmt17/ms --cite`.
    - `--echo ref` now pastes together all references, if there is more than one
 
-- 1.2.5 (13 March 2018)
+## 1.2.5 (13 March 2018)
    - added wmt18/dev datasets (en-et and et-en)
    - fixed logic with --force
    - locale-independent installation
    - added "--echo both" (tab-delimited)
 
-- 1.2.3 (28 January 2018)
+## 1.2.3 (28 January 2018)
    - metrics (`-m`) are now printed in the order requested
    - chrF now prints a version string (including the beta parameter, importantly)
    - attempt to remove dependence on locale setting
 
-- 1.2 (17 January 2018)
+## 1.2 (17 January 2018)
    - added the chrF metric (`-m chrf` or `-m bleu chrf` for both)
      See 'CHRF: character n-gram F-score for automatic MT evaluation' by Maja Popovic (WMT 2015)
      [http://www.statmt.org/wmt15/pdf/WMT49.pdf]
@@ -374,26 +386,26 @@
    - added `--input` (`-i`) to set input to a file instead of STDIN
    - removed accent mark after objection from UN official
 
-- 1.1.7 (27 November 2017)
+## 1.1.7 (27 November 2017)
    - corpus_bleu() now raises an exception if input streams are different lengths
    - thanks to Martin Popel for:
       - small bugfix in tokenization_13a (not affecting WMT references)
       - adding `--tok intl` (international tokenization)
    - added wmt17/dev and wmt17/dev sets (for languages intro'd those years)
 
-- 1.1.6 (15 November 2017)
+## 1.1.6 (15 November 2017)
    - bugfix for tokenization warning
 
-- 1.1.5 (12 November 2017)
+## 1.1.5 (12 November 2017)
    - added -b option (only output the BLEU score)
    - removed fi-en from list of WMT16/17 systems with more than one reference
    - added WMT16/tworefs and WMT17/tworefs for scoring with both en-fi references
 
-- 1.1.4 (10 November 2017)
+## 1.1.4 (10 November 2017)
    - added effective order for sentence-level BLEU computation
    - added unit tests from sockeye
 
-- 1.1.3 (8 November 2017).
+## 1.1.3 (8 November 2017).
    - Factored code a bit to facilitate API:
       - compute_bleu: works from raw stats
       - corpus_bleu for use from the command line
@@ -402,17 +414,17 @@
    - Added 'floor' smoothing (adds 0.01 to 0 counts, more versatile via API), 'none' smoothing (via API)
    - Small bugfixes, windows compatibility (H/T Christian Federmann)
 
-- 1.0.3 (4 November 2017).
+## 1.0.3 (4 November 2017).
    - Contributions from Christian Federmann:
       - Added explicit support for encoding
       - Fixed Windows support
       - Bugfix in handling reference length with multiple refs
 
-- version 1.0.1 (1 November 2017).
+## version 1.0.1 (1 November 2017).
    - Small bugfix affecting some versions of Python.
    - Code reformatting due to Ozan Çağlayan.
 
-- version 1.0 (23 October 2017).
+## version 1.0 (23 October 2017).
    - Support for WMT 2008--2017.
    - Single tokenization (v13a) with lowercase fix (proper lower() instead of just A-Z).
    - Chinese tokenization.
diff --git a/mypy.ini b/mypy.ini
index 26ff095..2fda182 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,5 +1,5 @@
 [mypy]
-python_version = 3.6
+python_version = 3.12
 
 [mypy-portalocker.*]
 ignore_missing_imports = True
diff --git a/sacrebleu/dataset/__init__.py b/sacrebleu/dataset/__init__.py
index 2478395..68c35ce 100644
--- a/sacrebleu/dataset/__init__.py
+++ b/sacrebleu/dataset/__init__.py
@@ -74,6 +74,26 @@
 
 DATASETS = {
     # wmt
+    "wmt24": WMTXMLDataset(
+        "wmt24",
+        data=["https://github.com/wmt-conference/wmt24-news-systems/releases/download/v1.1/data_onlyxml.tar.gz"],
+        description="WMT24 official test set release, v1.1 (excluding TS)",
+        md5=["299963fcb7b4e86d6d212bf69beb9580"],
+        langpairs={
+            "cs-uk": ["xml/wmttest2024.cs-uk.all.xml"],
+            "en-cs": ["xml/wmttest2024.en-cs.all.xml"],
+            "en-de": ["xml/wmttest2024.en-de.all.xml"],
+            "en-es": ["xml/wmttest2024.en-es.all.xml"],
+            "en-hi": ["xml/wmttest2024.en-hi.all.xml"],
+            "en-is": ["xml/wmttest2024.en-is.all.xml"],
+            "en-ja": ["xml/wmttest2024.en-ja.all.xml"],
+            "en-ru": ["xml/wmttest2024.en-ru.all.xml"],
+            "en-uk": ["xml/wmttest2024.en-uk.all.xml"],
+            "en-zh": ["xml/wmttest2024.en-zh.all.xml"],
+            "ja-zh": ["xml/wmttest2024.ja-zh.all.xml"],
+        },
+        refs=["refA"],
+    ),
     "wmt23": WMTXMLDataset(
         "wmt23",
         data=["https://github.com/wmt-conference/wmt23-news-systems/archive/refs/tags/v.0.1.tar.gz"],