From 79e38a792feec599d2dfa24a0ae8940f4da424a8 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 3 Jan 2025 15:07:05 -0500 Subject: [PATCH] Add WMT 2024 test sets (#276) --- CHANGELOG.md | 136 ++++++++++++++++++---------------- mypy.ini | 2 +- sacrebleu/dataset/__init__.py | 20 +++++ 3 files changed, 95 insertions(+), 63 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d08d9e3..a6c78e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,23 +1,35 @@ # Release Notes -- 2.4.2 (2024-04-12) +## 2.5.0 (2025-01-03) + +### Added + +- WMT24 test sets + +### Fixed + +- Convert Changelog to markdown format +- Add optimization for compute_bleu precision initialization (#257) + Thanks to Ernests Lavrinovics for this contribution. + +## 2.4.2 (2024-04-12) Added: - Add printing of domain if present (via --echo) -- 2.4.1 (2024-03-12) +## 2.4.1 (2024-03-12) Fixed: - Add exports to package __init__.py -- 2.4.0 (2023-12-11) +## 2.4.0 (2023-12-11) Added: - WMT23 test sets (test set `wmt23`) -- 2.3.3 (2023-11-28) +## 2.3.3 (2023-11-28) Fixed: - Typing issues (#249, #250) - Improved builds (#252) -- 2.3.2 (2023-11-06) +## 2.3.2 (2023-11-06) Fixed: - Special treatment of empty references in TER (#232) - Bump in mecab version for JA (#234) @@ -25,11 +37,11 @@ Added: - Warning if `-tok spm` is used (use explicit `flores101` instead) (#238) -- 2.3.1 (2022-10-18) +## 2.3.1 (2022-10-18) Bugfix: - Set lru_cache to 2^16 for SPM tokenizer (was set to infinite) -- 2.3.0 (2022-10-18) +## 2.3.0 (2022-10-18) Features: - (#203) Added `-tok flores101` and `-tok flores200`, a.k.a. `spbleu`. These are multilingual tokenizations that make use of the @@ -44,10 +56,10 @@ - System outputs: include with wmt22. Also added wmt21/systems which will produce WMT21 submitted systems. To see available systems, give a dummy system to `--echo`, e.g., `sacrebleu -t wmt22 -l en-de --echo ?` -- 2.2.1 (2022-09-13) +## 2.2.1 (2022-09-13) Bugfix: Standard usage was returning (and using) each reference twice. 
-- 2.2.0 (2022-07-25) +## 2.2.0 (2022-07-25) Features: - Added WMT21 datasets (thanks to @BrightXiaoHan) - `--echo` now exposes document metadata where available (e.g., docid, genre, origlang) @@ -65,7 +77,7 @@ Many thanks to @BrightXiaoHan (https://github.com/BrightXiaoHan) for the bulk of the code contributions in this release. -- 2.1.0 (2022-05-19) +## 2.1.0 (2022-05-19) Features: - Added `-tok spm` for multilingual SPM tokenization (#168) (thanks to Naman Goyal and James Cross at Facebook) @@ -75,7 +87,7 @@ - Bugfix: BLEU.corpus_score() now using max_ngram_order (#173) - Upgraded ja-mecab to 1.0.5 (#196) -- 2.0.0 (2021-07-18) +## 2.0.0 (2021-07-18) - Build: Add Windows and OS X testing to Travis CI. - Improve documentation and type annotations. - Drop `Python < 3.6` support and migrate to f-strings. @@ -137,11 +149,11 @@ as well as paired bootstrap resampling (`--paired-bs`) and paired approximate randomization tests (`--paired-ar`) when evaluating multiple systems (#40 and #78). -- 1.5.1 (2021-03-05) +## 1.5.1 (2021-03-05) - Fix extraction error for WMT18 extra test sets (test-ts) (#142) - Validation and test datasets are added for multilingual TEDx -- 1.5.0 (2021-01-15) +## 1.5.0 (2021-01-15) - Fix an assertion error in chrF (#121) - Add missing `__repr__()` methods for BLEU and TER - TER: Fix exception when `--short` is used (#131) @@ -155,7 +167,7 @@ - Allow variable number of references for BLEU (only via API) (#130). Thanks to Ondrej Dusek (@tuetschek) -- 1.4.14 (2020-09-13) +## 1.4.14 (2020-09-13) - Added character-based tokenization (`-tok char`). Thanks to Christian Federmann. - Added TER (`-m ter`). Thanks to Ales Tamchyna! 
(fixes #90) @@ -166,7 +178,7 @@ - wmt20/robust/set2 (en-ja, ja-en) - wmt20/robust/set3 (de-en) -- 1.4.13 (2020-07-30) +## 1.4.13 (2020-07-30) - Added WMT20 newstest test sets (#103) - Make mecab3-python an extra dependency, adapt code to new mecab3-python This fixes the recent Windows installation issues as well (#104) @@ -174,10 +186,10 @@ - Fix return type annotation of corpus_bleu() - Improve sentence_score's documentation, do not allow single ref string (#98) -- 1.4.12 (2020-07-03) +## 1.4.12 (2020-07-03) - Fix a deployment bug (#96) -- 1.4.11 (2020-07-03) +## 1.4.11 (2020-07-03) - Added Multi30k multimodal MT test set metadata - Refactored all tokenizers into respective classes (fixes #85) - Refactored all metrics into respective classes @@ -193,7 +205,7 @@ - Added score regression tests for chrF using reference chrF++ implementation - Added multi-reference & tokenizer & signature tests -- 1.4.10 (2020-05-30) +## 1.4.10 (2020-05-30) - Fixed bug in signature with mecab tokenizer - Cleaned up deprecation warnings (thanks to Karthikeyan Singaravelan @tirkarthi) - Now only lists the external [typing](https://pypi.org/project/typing/) @@ -201,13 +213,13 @@ library in Python 3.5 (thanks to Erwan de Lépinau @ErwanDL). - Added LICENSE to pypi (thanks to Mark Harfouche @hmaarrfk) -- 1.4.9 (2020-04-30) +## 1.4.9 (2020-04-30) - Changed `get_available_testsets()` to return a list - Remove Japanese MeCab tokenizer from requirements. (Must be installed manually to avoid Windows incompatibility). Many thanks to Makoto Morishita (@MorinoseiMorizo). 
-- 1.4.8 (2020-04-26) +## 1.4.8 (2020-04-26) - Added to API: - get_source_file() - get_reference_files() @@ -217,21 +229,21 @@ - Fixed descriptions of some WMT19/google test sets - Added API test case (test/test_apy.py) -- 1.4.7 (2020-04-19) +## 1.4.7 (2020-04-19) - Added Google's extra wmt19/en-de refs (-t wmt19/google/{ar,arp,hqall,hqp,hqr,wmtp}) (Freitag, Grangier, & Caswell BLEU might be Guilty but References are not Innocent https://arxiv.org/abs/2004.06063) - Restored SACREBLEU_DIR and smart_open to exports (thanks to Thomas Liao @tholiao) -- 1.4.6 (2020-03-28) +## 1.4.6 (2020-03-28) - Large internal reorganization as a module (thanks to Thamme Gowda @thammegowda) -- 1.4.5 (2020-03-28) +## 1.4.5 (2020-03-28) - Added Japanese MeCab tokenizer (`-tok ja-mecab`) (thanks to Makoto Morishita @MorinoseiMorizo) - Added wmt20/dev test sets (thanks to Martin Popel @martinpopel) -- 1.4.4 (2020-03-10) +## 1.4.4 (2020-03-10) - Smoothing changes (Sebastian Nickels @sn1c) - Fixed bug that only applied smoothing to n-grams for n > 2 - Added default smoothing values for methods "floor" (0) and "add-k" (1) @@ -240,20 +252,20 @@ - added missing languages for IWSLT17 - Minor code improvements (Thomas Liao @tholiao) -- 1.4.3 (2019-12-02) +## 1.4.3 (2019-12-02) - Bugfix: handling of result object for CHRF - Improved API example -- 1.4.2 (2019-10-11) +## 1.4.2 (2019-10-11) - Tokenization variant omitted from the chrF signature; it is relevant only for BLEU (thanks to Martin Popel) - Bugfix: call to sentence_bleu (thanks to Rachel Bawden) - Documentation example for Python API (thanks to Vlad Lyalin) - Calls to corpus_chrf and sentence_chrf now return an object instead of a float (use result.score) -- 1.4.1 (2019-09-11) +## 1.4.1 (2019-09-11) - Added sentence-level scoring via -sl (--sentence-level) -- 1.4.0 (2019-09-10) +## 1.4.0 (2019-09-10) - Many thanks to Martin Popel for all the changes below! - Added evaluation on concatenated test sets (e.g., `-t wmt17,wmt18`). 
Works as long as they all have the same language pair. @@ -269,102 +281,102 @@ - Documentation and tests updates - Fixed a race condition bug (`os.makedirs(outdir, exist_ok=True)` instead of `if os.path.exists`) -- 1.3.7 (2019-07-12) +## 1.3.7 (2019-07-12) - Lazy loading of regexes cuts import time from ~1s to nearly nothing (thanks, @louismartin!) - Added a simple (non-atomic) lock on downloading - Can now read multiple refs from a single tab-delimited file. You need to pass `--num-refs N` to tell it to run the split. Only works with a single reference file passed from the command line. -- 1.3.6 (2019-06-10) +## 1.3.6 (2019-06-10) - Removed another f-string for Python 3.5 compatibility -- 1.3.5 (2019-06-07) +## 1.3.5 (2019-06-07) - Restored Python 3.5 compatibility -- 1.3.4 (2019-05-28) +## 1.3.4 (2019-05-28) - Added MTNT 2019 test sets - Added a BLEU object -- 1.3.3 (2019-05-08) +## 1.3.3 (2019-05-08) - Added WMT'19 test sets -- 1.3.2 (2018-04-24) +## 1.3.2 (2018-04-24) - Bugfix in test case (thanks to Adam Roberts, @adarob) - Passing smoothing method through `sentence_bleu` -- 1.3.1 (2019-03-20) +## 1.3.1 (2019-03-20) - Added another smoothing approach (add-k) and a command-line option for choosing the smoothing method (`--smooth exp|floor|add-n|none`) and the associated value (`--smooth-value`), when relevant. - Changed interface to some functions (backwards incompatible) - 'smooth' is now 'smooth_method' - 'smooth_floor' is now 'smooth_value' -- 1.2.21 (19 March 2019) +## 1.2.21 (19 March 2019) - Ctrl-M characters are now treated as normal characters, previously treated as newline. 
-- 1.2.20 (28 February 2018) +## 1.2.20 (28 February 2018) - Tokenization now defaults to "zh" when language pair is known -- 1.2.19 (19 February 2019) +## 1.2.19 (19 February 2019) - Updated checksum for wmt19/dev (seems to have changed) -- 1.2.18 (19 February 2019) +## 1.2.18 (19 February 2019) - Fixed checksum for wmt17/dev (copy-paste error) -- 1.2.17 (6 February 2019) +## 1.2.17 (6 February 2019) - Added kk-en and en-kk to wmt19/dev -- 1.2.16 (4 February 2019) +## 1.2.16 (4 February 2019) - Added gu-en and en-gu to wmt19/dev -- 1.2.15 (30 January 2019) +## 1.2.15 (30 January 2019) - Added MD5 checksumming of downloaded files for all datasets. -- 1.2.14 (22 January 2019) +## 1.2.14 (22 January 2019) - Added mtnt1.1/train mtnt1.1/valid mtnt1.1/test data from [MTNT](http://www.cs.cmu.edu/~pmichel1/mtnt/) -- 1.2.13 (22 January 2019) +## 1.2.13 (22 January 2019) - Added 'wmt19/dev' task for 'lt-en' and 'en-lt' (development data for new tasks). - Added MD5 checksum for downloaded tarballs. -- 1.2.12 (8 November 2018) +## 1.2.12 (8 November 2018) - Now outputs only one digit after the decimal -- 1.2.11 (29 August 2018) +## 1.2.11 (29 August 2018) - Added a function for sentence-level, smoothed BLEU -- 1.2.10 (23 May 2018) +## 1.2.10 (23 May 2018) - Added wmt18 test set (with references) -- 1.2.9 (15 May 2018) +## 1.2.9 (15 May 2018) - Added zh-en, en-zh, tr-en, and en-tr datasets for wmt18/test-ts -- 1.2.8 (14 May 2018) +## 1.2.8 (14 May 2018) - Added wmt18/test-ts, the test sources (only) for [WMT18](http://statmt.org/wmt18/translation-task.html) - Moved README out of `sacrebleu.py` and the CHANGELOG into a separate file -- 1.2.7 (10 April 2018) +## 1.2.7 (10 April 2018) - fixed another locale issue (with --echo) - grudgingly enabled `-tok none` from the command line -- 1.2.6 (22 March 2018) +## 1.2.6 (22 March 2018) - added wmt17/ms (Microsoft's [additional ZH-EN references](https://github.com/MicrosoftTranslator/Translator-HumanParityData)). 
Try `sacrebleu -t wmt17/ms --cite`. - `--echo ref` now pastes together all references, if there is more than one -- 1.2.5 (13 March 2018) +## 1.2.5 (13 March 2018) - added wmt18/dev datasets (en-et and et-en) - fixed logic with --force - locale-independent installation - added "--echo both" (tab-delimited) -- 1.2.3 (28 January 2018) +## 1.2.3 (28 January 2018) - metrics (`-m`) are now printed in the order requested - chrF now prints a version string (including the beta parameter, importantly) - attempt to remove dependence on locale setting -- 1.2 (17 January 2018) +## 1.2 (17 January 2018) - added the chrF metric (`-m chrf` or `-m bleu chrf` for both) See 'CHRF: character n-gram F-score for automatic MT evaluation' by Maja Popovic (WMT 2015) [http://www.statmt.org/wmt15/pdf/WMT49.pdf] @@ -374,26 +386,26 @@ - added `--input` (`-i`) to set input to a file instead of STDIN - removed accent mark after objection from UN official -- 1.1.7 (27 November 2017) +## 1.1.7 (27 November 2017) - corpus_bleu() now raises an exception if input streams are different lengths - thanks to Martin Popel for: - small bugfix in tokenization_13a (not affecting WMT references) - adding `--tok intl` (international tokenization) - added wmt17/dev and wmt17/dev sets (for languages intro'd those years) -- 1.1.6 (15 November 2017) +## 1.1.6 (15 November 2017) - bugfix for tokenization warning -- 1.1.5 (12 November 2017) +## 1.1.5 (12 November 2017) - added -b option (only output the BLEU score) - removed fi-en from list of WMT16/17 systems with more than one reference - added WMT16/tworefs and WMT17/tworefs for scoring with both en-fi references -- 1.1.4 (10 November 2017) +## 1.1.4 (10 November 2017) - added effective order for sentence-level BLEU computation - added unit tests from sockeye -- 1.1.3 (8 November 2017). +## 1.1.3 (8 November 2017). 
- Factored code a bit to facilitate API: - compute_bleu: works from raw stats - corpus_bleu for use from the command line @@ -402,17 +414,17 @@ - Added 'floor' smoothing (adds 0.01 to 0 counts, more versatile via API), 'none' smoothing (via API) - Small bugfixes, windows compatibility (H/T Christian Federmann) -- 1.0.3 (4 November 2017). +## 1.0.3 (4 November 2017). - Contributions from Christian Federmann: - Added explicit support for encoding - Fixed Windows support - Bugfix in handling reference length with multiple refs -- version 1.0.1 (1 November 2017). +## version 1.0.1 (1 November 2017). - Small bugfix affecting some versions of Python. - Code reformatting due to Ozan Çağlayan. -- version 1.0 (23 October 2017). +## version 1.0 (23 October 2017). - Support for WMT 2008--2017. - Single tokenization (v13a) with lowercase fix (proper lower() instead of just A-Z). - Chinese tokenization. diff --git a/mypy.ini b/mypy.ini index 26ff095..2fda182 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,5 @@ [mypy] -python_version = 3.6 +python_version = 3.12 [mypy-portalocker.*] ignore_missing_imports = True diff --git a/sacrebleu/dataset/__init__.py b/sacrebleu/dataset/__init__.py index 2478395..68c35ce 100644 --- a/sacrebleu/dataset/__init__.py +++ b/sacrebleu/dataset/__init__.py @@ -74,6 +74,26 @@ DATASETS = { # wmt + "wmt24": WMTXMLDataset( + "wmt24", + data=["https://github.com/wmt-conference/wmt24-news-systems/releases/download/v1.1/data_onlyxml.tar.gz"], + description="WMT24 official test set release, v1.1 (excluding TS)", + md5=["299963fcb7b4e86d6d212bf69beb9580"], + langpairs={ + "cs-uk": ["xml/wmttest2024.cs-uk.all.xml"], + "en-cs": ["xml/wmttest2024.en-cs.all.xml"], + "en-de": ["xml/wmttest2024.en-de.all.xml"], + "en-es": ["xml/wmttest2024.en-es.all.xml"], + "en-hi": ["xml/wmttest2024.en-hi.all.xml"], + "en-is": ["xml/wmttest2024.en-is.all.xml"], + "en-ja": ["xml/wmttest2024.en-ja.all.xml"], + "en-ru": ["xml/wmttest2024.en-ru.all.xml"], + "en-uk": 
["xml/wmttest2024.en-uk.all.xml"], + "en-zh": ["xml/wmttest2024.en-zh.all.xml"], + "ja-zh": ["xml/wmttest2024.ja-zh.all.xml"], + }, + refs=["refA"], + ), "wmt23": WMTXMLDataset( "wmt23", data=["https://github.com/wmt-conference/wmt23-news-systems/archive/refs/tags/v.0.1.tar.gz"],