Commit

Switch bestbleu to chrF (#908)
* Switch bestbleu to chrF

* Add requirements

* Use the same version of sacrebleu and mtdata everywhere

* Fix kind

* Install requirements

* Fix chrf score function

* Update find-corpus

* Skip mtdata flores datasets

* Skip flores 200

* Skip WMT news

* Clarify comment

* Rename bestbleu to extract_best

* Use full argument names

* Add tests for extract best
eu9ene authored Nov 4, 2024
1 parent 8e4b11f commit 8170966
Showing 15 changed files with 1,462 additions and 993 deletions.
1 change: 1 addition & 0 deletions pipeline/data/requirements/data.in
@@ -3,3 +3,4 @@ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab
 simalign==0.4
 mtdata==0.4.1
 psutil==6.0.0
+sacrebleu==2.4.2
54 changes: 16 additions & 38 deletions pipeline/data/requirements/data.txt
@@ -1,8 +1,8 @@
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --allow-unsafe pipeline/data/requirements/data.in
+#    pip-compile pipeline/data/requirements/data.in
 #
 blessed==1.20.0
     # via enlighten
@@ -12,14 +12,15 @@ charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via sacremoses
+colorama==0.4.6
+    # via sacrebleu
 enlighten==1.10.1
     # via mtdata
 filelock==3.15.4
     # via
     #   huggingface-hub
     #   torch
     #   transformers
-    #   triton
 fsspec==2024.6.1
     # via
     #   huggingface-hub
@@ -38,6 +39,8 @@ joblib==1.4.2
     #   scikit-learn
 latexcodec==3.0.0
     # via pybtex
+lxml==5.3.0
+    # via sacrebleu
 markupsafe==2.1.5
     # via jinja2
 mpmath==1.3.0
@@ -50,49 +53,21 @@ networkx==3.3
     #   torch
 numpy==1.26.4
     # via
+    #   sacrebleu
     #   scikit-learn
     #   scipy
     #   simalign
     #   transformers
-nvidia-cublas-cu12==12.1.3.1
-    # via
-    #   nvidia-cudnn-cu12
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cuda-cupti-cu12==12.1.105
-    # via torch
-nvidia-cuda-nvrtc-cu12==12.1.105
-    # via torch
-nvidia-cuda-runtime-cu12==12.1.105
-    # via torch
-nvidia-cudnn-cu12==8.9.2.26
-    # via torch
-nvidia-cufft-cu12==11.0.2.54
-    # via torch
-nvidia-curand-cu12==10.3.2.106
-    # via torch
-nvidia-cusolver-cu12==11.4.5.107
-    # via torch
-nvidia-cusparse-cu12==12.1.0.106
-    # via
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-nccl-cu12==2.20.5
-    # via torch
-nvidia-nvjitlink-cu12==12.6.20
-    # via
-    #   nvidia-cusolver-cu12
-    #   nvidia-cusparse-cu12
-nvidia-nvtx-cu12==12.1.105
-    # via torch
 opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
     # via -r pipeline/data/requirements/data.in
 packaging==24.1
     # via
     #   huggingface-hub
     #   transformers
 portalocker==2.3.0
-    # via mtdata
+    # via
+    #   mtdata
+    #   sacrebleu
 prefixed==0.7.1
     # via enlighten
 psutil==6.0.0
@@ -107,6 +82,7 @@ pyyaml==6.0.1
     #   transformers
 regex==2024.5.15
     # via
+    #   sacrebleu
     #   sacremoses
     #   simalign
     #   transformers
@@ -119,6 +95,8 @@ ruamel-yaml==0.18.6
     # via mtdata
 ruamel-yaml-clib==0.2.8
     # via ruamel-yaml
+sacrebleu==2.4.2
+    # via -r pipeline/data/requirements/data.in
 sacremoses==0.1.1
     # via opustrainer
 safetensors==0.4.3
@@ -139,11 +117,13 @@ six==1.16.0
     #   pybtex
 sympy==1.12.1
     # via torch
+tabulate==0.9.0
+    # via sacrebleu
 threadpoolctl==3.5.0
     # via scikit-learn
 tokenizers==0.19.1
     # via transformers
-torch==2.3.1
+torch==2.2.2
     # via simalign
 tqdm==4.66.4
     # via
@@ -152,8 +132,6 @@ tqdm==4.66.4
     #   transformers
 transformers==4.42.3
     # via simalign
-triton==2.3.1
-    # via torch
 typing-extensions==4.12.2
     # via
     #   huggingface-hub
pipeline/translate/{bestbleu.py → extract_best.py}
@@ -96,7 +96,7 @@ def marian_best_bleu(args, score_function):
 
 def compute_chrf(references, translation):
     hypo = " ".join(translation)
-    refs = [" ".join(r) for r in references][0]
+    refs = [" ".join(r) for r in references]
     return sacrebleu.sentence_chrf(hypo, refs).score
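
The one-line fix above is the "Fix chrf score function" change from the commit message: sacrebleu's sentence_chrf takes the hypothesis as a string and the references as a sequence of strings, and the old [0] indexing passed a bare string where that sequence is expected. A minimal sketch of the corrected call (an illustration, not part of the commit), using a sentence pair from the new test further down:

import sacrebleu

hypothesis = "Реформа, направленная на выдвижение условий, проходит слишком медленно."
reference = "Реформирование кондициональности проходит слишком медленно."

# References must be passed as a list of strings, even for a single reference.
score = sacrebleu.sentence_chrf(hypothesis, [reference]).score
print(f"chrF: {score:.2f}")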
1 change: 1 addition & 0 deletions pipeline/translate/requirements/extract_best.in
@@ -0,0 +1 @@
+sacrebleu==2.4.2
20 changes: 20 additions & 0 deletions pipeline/translate/requirements/extract_best.txt
@@ -0,0 +1,20 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile pipeline/translate/requirements/extract_best.in
+#
+colorama==0.4.6
+    # via sacrebleu
+lxml==5.3.0
+    # via sacrebleu
+numpy==2.1.2
+    # via sacrebleu
+portalocker==2.10.1
+    # via sacrebleu
+regex==2024.9.11
+    # via sacrebleu
+sacrebleu==2.4.2
+    # via -r pipeline/translate/requirements/extract_best.in
+tabulate==0.9.0
+    # via sacrebleu
2,252 changes: 1,333 additions & 919 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions pyproject.toml
@@ -30,9 +30,9 @@ marian-tensorboard = "^0.2.1"
 
 # This install group is for running local utilities.
 [tool.poetry.group.utils.dependencies]
-sacrebleu="2.0.0"
-mtdata="0.3.2"
-requests="2.26.0"
+sacrebleu="2.4.2"
+mtdata="0.4.1"
+requests="^2.26.0"
 humanize = "^4.9.0"
 blessed = "^1.20.0"
 huggingface-hub = "^0.20.3"
@@ -45,8 +45,9 @@ pyperclip="1.9.0"
 # list is only for things imported directly in the tests.
 ruamel-yaml = "^0.18.6"
 [tool.poetry.group.tests.dependencies]
-mtdata="0.3.2"
-requests="2.26.0"
+sacrebleu="2.4.2"
+mtdata="0.4.1"
+requests="^2.26.0"
 pytest="7.4.3"
 # use the latest main, switch to PyPi when released
 opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21"}
2 changes: 1 addition & 1 deletion snakemake/Snakefile
@@ -562,7 +562,7 @@ rule extract_best:
     #group 'translate_corpus'
     input: nbest=f"{translated}/corpus/file.{{part}}.nbest", ref=f"{translated}/corpus/file.{{part}}.ref"
     output: f"{translated}/corpus/file.{{part}}.nbest.out"
-    shell: 'python pipeline/translate/bestbleu.py -i {input.nbest} -r {input.ref} -m bleu -o {output} >> {log} 2>&1'
+    shell: 'python pipeline/translate/extract_best.py -i {input.nbest} -r {input.ref} -m bleu -o {output} >> {log} 2>&1'
 
 rule collect_corpus:
     message: "Collecting translated corpus"
2 changes: 0 additions & 2 deletions taskcluster/docker/toolchain-build/Dockerfile
@@ -37,8 +37,6 @@ RUN locale-gen "$LANG"
 
 RUN pip install zstandard
 
-# Required to download sacrebleu datasets
-RUN pip install sacrebleu mtdata
 
 # %include-run-task
 
2 changes: 0 additions & 2 deletions taskcluster/docker/train/Dockerfile
@@ -16,7 +16,5 @@ RUN apt-get update -qq \
     wget \
     && apt-get clean
 
-# Required to download sacrebleu datasets
-RUN pip install sacrebleu
 
 VOLUME /builds/worker/checkouts
15 changes: 9 additions & 6 deletions taskcluster/kinds/extract-best/kind.yml
@@ -31,7 +31,8 @@ tasks:
         cache:
             type: extract-best
             resources:
-                - pipeline/translate/bestbleu.py
+                - pipeline/translate/extract_best.py
+                - pipeline/translate/requirements/extract_best.txt
 
         task-context:
             from-parameters:
@@ -81,11 +82,13 @@ tasks:
             - >-
                 zstd -d --rm $MOZ_FETCHES_DIR/*.zst &&
                 export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
-                python3 $VCS_PATH/pipeline/translate/bestbleu.py
-                -i "$MOZ_FETCHES_DIR/file.{this_chunk}.nbest"
-                -r "$MOZ_FETCHES_DIR/file.{this_chunk}.ref"
-                -o $TASK_WORKDIR/artifacts/file.{this_chunk}.nbest.out
-                -m bleu
+                pip install --upgrade pip &&
+                pip install -r $VCS_PATH/pipeline/translate/requirements/extract_best.txt &&
+                python3 $VCS_PATH/pipeline/translate/extract_best.py
+                --nbest "$MOZ_FETCHES_DIR/file.{this_chunk}.nbest"
+                --references "$MOZ_FETCHES_DIR/file.{this_chunk}.ref"
+                --output $TASK_WORKDIR/artifacts/file.{this_chunk}.nbest.out
+                --metric chrf
 
     dependencies:
         # double curly braces are used for the chunk substitutions because
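
Note that the Snakefile change earlier keeps the short flags (-i/-r/-m/-o) while this task switches to the full argument names from the "Use full argument names" commit; both call sites can work if the script registers a short and a long alias for each option. A hypothetical argparse sketch consistent with both invocations (the real extract_best.py may differ):

import argparse

parser = argparse.ArgumentParser(
    description="Select the best hypothesis per sentence from a Marian n-best list."
)
parser.add_argument("-i", "--nbest", required=True, help="Marian n-best list file")
parser.add_argument("-r", "--references", required=True, help="reference translations, one per line")
parser.add_argument("-o", "--output", required=True, help="file to write the best hypotheses to")
parser.add_argument("-m", "--metric", choices=["bleu", "chrf"], default="chrf", help="sentence-level metric")
args = parser.parse_args()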
46 changes: 46 additions & 0 deletions tests/test_extract_best.py
@@ -0,0 +1,46 @@
+import os
+
+from fixtures import DataDir
+
+nbest = """0 ||| Реформа, направленная на выдвижение условий, идет слишком медленно. ||| F0= -9.21191 F1= -11.53 ||| -1.22059
+0 ||| Реформа, направленная на выдвижение условий, проходит слишком медленно. ||| F0= -10.1025 F1= -11.1262 ||| -1.24908
+0 ||| Реформа условий была слишком медленной. ||| F0= -6.67615 F1= -6.21271 ||| -1.28906
+0 ||| Реформа, направленная на выдвижение условий, идет слишком медленными темпами. ||| F0= -12.3186 F1= -13.4513 ||| -1.28906
+0 ||| Реформа, направленная на выдвижение условий, осуществляется слишком медленно. ||| F0= -9.85156 F1= -12.1434 ||| -1.29412
+0 ||| Реформа до обусловленности была слишком медленной. ||| F0= -9.59259 F1= -6.63153 ||| -1.35026
+0 ||| Реформа системы обусловленности была слишком медленной. ||| F0= -10.0087 F1= -7.58777 ||| -1.46484
+0 ||| Реформа системы обусловленности идет слишком медленно. ||| F0= -9.89215 F1= -6.91754 ||| -1.52699
+1 ||| Помощь по-прежнему носит фрагментарный характер, а доноры не координируют свою деятельность. ||| F0= -7.94812 F1= -8.49457 ||| -0.821875
+1 ||| Помощь по-прежнему раздроблена, а доноры не координируют свою деятельность. ||| F0= -7.23773 F1= -8.77496 ||| -0.842928
+1 ||| Помощь по-прежнему фрагментирована, а доноры не координируют свою деятельность. ||| F0= -8.4671 F1= -6.81989 ||| -0.848524
+1 ||| Помощь остается раздробленной, а доноры - нескоординированными. ||| F0= -7.95831 F1= -8.09065 ||| -0.891493
+1 ||| Помощь по-прежнему носит фрагментарный характер, а доноры не координируют свои действия. ||| F0= -9.28394 F1= -9.13013 ||| -0.920313
+1 ||| Помощь по-прежнему раздроблена, а доноры не координируются. ||| F0= -6.3092 F1= -9.73718 ||| -0.943934
+1 ||| Помощь по-прежнему раздроблена, а доноры не координируют свои усилия. ||| F0= -8.31525 F1= -9.74164 ||| -0.949836
+1 ||| Помощь по-прежнему фрагментирована, а доноры не координируются. ||| F0= -7.51941 F1= -7.84265 ||| -0.959961"""
+
+refs = """Реформирование кондициональности проходит слишком медленно.
+Помощь по-прежнему оказывается фрагментарно, а действия доноров не координируются."""
+
+
+def test_extract_best_chr():
+    data_dir = DataDir("test_extract_best")
+    data_dir.create_file("file.1.nbest", nbest)
+    data_dir.create_file("file.1.ref", refs)
+    data_dir.mkdir("artifacts")
+    env = {
+        "TEST_ARTIFACTS": data_dir.path,
+        "SRC": "en",
+        "TRG": "ru",
+    }
+
+    data_dir.run_task("extract-best-en-ru-1/10", env=env)
+
+    output_file = os.path.join(data_dir.path, "artifacts", "file.1.nbest.out")
+    assert os.path.isfile(output_file)
+    with open(output_file, "r") as f:
+        output = f.read()
+    assert (
+        output == "Реформа, направленная на выдвижение условий, проходит слишком медленно.\n"
+        "Помощь по-прежнему носит фрагментарный характер, а доноры не координируют свои действия.\n"
+    )
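
The nbest fixture is in Marian's n-best format ("sentence_id ||| hypothesis ||| feature scores ||| model score"), with several candidate translations per sentence id. A self-contained sketch of the selection logic the test exercises (hypothetical; the pipeline's extract_best.py is the authoritative implementation): group hypotheses by sentence id, score each against the matching reference with sentence-level chrF, and keep the best one.

from collections import defaultdict

import sacrebleu

def pick_best(nbest_text, refs_text):
    # Group candidate hypotheses by their sentence id.
    hypotheses = defaultdict(list)
    for line in nbest_text.splitlines():
        sent_id, hypothesis = line.split(" ||| ")[:2]
        hypotheses[int(sent_id)].append(hypothesis)

    # For each reference, keep the hypothesis with the highest chrF score.
    best = []
    for sent_id, ref in enumerate(refs_text.splitlines()):
        scored = [(sacrebleu.sentence_chrf(h, [ref]).score, h) for h in hypotheses[sent_id]]
        best.append(max(scored, key=lambda pair: pair[0])[1])
    return best

# Given the nbest and refs strings from the test above, this should
# reproduce the two lines the test asserts.
print("\n".join(pick_best(nbest, refs)))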
28 changes: 14 additions & 14 deletions tests/test_find_corpus.py
@@ -104,25 +104,25 @@ def test_opus_download_url(mock_opus_data, capsys):
 # mtdata has some deprecated dependencies
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_mtdata(requests_mock, capsys):
-    find_corpus(["en", "ca", "--importer", "mtdata"])
+    find_corpus(["en", "ckb", "--importer", "mtdata"])
     assert_stdout(
         capsys,
         "mtdata outputs nicely",
         """
 ┌────────────────────────────────────────────────┐
 │ mtdata - https://github.com/thammegowda/mtdata │
 └────────────────────────────────────────────────┘
 
-Dataset                                URL
-────────────────────────────────────── ───────────────────────────────────────────────────────────────────────────────────────────────────────────
-mtdata_ELRC-wikipedia_health-1-cat-eng https://elrc-share.eu/repository/download/ac6d557e8de811ea913100155d026706b0c5fee96b88489781ddd7675f8ea2ae/
-mtdata_Facebook-wikimatrix-1-cat-eng   https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ca-en.tsv.gz
-mtdata_Statmt-ccaligned-1-cat_ES-eng   http://www.statmt.org/cc-aligned/sentence-aligned/ca_ES-en_XX.tsv.xz
+Dataset                                   URL
+───────────────────────────────────────── ─────────────────────────────────────────────────────────────────────────
+mtdata_Flores-flores101_dev-1-ckb-eng     https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz
+mtdata_Flores-flores101_devtest-1-ckb-eng https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz
+mtdata_Statmt-ccaligned-1-ckb_IQ-eng      http://data.statmt.org/cc-aligned/sentence-aligned/cb_IQ-en_XX.tsv.xz
 
 YAML:
-  - mtdata_ELRC-wikipedia_health-1-cat-eng
-  - mtdata_Facebook-wikimatrix-1-cat-eng
-  - mtdata_Statmt-ccaligned-1-cat_ES-eng
+  - mtdata_Flores-flores101_dev-1-ckb-eng
+  - mtdata_Flores-flores101_devtest-1-ckb-eng
+  - mtdata_Statmt-ccaligned-1-ckb_IQ-eng
 """,
     )
 
@@ -139,9 +139,9 @@ def test_sacrebleu(requests_mock, capsys):
 └─────────────────────────────────────────────────┘
 
 Dataset   Description                             URLs
-───────── ─────────────────────────────────────── ──────────────────────────────────────────────────────
-wmt20     Official evaluation data for WMT20      http://data.statmt.org/wmt20/translation-task/test.tgz
-wmt20/dev Development data for tasks new to 2020. http://data.statmt.org/wmt20/translation-task/dev.tgz
+───────── ─────────────────────────────────────── ───────────────────────────────────────────────────────
+wmt20     Official evaluation data for WMT20      https://data.statmt.org/wmt20/translation-task/test.tgz
+wmt20/dev Development data for tasks new to 2020. https://data.statmt.org/wmt20/translation-task/dev.tgz
 
 YAML:
   - sacrebleu_wmt20
9 changes: 9 additions & 0 deletions utils/config_generator.py
@@ -45,6 +45,13 @@
     "lithuanian_legislation_seimas_lithuania",
     # Fails to load from OPUS.
     "SPC",
+    # MTdata duplicates Flores that we pull directly
+    "flores101_dev",
+    "flores101_devtest",
+    "flores200_dev",
+    "flores200_devtest",
+    # Skip OPUS WMT news test sets. They are used in our evaluation and shouldn't be used for training
+    "WMT-News",
 ]
 
 # Do not include small datasets. This works around #508, and minimizes dataset tasks that
@@ -185,6 +192,8 @@ def add_train_data(
     entries = fetch_mtdata(source, target)
 
     for corpus_key, entry in entries.items():
+        if entry.did.name in skip_datasets:
+            continue
         # mtdata can have test and devtest data as well.
         if entry.did.name.endswith("test"):
             dataset = datasets["test"]