Commit

Switch bestbleu to chrF (#908)
* Switch bestbleu to chrF

* Add requirements

* Use the same version of sacrebleu and mtdata everywhere

* Fix kind

* Install requirements

* Fix chrf score function

* Update find-corpus

* Skip mtdata flores datasets

* Skip flores 200

* Skip WMT news

* Clarify comment

* Rename bestbleu to extract_best

* Use full argument names

* Add tests for extract best
eu9ene authored Nov 4, 2024
1 parent 8e4b11f commit 8170966
Showing 15 changed files with 1,462 additions and 993 deletions.
1 change: 1 addition & 0 deletions pipeline/data/requirements/data.in
@@ -3,3 +3,4 @@ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab
 simalign==0.4
 mtdata==0.4.1
 psutil==6.0.0
+sacrebleu==2.4.2
54 changes: 16 additions & 38 deletions pipeline/data/requirements/data.txt
@@ -1,8 +1,8 @@
 #
-# This file is autogenerated by pip-compile with Python 3.10
+# This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --allow-unsafe pipeline/data/requirements/data.in
+#    pip-compile pipeline/data/requirements/data.in
 #
 blessed==1.20.0
     # via enlighten
@@ -12,14 +12,15 @@ charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via sacremoses
+colorama==0.4.6
+    # via sacrebleu
 enlighten==1.10.1
     # via mtdata
 filelock==3.15.4
     # via
     #   huggingface-hub
     #   torch
     #   transformers
-    #   triton
 fsspec==2024.6.1
     # via
     #   huggingface-hub
@@ -38,6 +39,8 @@ joblib==1.4.2
     #   scikit-learn
 latexcodec==3.0.0
     # via pybtex
+lxml==5.3.0
+    # via sacrebleu
 markupsafe==2.1.5
     # via jinja2
 mpmath==1.3.0
@@ -50,49 +53,21 @@ networkx==3.3
     #   torch
 numpy==1.26.4
     # via
+    #   sacrebleu
     #   scikit-learn
     #   scipy
     #   simalign
     #   transformers
-nvidia-cublas-cu12==12.1.3.1
-    # via
-    #   nvidia-cudnn-cu12
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-cuda-cupti-cu12==12.1.105
-    # via torch
-nvidia-cuda-nvrtc-cu12==12.1.105
-    # via torch
-nvidia-cuda-runtime-cu12==12.1.105
-    # via torch
-nvidia-cudnn-cu12==8.9.2.26
-    # via torch
-nvidia-cufft-cu12==11.0.2.54
-    # via torch
-nvidia-curand-cu12==10.3.2.106
-    # via torch
-nvidia-cusolver-cu12==11.4.5.107
-    # via torch
-nvidia-cusparse-cu12==12.1.0.106
-    # via
-    #   nvidia-cusolver-cu12
-    #   torch
-nvidia-nccl-cu12==2.20.5
-    # via torch
-nvidia-nvjitlink-cu12==12.6.20
-    # via
-    #   nvidia-cusolver-cu12
-    #   nvidia-cusparse-cu12
-nvidia-nvtx-cu12==12.1.105
-    # via torch
 opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21
     # via -r pipeline/data/requirements/data.in
 packaging==24.1
     # via
     #   huggingface-hub
     #   transformers
 portalocker==2.3.0
-    # via mtdata
+    # via
+    #   mtdata
+    #   sacrebleu
 prefixed==0.7.1
     # via enlighten
 psutil==6.0.0
@@ -107,6 +82,7 @@ pyyaml==6.0.1
     #   transformers
 regex==2024.5.15
     # via
+    #   sacrebleu
     #   sacremoses
     #   simalign
     #   transformers
@@ -119,6 +95,8 @@ ruamel-yaml==0.18.6
     # via mtdata
 ruamel-yaml-clib==0.2.8
     # via ruamel-yaml
+sacrebleu==2.4.2
+    # via -r pipeline/data/requirements/data.in
 sacremoses==0.1.1
     # via opustrainer
 safetensors==0.4.3
@@ -139,11 +117,13 @@ six==1.16.0
     #   pybtex
 sympy==1.12.1
     # via torch
+tabulate==0.9.0
+    # via sacrebleu
 threadpoolctl==3.5.0
     # via scikit-learn
 tokenizers==0.19.1
     # via transformers
-torch==2.3.1
+torch==2.2.2
     # via simalign
 tqdm==4.66.4
     # via
@@ -152,8 +132,6 @@ tqdm==4.66.4
     #   transformers
 transformers==4.42.3
     # via simalign
-triton==2.3.1
-    # via torch
 typing-extensions==4.12.2
     # via
     #   huggingface-hub
pipeline/translate/{bestbleu.py → extract_best.py}
@@ -96,7 +96,7 @@ def marian_best_bleu(args, score_function):
 
 def compute_chrf(references, translation):
     hypo = " ".join(translation)
-    refs = [" ".join(r) for r in references][0]
+    refs = [" ".join(r) for r in references]
     return sacrebleu.sentence_chrf(hypo, refs).score
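
The one-line fix above is the "Fix chrf score function" change from the commit message: sacrebleu's sentence_chrf takes the hypothesis as a string and the references as a sequence of strings, and the old [0] indexing passed a bare string where that sequence is expected. A minimal sketch of the corrected call (an illustration, not part of the commit), using a sentence pair from the new test further down:

import sacrebleu

hypothesis = "Реформа, направленная на выдвижение условий, проходит слишком медленно."
reference = "Реформирование кондициональности проходит слишком медленно."

# References must be passed as a list of strings, even for a single reference.
score = sacrebleu.sentence_chrf(hypothesis, [reference]).score
print(f"chrF: {score:.2f}")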
1 change: 1 addition & 0 deletions pipeline/translate/requirements/extract_best.in
@@ -0,0 +1 @@
+sacrebleu==2.4.2
20 changes: 20 additions & 0 deletions pipeline/translate/requirements/extract_best.txt
@@ -0,0 +1,20 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile pipeline/translate/requirements/extract_best.in
+#
+colorama==0.4.6
+    # via sacrebleu
+lxml==5.3.0
+    # via sacrebleu
+numpy==2.1.2
+    # via sacrebleu
+portalocker==2.10.1
+    # via sacrebleu
+regex==2024.9.11
+    # via sacrebleu
+sacrebleu==2.4.2
+    # via -r pipeline/translate/requirements/extract_best.in
+tabulate==0.9.0
+    # via sacrebleu
2,252 changes: 1,333 additions & 919 deletions poetry.lock

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions pyproject.toml
@@ -30,9 +30,9 @@ marian-tensorboard = "^0.2.1"
 
 # This install group is for running local utilities.
 [tool.poetry.group.utils.dependencies]
-sacrebleu="2.0.0"
-mtdata="0.3.2"
-requests="2.26.0"
+sacrebleu="2.4.2"
+mtdata="0.4.1"
+requests="^2.26.0"
 humanize = "^4.9.0"
 blessed = "^1.20.0"
 huggingface-hub = "^0.20.3"
@@ -45,8 +45,9 @@ pyperclip="1.9.0"
 # list is only for things imported directly in the tests.
 ruamel-yaml = "^0.18.6"
 [tool.poetry.group.tests.dependencies]
-mtdata="0.3.2"
-requests="2.26.0"
+sacrebleu="2.4.2"
+mtdata="0.4.1"
+requests="^2.26.0"
 pytest="7.4.3"
 # use the latest main, switch to PyPi when released
 opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21"}
2 changes: 1 addition & 1 deletion snakemake/Snakefile
@@ -562,7 +562,7 @@ rule extract_best:
     #group 'translate_corpus'
     input: nbest=f"{translated}/corpus/file.{{part}}.nbest", ref=f"{translated}/corpus/file.{{part}}.ref"
     output: f"{translated}/corpus/file.{{part}}.nbest.out"
-    shell: 'python pipeline/translate/bestbleu.py -i {input.nbest} -r {input.ref} -m bleu -o {output} >> {log} 2>&1'
+    shell: 'python pipeline/translate/extract_best.py -i {input.nbest} -r {input.ref} -m bleu -o {output} >> {log} 2>&1'
 
 rule collect_corpus:
     message: "Collecting translated corpus"
2 changes: 0 additions & 2 deletions taskcluster/docker/toolchain-build/Dockerfile
@@ -37,8 +37,6 @@ RUN locale-gen "$LANG"
 
 RUN pip install zstandard
 
-# Required to download sacrebleu datasets
-RUN pip install sacrebleu mtdata
 
 # %include-run-task
 
2 changes: 0 additions & 2 deletions taskcluster/docker/train/Dockerfile
@@ -16,7 +16,5 @@ RUN apt-get update -qq \
     wget \
     && apt-get clean
 
-# Required to download sacrebleu datasets
-RUN pip install sacrebleu
 
 VOLUME /builds/worker/checkouts
15 changes: 9 additions & 6 deletions taskcluster/kinds/extract-best/kind.yml
@@ -31,7 +31,8 @@ tasks:
         cache:
             type: extract-best
             resources:
-                - pipeline/translate/bestbleu.py
+                - pipeline/translate/extract_best.py
+                - pipeline/translate/requirements/extract_best.txt
 
         task-context:
             from-parameters:
@@ -81,11 +82,13 @@ tasks:
             - >-
                 zstd -d --rm $MOZ_FETCHES_DIR/*.zst &&
                 export PYTHONPATH=$PYTHONPATH:$VCS_PATH &&
-                python3 $VCS_PATH/pipeline/translate/bestbleu.py
-                -i "$MOZ_FETCHES_DIR/file.{this_chunk}.nbest"
-                -r "$MOZ_FETCHES_DIR/file.{this_chunk}.ref"
-                -o $TASK_WORKDIR/artifacts/file.{this_chunk}.nbest.out
-                -m bleu
+                pip install --upgrade pip &&
+                pip install -r $VCS_PATH/pipeline/translate/requirements/extract_best.txt &&
+                python3 $VCS_PATH/pipeline/translate/extract_best.py
+                --nbest "$MOZ_FETCHES_DIR/file.{this_chunk}.nbest"
+                --references "$MOZ_FETCHES_DIR/file.{this_chunk}.ref"
+                --output $TASK_WORKDIR/artifacts/file.{this_chunk}.nbest.out
+                --metric chrf
 
     dependencies:
         # double curly braces are used for the chunk substitutions because
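
Note that the Snakefile change earlier keeps the short flags (-i/-r/-m/-o) while this task switches to the full argument names from the "Use full argument names" commit; both call sites can work if the script registers a short and a long alias for each option. A hypothetical argparse sketch consistent with both invocations (the real extract_best.py may differ):

import argparse

parser = argparse.ArgumentParser(
    description="Select the best hypothesis per sentence from a Marian n-best list."
)
parser.add_argument("-i", "--nbest", required=True, help="Marian n-best list file")
parser.add_argument("-r", "--references", required=True, help="reference translations, one per line")
parser.add_argument("-o", "--output", required=True, help="file to write the best hypotheses to")
parser.add_argument("-m", "--metric", choices=["bleu", "chrf"], default="chrf", help="sentence-level metric")
args = parser.parse_args()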
46 changes: 46 additions & 0 deletions tests/test_extract_best.py
@@ -0,0 +1,46 @@
+import os
+
+from fixtures import DataDir
+
+nbest = """0 ||| Реформа, направленная на выдвижение условий, идет слишком медленно. ||| F0= -9.21191 F1= -11.53 ||| -1.22059
+0 ||| Реформа, направленная на выдвижение условий, проходит слишком медленно. ||| F0= -10.1025 F1= -11.1262 ||| -1.24908
+0 ||| Реформа условий была слишком медленной. ||| F0= -6.67615 F1= -6.21271 ||| -1.28906
+0 ||| Реформа, направленная на выдвижение условий, идет слишком медленными темпами. ||| F0= -12.3186 F1= -13.4513 ||| -1.28906
+0 ||| Реформа, направленная на выдвижение условий, осуществляется слишком медленно. ||| F0= -9.85156 F1= -12.1434 ||| -1.29412
+0 ||| Реформа до обусловленности была слишком медленной. ||| F0= -9.59259 F1= -6.63153 ||| -1.35026
+0 ||| Реформа системы обусловленности была слишком медленной. ||| F0= -10.0087 F1= -7.58777 ||| -1.46484
+0 ||| Реформа системы обусловленности идет слишком медленно. ||| F0= -9.89215 F1= -6.91754 ||| -1.52699
+1 ||| Помощь по-прежнему носит фрагментарный характер, а доноры не координируют свою деятельность. ||| F0= -7.94812 F1= -8.49457 ||| -0.821875
+1 ||| Помощь по-прежнему раздроблена, а доноры не координируют свою деятельность. ||| F0= -7.23773 F1= -8.77496 ||| -0.842928
+1 ||| Помощь по-прежнему фрагментирована, а доноры не координируют свою деятельность. ||| F0= -8.4671 F1= -6.81989 ||| -0.848524
+1 ||| Помощь остается раздробленной, а доноры - нескоординированными. ||| F0= -7.95831 F1= -8.09065 ||| -0.891493
+1 ||| Помощь по-прежнему носит фрагментарный характер, а доноры не координируют свои действия. ||| F0= -9.28394 F1= -9.13013 ||| -0.920313
+1 ||| Помощь по-прежнему раздроблена, а доноры не координируются. ||| F0= -6.3092 F1= -9.73718 ||| -0.943934
+1 ||| Помощь по-прежнему раздроблена, а доноры не координируют свои усилия. ||| F0= -8.31525 F1= -9.74164 ||| -0.949836
+1 ||| Помощь по-прежнему фрагментирована, а доноры не координируются. ||| F0= -7.51941 F1= -7.84265 ||| -0.959961"""
+
+refs = """Реформирование кондициональности проходит слишком медленно.
+Помощь по-прежнему оказывается фрагментарно, а действия доноров не координируются."""
+
+
+def test_extract_best_chr():
+    data_dir = DataDir("test_extract_best")
+    data_dir.create_file("file.1.nbest", nbest)
+    data_dir.create_file("file.1.ref", refs)
+    data_dir.mkdir("artifacts")
+    env = {
+        "TEST_ARTIFACTS": data_dir.path,
+        "SRC": "en",
+        "TRG": "ru",
+    }
+
+    data_dir.run_task("extract-best-en-ru-1/10", env=env)
+
+    output_file = os.path.join(data_dir.path, "artifacts", "file.1.nbest.out")
+    assert os.path.isfile(output_file)
+    with open(output_file, "r") as f:
+        output = f.read()
+    assert (
+        output == "Реформа, направленная на выдвижение условий, проходит слишком медленно.\n"
+        "Помощь по-прежнему носит фрагментарный характер, а доноры не координируют свои действия.\n"
+    )
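
The nbest fixture is in Marian's n-best format ("sentence_id ||| hypothesis ||| feature scores ||| model score"), with several candidate translations per sentence id. A self-contained sketch of the selection logic the test exercises (hypothetical; the pipeline's extract_best.py is the authoritative implementation): group hypotheses by sentence id, score each against the matching reference with sentence-level chrF, and keep the best one.

from collections import defaultdict

import sacrebleu

def pick_best(nbest_text, refs_text):
    # Group candidate hypotheses by their sentence id.
    hypotheses = defaultdict(list)
    for line in nbest_text.splitlines():
        sent_id, hypothesis = line.split(" ||| ")[:2]
        hypotheses[int(sent_id)].append(hypothesis)

    # For each reference, keep the hypothesis with the highest chrF score.
    best = []
    for sent_id, ref in enumerate(refs_text.splitlines()):
        scored = [(sacrebleu.sentence_chrf(h, [ref]).score, h) for h in hypotheses[sent_id]]
        best.append(max(scored, key=lambda pair: pair[0])[1])
    return best

# Given the nbest and refs strings from the test above, this should
# reproduce the two lines the test asserts.
print("\n".join(pick_best(nbest, refs)))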
28 changes: 14 additions & 14 deletions tests/test_find_corpus.py
@@ -104,25 +104,25 @@ def test_opus_download_url(mock_opus_data, capsys):
 # mtdata has some deprecated dependencies
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_mtdata(requests_mock, capsys):
-    find_corpus(["en", "ca", "--importer", "mtdata"])
+    find_corpus(["en", "ckb", "--importer", "mtdata"])
     assert_stdout(
         capsys,
         "mtdata outputs nicely",
         """
 ┌────────────────────────────────────────────────┐
 │ mtdata - https://github.com/thammegowda/mtdata │
 └────────────────────────────────────────────────┘
 
-Dataset                                URL
-────────────────────────────────────── ───────────────────────────────────────────────────────────────────────────────────────────────────────────
-mtdata_ELRC-wikipedia_health-1-cat-eng https://elrc-share.eu/repository/download/ac6d557e8de811ea913100155d026706b0c5fee96b88489781ddd7675f8ea2ae/
-mtdata_Facebook-wikimatrix-1-cat-eng   https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ca-en.tsv.gz
-mtdata_Statmt-ccaligned-1-cat_ES-eng   http://www.statmt.org/cc-aligned/sentence-aligned/ca_ES-en_XX.tsv.xz
+Dataset                                   URL
+───────────────────────────────────────── ─────────────────────────────────────────────────────────────────────────
+mtdata_Flores-flores101_dev-1-ckb-eng     https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz
+mtdata_Flores-flores101_devtest-1-ckb-eng https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz
+mtdata_Statmt-ccaligned-1-ckb_IQ-eng      http://data.statmt.org/cc-aligned/sentence-aligned/cb_IQ-en_XX.tsv.xz
 
 YAML:
-  - mtdata_ELRC-wikipedia_health-1-cat-eng
-  - mtdata_Facebook-wikimatrix-1-cat-eng
-  - mtdata_Statmt-ccaligned-1-cat_ES-eng
+  - mtdata_Flores-flores101_dev-1-ckb-eng
+  - mtdata_Flores-flores101_devtest-1-ckb-eng
+  - mtdata_Statmt-ccaligned-1-ckb_IQ-eng
 """,
     )
 
@@ -139,9 +139,9 @@ def test_sacrebleu(requests_mock, capsys):
 └─────────────────────────────────────────────────┘
 
 Dataset   Description                             URLs
-───────── ─────────────────────────────────────── ──────────────────────────────────────────────────────
-wmt20     Official evaluation data for WMT20      http://data.statmt.org/wmt20/translation-task/test.tgz
-wmt20/dev Development data for tasks new to 2020. http://data.statmt.org/wmt20/translation-task/dev.tgz
+───────── ─────────────────────────────────────── ───────────────────────────────────────────────────────
+wmt20     Official evaluation data for WMT20      https://data.statmt.org/wmt20/translation-task/test.tgz
+wmt20/dev Development data for tasks new to 2020. https://data.statmt.org/wmt20/translation-task/dev.tgz
 
 YAML:
   - sacrebleu_wmt20
9 changes: 9 additions & 0 deletions utils/config_generator.py
@@ -45,6 +45,13 @@
     "lithuanian_legislation_seimas_lithuania",
     # Fails to load from OPUS.
     "SPC",
+    # MTdata duplicates Flores that we pull directly
+    "flores101_dev",
+    "flores101_devtest",
+    "flores200_dev",
+    "flores200_devtest",
+    # Skip OPUS WMT news test sets. They are used in our evaluation and shouldn't be used for training
+    "WMT-News",
 ]
 
 # Do not include small datasets. This works around #508, and minimizes dataset tasks that
@@ -185,6 +192,8 @@ def add_train_data(
     entries = fetch_mtdata(source, target)
 
     for corpus_key, entry in entries.items():
+        if entry.did.name in skip_datasets:
+            continue
         # mtdata can have test and devtest data as well.
         if entry.did.name.endswith("test"):
             dataset = datasets["test"]