# kind.yml
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---
# Loader that builds this kind's tasks from the `tasks` section of this file.
loader: taskgraph.loader.transform:loader

# Transform chain, listed in the order the entries are applied.
transforms:
  - translations_taskgraph.transforms.from_datasets:per_dataset
  - taskgraph.transforms.task_context
  - taskgraph.transforms.job:transforms
  - translations_taskgraph.transforms.cached_tasks:transforms
  - taskgraph.transforms.task:transforms

# Kinds whose tasks this kind refers to (see `dependencies` / `fetches`
# in the task definition, which point at `dataset-...` tasks).
kind-dependencies:
  - dataset

tasks:
  # Task template for cleaning one parallel training corpus.
  #
  # `{use_opuscleaner}` is read from the parameter path given under
  # `task-context.from-parameters`. The remaining placeholders
  # ({provider}, {dataset}, {dataset_sanitized}, {src_locale}, {trg_locale})
  # are presumably supplied per dataset by the `from_datasets:per_dataset`
  # transform -- confirm against that transform.
  "{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}":
    description: Clean {provider} {dataset_sanitized} dataset {src_locale}-{trg_locale} use OpusCleaner {use_opuscleaner}

    attributes:
      cleaning-type: clean-corpus
      dataset-category: train
      stage: clean-corpus
      # NOTE(review): nesting of `cache` under `attributes` was reconstructed
      # from key order; presumably consumed by the `cached_tasks` transform.
      cache:
        type: clean-corpus
        from-parameters:
          use_opuscleaner: training_config.experiment.use-opuscleaner
        # Pipeline files listed as cache resources for this task.
        resources:
          - pipeline/clean/clean-corpus.sh
          - pipeline/clean/tools/deescape-special-chars.perl
          - pipeline/clean/tools/remove-non-printing-char.perl
          - pipeline/clean/tools/clean_parallel.py
          - pipeline/clean/tools/langid_fasttext.py
          - pipeline/clean/opuscleaner/generate_filters.py
          - pipeline/clean/opuscleaner/clean-corpus.sh
          - pipeline/clean/opuscleaner/configs/remove_frequent_patterns.txt
          - pipeline/clean/opuscleaner/configs/default.filters.json
          - pipeline/clean/opuscleaner/configs/ru-en/opus_ELRC-3075-wikipedia_health-v1.ru-en.filters.json
          - pipeline/clean/requirements/clean.txt

    # Fields in which `{use_opuscleaner}` is substituted from parameters.
    task-context:
      from-parameters:
        use_opuscleaner: training_config.experiment.use-opuscleaner
      substitution-fields:
        - description
        - worker.env

    worker-type: b-linux-large-300gb

    # Dataset selection plus the fields that receive the per-dataset
    # placeholder substitution.
    dataset-config:
      category: train
      substitution-fields:
        - description
        - name
        - dependencies
        - fetches
        - worker.env
        - run.command

    expires-after: "90 days"

    worker:
      docker-image: {"in-tree": "train"}
      max-run-time: 86400
      artifacts:
        - name: public/build
          path: /builds/worker/artifacts
          type: directory
      # Environment variables passed to the task; USE_OPUSCLEANER is read
      # by the shell command under `run.command` below.
      env:
        SRC: "{src_locale}"
        TRG: "{trg_locale}"
        USE_OPUSCLEANER: "{use_opuscleaner}"
        COMPRESSION_CMD: zstdmt
        ARTIFACT_EXT: zst

    # Don't run unless explicitly scheduled
    run-on-tasks-for: []

    run:
      using: run-task
      # We're going to migrate to OpusCleaner in the long term, so for now a
      # simple 'if' in the command selects the script directory without
      # introducing an extra step and complexity.
      # TODO: should redirection of stderr to stdout be done automatically for all TaskCluster tasks?
      #
      # The folded scalar (`>-`) joins the three lines below into a single
      # shell command line: install the cleaning requirements, choose
      # `clean/opuscleaner` or `clean` based on USE_OPUSCLEANER, then run that
      # directory's clean-corpus.sh with stderr merged into stdout.
      # NOTE(review): `${USE_OPUSCLEANER}` is unquoted inside `[ ... ]`; an
      # empty value makes the test itself error out and fall through to the
      # `else` branch. Left unchanged here -- confirm whether quoting is wanted.
      command:
        - bash
        - -c
        - >-
          pip install -r $VCS_PATH/pipeline/clean/requirements/clean.txt &&
          if [ ${USE_OPUSCLEANER} = "true" ]; then dir="clean/opuscleaner"; else dir="clean"; fi &&
          $VCS_PATH/pipeline/${dir}/clean-corpus.sh $MOZ_FETCHES_DIR/{dataset_sanitized} /builds/worker/artifacts/{dataset_sanitized} auto {dataset} 2>&1

    # The upstream `dataset` task whose artifacts are fetched below.
    dependencies:
      "{provider}": dataset-{provider}-{dataset_sanitized}-{src_locale}-{trg_locale}

    # Source- and target-language files from the dataset task, fetched
    # without extraction (`extract: false`).
    fetches:
      "{provider}":
        - artifact: "{dataset_sanitized}.{src_locale}.zst"
          extract: false
        - artifact: "{dataset_sanitized}.{trg_locale}.zst"
          extract: false