diff --git a/docs/task-cluster.md b/docs/task-cluster.md index 5bc9b9c4d..5ca1d8f06 100644 --- a/docs/task-cluster.md +++ b/docs/task-cluster.md @@ -115,6 +115,12 @@ previous_group_ids: ["SsGpi3TGShaDT-h93fHL-g"] Note: This feature should _never_ be used for production training, as it completely bypasses all caching mechanisms, and you will most likely end up with invalid or useless models. +## Dealing with expired upstream tasks + +All tasks eventually expire, and have their artifacts and metadata deleted from Taskcluster, typically 1 year after creation. This can cause problems if it happens while partway through a training session. This happens most commonly with tasks that are shared across multiple training runs, such as `toolchain` and `docker-image` tasks. When this happens you can use the "Rebuild Docker Images and Toolchains" action to rebuild these, and add the task group they are rebuilt in to the `previous_group_ids` when kicking off a training run. + +You may also use this action directly prior to kicking off the start of a new language pair training to ensure that it uses fresh toolchains and docker images, which will typically avoid this problem altogether. + ## Interactive Tasks Taskcluster allows authorized users to run so-called [interactive tasks](https://docs.taskcluster.net/docs/reference/workers/docker-worker/features#feature-interactive). These tasks allow users to gain a shell in the same environment that a pipeline step runs in. This can often be useful for quicker debugging or testing of ideas. 
diff --git a/taskcluster/translations_taskgraph/__init__.py b/taskcluster/translations_taskgraph/__init__.py
index c90682666..5a0d42d3c 100644
--- a/taskcluster/translations_taskgraph/__init__.py
+++ b/taskcluster/translations_taskgraph/__init__.py
@@ -5,6 +5,7 @@ def register(graph_config):
     _import_modules(
         [
             "actions.train",
+            "actions.rebuild_docker_images_and_toolchains",
             "parameters",
             "target_tasks",
         ]
diff --git a/taskcluster/translations_taskgraph/actions/rebuild_docker_images_and_toolchains.py b/taskcluster/translations_taskgraph/actions/rebuild_docker_images_and_toolchains.py
new file mode 100644
index 000000000..8bd80c990
--- /dev/null
+++ b/taskcluster/translations_taskgraph/actions/rebuild_docker_images_and_toolchains.py
@@ -0,0 +1,47 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from taskgraph.actions.registry import register_callback_action
+from taskgraph.actions.util import create_tasks, fetch_graph_and_labels
+
+
+@register_callback_action(
+    name="rebuild-docker-images-and-toolchains",
+    title="Rebuild Docker Images and Toolchains",
+    symbol="images-and-toolchains",
+    description="Create docker-image and toolchain tasks to rebuild their artifacts.",
+    order=1000,
+    context=[],
+)
+def rebuild_docker_images_and_toolchains_action(
+    parameters, graph_config, input, task_group_id, task_id
+):
+    """Create every docker-image, fetch and toolchain task of the task graph.
+
+    Calls ``fetch_graph_and_labels`` for ``task_group_id``, collects the labels
+    of all tasks whose kind is one of the three above, and hands them to
+    ``create_tasks``. Nothing is created when no task of those kinds exists.
+    """
+    rebuilt_kinds = ("docker-image", "fetch", "toolchain")
+
+    decision_task_id, full_task_graph, label_to_task_id = fetch_graph_and_labels(
+        parameters, graph_config, task_group_id=task_group_id
+    )
+
+    labels = [
+        label
+        for label, task in full_task_graph.tasks.items()
+        if task.kind in rebuilt_kinds
+    ]
+    if not labels:
+        return
+
+    create_tasks(
+        graph_config,
+        labels,
+        full_task_graph,
+        label_to_task_id,
+        parameters,
+        decision_task_id,
+    )