Add Inference Tasks to CI (#871)

* Export IS_DOCKER from taskcluster/docker/base/Dockerfile Ensures that the `IS_DOCKER` environment variable is set to `1` for all CI tasks, by exporting it within the base Dockerfile build. * Define Dockerfile for inference-related tasks in CI Defines a Dockerfile for an image that is capable of running tasks related to the inference engine in the taskcluster CI. * Refactor how inference tasks are run within docker Refactors the organization and error messages for running inference-related tasks from Taskfile. Tasks should now be run in docker locally using `task docker-run`. This is because when the tasks are running in docker on CI, they should not be calling out to the `docker` command itself: the environment is already within docker. * Add taskcluster kind for inference Adds a taskcluster kind definition for tasks to run related to the inference directory on each PR. Ensures that these tasks only run if files relevant to these tasks have been modified. * Rename "all*" kinds to "all*-pipeline" Renames taskcluster kinds that use the wording "all" to be called "all-pipeline" to improve the clarity of what they do.
mozilla · Oct 18, 2024 · bc20aa4 · bc20aa4
1 parent f1668c1
commit bc20aa4
Show file tree

Hide file tree

Showing 12 changed files with 195 additions and 15 deletions.
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -79,25 +79,25 @@ tasks:
     desc: Clean build artifacts from the inference directory.
     cmds:
       - >-
-          task docker-run -- ./inference/scripts/clean.sh
+          ./inference/scripts/clean.sh
 
   inference-build:
     desc: Build inference engine.
     cmds:
       - >-
-          task docker-run -- ./inference/scripts/build-local.sh
+          ./inference/scripts/build-local.sh
 
   inference-test:
     desc: Run inference tests.
     cmds:
       - >-
-          task docker-run -- ./inference/scripts/unit-tests.sh
+          ./inference/scripts/unit-tests.sh
 
   inference-build-wasm:
     desc: Build inference engine WASM.
     cmds:
       - >-
-          task docker-run -- ./inference/scripts/build-wasm.sh
+          ./inference/scripts/build-wasm.sh
 
   lint-black:
     desc: Checks the styling of the Python code with Black.

diff --git a/inference/scripts/detect-docker.sh b/inference/scripts/detect-docker.sh
@@ -2,15 +2,16 @@
 
 help_task=$1
 
-if [ -z "${IS_DOCKER}" ]; then
+if [ "${IS_DOCKER}" != "1" ]; then
   if [ "${ALLOW_RUN_ON_HOST}" != "1" ]; then
     echo >&2
     echo "Error: This script needs to be run inside Docker, or you must set ALLOW_RUN_ON_HOST=1." >&2
     echo >&2
     if [ -n "${help_task}" ]; then
-      echo " Help: To run this script directly in docker, run: task ${help_task}" >&2
+      echo " Help: To run this script directly in docker, run: task docker-run -- task ${help_task}" >&2
     fi
     echo " Help: To enter docker, run: task docker" >&2
+    echo >&2
     exit 1
   else
     echo >&2

diff --git a/taskcluster/docker/base/Dockerfile b/taskcluster/docker/base/Dockerfile
@@ -45,6 +45,9 @@ RUN pip install zstandard
 
 # %include-run-task
 
+# Allow scripts to detect if they are running in docker
+ENV IS_DOCKER=1
+
 ENV SHELL=/bin/bash \
     HOME=/builds/worker \
     PATH="/builds/worker/.local/bin:$PATH" \

diff --git a/taskcluster/docker/inference/Dockerfile b/taskcluster/docker/inference/Dockerfile
@@ -0,0 +1,51 @@
+FROM $DOCKER_IMAGE_PARENT
+LABEL maintainer="Mozilla Release Engineering <[email protected]>"
+
+RUN apt-get update -qq \
+    # We need to install tzdata before all of the other packages. Otherwise it will show an interactive dialog that
+    # we cannot navigate while building the Docker image.
+    && apt-get install -y tzdata \
+    && apt-get install -y wget \
+                          curl \
+                          zip \
+                          build-essential \
+                          gcc \
+                          g++ \
+                          make \
+                          cmake \
+                          libboost-dev \
+                          libboost-all-dev \
+                          libpcre2-dev \
+                          zstd \
+                          tar \
+                          libxml2 \
+                          libhunspell-dev \
+                          bc  \
+                          autoconf \
+                          automake \
+                          autopoint \
+                          libtool \
+    && apt-get clean
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | apt-key add -
+COPY intel-mkl.list /etc/apt/sources.list.d/intel-mkl.list
+
+RUN apt-get update -qq \
+    && apt-get install -y intel-mkl-64bit-2020.0-088 \
+    && apt-get clean
+
+RUN locale-gen "$LANG"
+
+# Install taskfile - https://taskfile.dev/
+# Keep the version in sync with docker/Dockerfile.
+RUN curl -sSLf "https://github.com/go-task/task/releases/download/v3.35.1/task_linux_amd64.tar.gz" \
+    | tar -xz -C /usr/local/bin
+
+ENV SHELL=/bin/bash \
+    HOME=/builds/worker \
+    PATH="/builds/worker/.local/bin:$PATH"
+
+VOLUME /builds/worker/checkouts
+VOLUME /builds/worker/.cache
+
+USER root
diff --git a/taskcluster/docker/inference/intel-mkl.list b/taskcluster/docker/inference/intel-mkl.list
@@ -0,0 +1 @@
+deb https://apt.repos.intel.com/mkl all main
diff --git a/taskcluster/kinds/all/kind.yml → taskcluster/kinds/all-pipeline/kind.yml b/taskcluster/kinds/all/kind.yml → taskcluster/kinds/all-pipeline/kind.yml
@@ -12,8 +12,8 @@ transforms:
     - taskgraph.transforms.task:transforms
 
 # In order for tasks to be produced as "leaves" of the task graph, they must be attached
-# as dependencies to the "all" dummy task. This file is for full training runs. See
-# "all-pr" for integrating into the CI training runs.
+# as dependencies to the "all-pipeline" dummy task. This file is for full training runs.
+# See "all-pr-pipeline" for integrating into the CI training runs.
 kind-dependencies:
     - export
     - evaluate
@@ -23,10 +23,10 @@ kind-dependencies:
     - analyze-mono
 
 tasks:
-    all:
+    all-pipeline:
         description: Dummy task that ensures all parts of training pipeline will run
         attributes:
-            stage: all
+            stage: all-pipeline
             src_locale: "{src_locale}"
             trg_locale: "{trg_locale}"
 

diff --git a/taskcluster/kinds/all-pr/kind.yml → taskcluster/kinds/all-pr-pipeline/kind.yml b/taskcluster/kinds/all-pr/kind.yml → taskcluster/kinds/all-pr-pipeline/kind.yml
@@ -23,13 +23,13 @@ kind-dependencies:
     - analyze-mono
 
 tasks:
-    all-pr:
+    all-pr-pipeline:
         description: Dummy task that ensures all parts of training pipeline will run
         attributes:
-            stage: all-pr
+            stage: all-pr-pipeline
             src_locale: "{src_locale}"
             trg_locale: "{trg_locale}"
-        
+
         task-context:
             from-parameters:
                 src_locale: training_config.experiment.src

diff --git a/taskcluster/kinds/docker-image/kind.yml b/taskcluster/kinds/docker-image/kind.yml
@@ -9,6 +9,9 @@ transforms:
 tasks:
     base:
         symbol: Base
+    inference:
+        parent: base
+        symbol: Inference
     test:
         parent: base
         symbol: Test

diff --git a/taskcluster/kinds/inference/kind.yml b/taskcluster/kinds/inference/kind.yml
@@ -0,0 +1,68 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+---
+
+loader: taskgraph.loader.transform:loader
+
+transforms:
+    - translations_taskgraph.transforms.skip_unless_inference_changed
+    - translations_taskgraph.transforms.worker_selection
+    - taskgraph.transforms.task_context
+    - taskgraph.transforms.run:transforms
+    - taskgraph.transforms.task:transforms
+
+kind-dependencies:
+    - toolchain
+
+task-defaults:
+  task-context:
+    from-parameters:
+      base_rev: base_rev
+    substitution-fields:
+      - run.command
+    worker:
+  run:
+    using: run-task
+    cwd: '{checkout}'
+  worker-type: b-cpu
+  worker:
+    max-run-time: 3600
+    docker-image: {in-tree: inference}
+    # 128 happens when cloning this repository fails
+    retry-exit-status: [128]
+
+tasks:
+  build-local:
+    description: "Build the inference engine locally"
+    run-on-tasks-for: ["github-push", "github-pull-request"]
+    run:
+      command:
+        - bash
+        - -c
+        - >-
+            task inference-build
+
+  test-local:
+    description: "Run local-build tests for the inference engine"
+    dependencies:
+      build: inference-build-local
+    run-on-tasks-for: ["github-pull-request"]
+    run:
+      command:
+        - bash
+        - -c
+        - >-
+            task inference-test
+
+  build-wasm:
+    description: "Build the wasm bindings for the inference engine"
+    dependencies:
+      build-local: inference-build-local
+    run-on-tasks-for: ["github-pull-request"]
+    run:
+      command:
+        - bash
+        - -c
+        - >-
+            task inference-build-wasm
diff --git a/taskcluster/test/test_default_params.py b/taskcluster/test/test_default_params.py
@@ -11,6 +11,7 @@
     {
         "substitute_digest": {
             "build-docker-image-base": "digest_base",
+            "build-docker-image-inference": "digest_inference",
             "build-docker-image-test": "digest_test",
             "build-docker-image-toolchain-build": "digest_toolchain",
             "build-docker-image-train": "digest_train",
@@ -25,6 +26,10 @@
                             "namespace": "translations.cache.level-3.docker-images.v2.base.hash.{digest_base}",
                             "taskId": "build-docker-image-base",
                         },
+                        {
+                            "namespace": "translations.cache.level-3.docker-images.v2.inference.hash.{digest_inference}",
+                            "taskId": "build-docker-image-inference",
+                        },
                         {
                             "namespace": "translations.cache.level-3.docker-images.v2.test.hash.{digest_test}",
                             "taskId": "build-docker-image-test",
@@ -57,6 +62,13 @@
                             },
                             "taskId": "build-docker-image-base",
                         },
+                        {
+                            "status": {
+                                "state": "completed",
+                                "expires": "3024-08-21T22:37:28.781Z",
+                            },
+                            "taskId": "build-docker-image-inference",
+                        },
                         {
                             "status": {
                                 "state": "completed",
@@ -89,7 +101,7 @@
 
 def test_last_task_is_targeted(target_task_set: TaskGraph):
     """Ensure that the last task in the pipeline is targeted by default"""
-    assert any([task == "all-ru-en-1" for task in target_task_set.tasks])
+    assert any([task == "all-pipeline-ru-en-1" for task in target_task_set.tasks])
 
 
 def test_cached_tasks_optimized_away(optimized_task_graph: TaskGraph):

diff --git a/taskcluster/translations_taskgraph/transforms/skip_unless_inference_changed.py b/taskcluster/translations_taskgraph/transforms/skip_unless_inference_changed.py
@@ -0,0 +1,41 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# This transform sequence will remove all jobs unless at least one
+# inference-impacting thing (an inference script or relevant Taskcluster code)
+# has changed. (This is done with the `files_changed` helper, which uses data in
+# the parameters to determine files changed between the `base` and `head` revisions.)
+
+# When upstream taskgraph supports better selection (https://github.com/taskcluster/taskgraph/issues/369)
+# this can be replaced with it.
+
+import os
+from pathlib import Path
+
+from taskgraph.transforms.base import TransformSequence
+
+KIND_DIR = Path(__file__).parent.parent.parent / "kinds"
+
+# Kinds are slightly special - there are some kinds that don't affect inference,
+# and changing them shouldn't force inference to run.
+INCLUDE_KINDS = ["inference"]
+# Touching any file in any of these directories is considered an inference change
+INFERENCE_DIRS = [
+    "inference/**",
+    "taskcluster/docker/inference/**",
+]
+INFERENCE_DIRS.extend(
+    f"taskcluster/kinds/{kind}" for kind in os.listdir(KIND_DIR) if kind in INCLUDE_KINDS
+)
+
+transforms = TransformSequence()
+
+
+@transforms.add
+def skip_unless_inference_changed(config, jobs):
+    for job in jobs:
+        job.setdefault("optimization", {})
+        job["optimization"]["skip-unless-changed"] = INFERENCE_DIRS
+
+        yield job
diff --git a/tests/test_preflight_check.py b/tests/test_preflight_check.py
@@ -42,7 +42,7 @@ def test_task_group():
     assert "Training config" not in output
     assert "Visualization" not in output
 
-    assert "all-en-ru-1" in output
+    assert "all-pipeline-en-ru-1" in output
     assert not opened_url