ENH add tools in the repo

tomMoral · tomMoral · commit 0c5ba65000b8 · 2025-11-17T11:32:18.000+01:00
diff --git a/README.md b/README.md
@@ -59,6 +59,8 @@ public and private test set.
   the repository structure.
 - `tools/Dockerfile`: Dockerfile to build the docker image that will be used to
   run the ingestion and scoring programs.
+- `tools/run_docker.py`: convenience script to build and test the docker image
+  locally, without having to know the docker commands. See [this section](#setting-up-and-testing-the-docker-image) for more details.
 
 ## Instruction to create the codabench bundle
 
diff --git a/tools/Dockerfile b/tools/Dockerfile
@@ -0,0 +1,20 @@
+# Step 1: Start from an official Docker image with the desired base
+# environment. Good starting points are the official Codalab images or the
+# PyTorch images with CUDA support:
+#    - Codalab: codalab/codalab-legacy:py39
+#    - Codalab GPU: codalab/codalab-legacy:gpu310
+#    - Pytorch: pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime
+FROM codalab/codalab-legacy:py39
+
+# Set environment variables to prevent interactive prompts during installs
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Step 2: Install system-level dependencies (if any), e.g., git, wget, or
+# common libraries for OpenCV like libgl1. For now, this only upgrades pip.
+RUN pip install -U pip
+
+# Step 3: Copy and pre-install all Python dependencies
+# This 'requirements.txt' file should list pandas, scikit-learn, timm, etc.
+# Place it in the same directory as this Dockerfile.
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
diff --git a/tools/create_bundle.py b/tools/create_bundle.py
@@ -0,0 +1,50 @@
+"""Create the ``bundle.zip`` archive for the codabench competition.
+
+The bundle gathers the files listed in ``BUNDLE_FILES`` and the content of
+the ingestion, scoring, pages and phase data folders. All paths are relative
+to the current working directory.
+"""
+import zipfile
+from pathlib import Path
+
+
+PAGES_DIR = Path("pages")
+INGESTION_DIR = Path("ingestion_program")
+SCORING_DIR = Path("scoring_program")
+PHASE_DATA = Path("dev_phase")
+
+BUNDLE_FILES = [
+    "competition.yaml",
+    "logo.png",
+    "solution/submission.py",
+]
+BUNDLE_DIRS = [INGESTION_DIR, SCORING_DIR, PAGES_DIR, PHASE_DATA]
+
+
+if __name__ == "__main__":
+
+    # Check every input before opening the archive, so that a missing path
+    # does not leave a partial `bundle.zip` behind. An explicit exception is
+    # raised instead of `assert`, which is skipped when running `python -O`.
+    for path in [*map(Path, BUNDLE_FILES), *BUNDLE_DIRS]:
+        if not path.exists():
+            raise FileNotFoundError(
+                f"{path} does not exist while it should. Make sure you "
+                "followed all the instructions in the README before "
+                "creating the bundle."
+            )
+
+    with zipfile.ZipFile("bundle.zip", mode='w') as bundle:
+
+        for f in BUNDLE_FILES:
+            print(f)
+            bundle.write(f)
+        for dirpath in BUNDLE_DIRS:
+            for f in dirpath.rglob("*"):
+                if not f.is_file():
+                    continue
+                # Skip hidden files and compiled bytecode.
+                if f.name.startswith('.') or f.name.endswith('.pyc'):
+                    continue
+                print(f)
+                bundle.write(f)
diff --git a/tools/run_docker.py b/tools/run_docker.py
@@ -0,0 +1,66 @@
+"""Build the docker image and run the ingestion and scoring programs.
+
+The image is built from the ``tools`` folder, which holds the Dockerfile and
+its ``requirements.txt``. The ingestion program and the scoring program
+are then run in containers with the repository folders mounted as volumes.
+"""
+from pathlib import Path
+try:
+    import docker
+except ImportError as err:
+    raise ImportError(
+        "The 'docker' package is required to run this script. "
+        "Please install it using 'pip install docker'."
+    ) from err
+
+# This script lives in `tools/`, next to the Dockerfile.
+TOOLS_DIR = Path(__file__).resolve().parent
+REPO = TOOLS_DIR.parent
+IMAGE_TAG = "tommoral/template:v1"
+
+if __name__ == "__main__":
+    client = docker.from_env()
+    print("Docker client initialized successfully.")
+
+    print("Building Docker image...")
+    # Use the folder holding the Dockerfile as build context rather than the
+    # current working directory, so the script works from any location.
+    client.images.build(path=str(TOOLS_DIR), tag=IMAGE_TAG)
+    print(f"Docker image built successfully with tag '{IMAGE_TAG}'.")
+
+    print("Running Docker container...")
+    # Ingestion step: `ingestion_res` on the host is mounted as the
+    # container's `/app/output`.
+    logs = client.containers.run(
+        image=IMAGE_TAG,
+        command="python3 /app/ingestion_program/ingestion.py",
+        remove=True,
+        name="ingestion",
+        user="root",
+        volumes=[
+            f"{REPO}/ingestion_program:/app/ingestion_program",
+            f"{REPO}/dev_phase/input_data:/app/input_data",
+            f"{REPO}/ingestion_res:/app/output",
+            f"{REPO}/solution:/app/ingested_program",
+        ]
+    )
+    print(logs.decode("utf-8"))
+    # Scoring step: `ingestion_res` is mounted at `/app/input/res` and the
+    # reference data at `/app/input/ref`.
+    # NOTE(review): `scoring_res` is mounted over the whole `/app/` folder;
+    # confirm this is intended rather than `/app/output`.
+    logs = client.containers.run(
+        image=IMAGE_TAG,
+        command="python3 /app/scoring_program/scoring.py",
+        remove=True,
+        name="scoring",
+        user="root",
+        volumes=[
+            f"{REPO}/scoring_program:/app/scoring_program",
+            f"{REPO}/dev_phase/reference_data:/app/input/ref",
+            f"{REPO}/ingestion_res:/app/input/res",
+            f"{REPO}/scoring_res:/app/",
+        ]
+    )
+    print(logs.decode("utf-8"))
+    print("Docker container ran successfully.")
diff --git a/tools/setup_data.py b/tools/setup_data.py
@@ -0,0 +1,66 @@
+# Script to download the data from a given source and create the splits.
+# This is a mock version that generates a fake classification problem.
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+
+PHASE = 'dev_phase'
+
+DATA_DIR = Path(PHASE) / 'input_data'
+REF_DIR = Path(PHASE) / 'reference_data'
+
+
+def make_csv(data, filepath):
+    """Save ``data`` as a CSV file at ``filepath``, without the index.
+
+    Missing parent folders of ``filepath`` are created first.
+    """
+    filepath.parent.mkdir(parents=True, exist_ok=True)
+    frame = pd.DataFrame(data)
+    frame.to_csv(filepath, index=False)
+
+
+if __name__ == "__main__":
+
+    import argparse
+    cli = argparse.ArgumentParser(
+        description='Load or generate data for the benchmark'
+    )
+    cli.add_argument(
+        '--seed', type=int, default=42,
+        help='Random seed for data generation',
+    )
+    seed = cli.parse_args().seed
+
+    # A single random state drives both the generation and the two splits.
+    rng = np.random.RandomState(seed)
+    X, y = make_classification(n_samples=500, n_features=5, random_state=rng)
+
+    # First hold out 40% of the samples, then cut this held-out part in two
+    # halves: the public test set and the private test set.
+    X_train, X_heldout, y_train, y_heldout = train_test_split(
+        X, y, test_size=0.4, random_state=rng
+    )
+    X_test, X_private_test, y_test, y_private_test = train_test_split(
+        X_heldout, y_heldout, test_size=0.5, random_state=rng
+    )
+    splits = {
+        'train': (X_train, y_train),
+        'test': (X_test, y_test),
+        'private_test': (X_private_test, y_private_test),
+    }
+
+    # Features of every split go to input_data/<split>/. Labels are stored
+    # next to the features for the train split only; test labels go to
+    # reference_data so they are kept secret and used for scoring.
+    for name, (features, labels) in splits.items():
+        features_dir = DATA_DIR / name
+        make_csv(features, features_dir / f'{name}_features.csv')
+        if name == "train":
+            labels_dir = features_dir
+        else:
+            labels_dir = REF_DIR
+        make_csv(labels, labels_dir / f'{name}_labels.csv')