Skip to content

Commit 0c5ba65

Browse files
committed
ENH add tools in the repo
1 parent aec576b commit 0c5ba65

File tree

5 files changed

+160
-0
lines changed

5 files changed

+160
-0
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ public and private test set.
5959
the repository structure.
6060
- `tools/Dockerfile`: Dockerfile to build the docker image that will be used to
6161
run the ingestion and scoring programs.
62+
- `tools/run_docker.py`: convenience script to build and test the docker image
63+
locally without knowing docker commands. See [here](#setting-up-and-testing-the-docker-image) for more details.
6264

6365
## Instruction to create the codabench bundle
6466

tools/Dockerfile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Step 1: Start from an official Docker image with the desired base environment.
# Good starting points are the official codalab images or
# pytorch images with CUDA support:
# - Codalab: codalab/codalab-legacy:py39
# - Codalab GPU: codalab/codalab-legacy:gpu310
# - Pytorch: pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime
FROM codalab/codalab-legacy:py39

# Set environment variables to prevent interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Step 2: Install system-level dependencies (if any)
# e.g., git, wget, or common libraries for OpenCV like libgl1.
# Add the corresponding install instructions here when they are needed.
# pip itself is upgraded before the Python dependencies are installed;
# --no-cache-dir matches the install in Step 3 and avoids storing pip's
# download cache in the image layer.
RUN pip install --no-cache-dir -U pip

# Step 3: Copy and pre-install all Python dependencies
# This 'requirements.txt' file should list pandas, scikit-learn, timm, etc.
# Place it in the same directory as this Dockerfile.
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

tools/create_bundle.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import zipfile
2+
from pathlib import Path
3+
4+
5+
PAGES_DIR = Path("pages")
INGESTION_DIR = Path("ingestion_program")
SCORING_DIR = Path("scoring_program")
PHASE_DATA = Path("dev_phase")

# Individual files copied into the bundle, relative to the repository root.
BUNDLE_FILES = [
    "competition.yaml",
    "logo.png",
    "solution/submission.py",
]

# Directories whose content is copied recursively into the bundle.
BUNDLE_DIRS = [INGESTION_DIR, SCORING_DIR, PAGES_DIR, PHASE_DATA]


def iter_bundle_files(dirpath):
    """Yield the files under ``dirpath`` that belong in the bundle.

    Files are yielded in sorted order so the archive layout does not depend
    on the filesystem's listing order. A file is skipped when its name, or the
    name of any directory between ``dirpath`` and the file, starts with a dot
    (e.g. ``.DS_Store``, ``.ipynb_checkpoints/``), or when it ends in ``.pyc``.

    Parameters
    ----------
    dirpath : str or path-like
        Directory to walk recursively.

    Yields
    ------
    pathlib.Path
        Path of each selected file, prefixed by ``dirpath``.
    """
    dirpath = Path(dirpath)
    for f in sorted(dirpath.rglob("*")):
        if not f.is_file():
            continue
        # Check every component below dirpath, not only the file name, so
        # that the content of hidden directories is excluded as well.
        if any(part.startswith('.') for part in f.relative_to(dirpath).parts):
            continue
        if f.name.endswith('.pyc'):
            continue
        yield f


def create_bundle(bundle_path="bundle.zip", root="."):
    """Create the bundle archive from the files and folders under ``root``.

    Parameters
    ----------
    bundle_path : str or path-like, default "bundle.zip"
        Location of the zip archive to write.
    root : str or path-like, default "."
        Directory containing ``BUNDLE_FILES`` and ``BUNDLE_DIRS``. Archive
        member names are relative to this directory.

    Raises
    ------
    FileNotFoundError
        If a required file or directory is missing. The check runs before
        the archive is opened, so no partial bundle is left behind.
    """
    root = Path(root)

    # Validate with an explicit exception: ``assert`` is stripped under
    # ``python -O`` and would let an incomplete bundle be written.
    missing = [str(f) for f in BUNDLE_FILES if not (root / f).is_file()]
    missing += [str(d) for d in BUNDLE_DIRS if not (root / d).is_dir()]
    if missing:
        raise FileNotFoundError(
            f"{', '.join(missing)} does not exist while it should. Make sure "
            "you followed all the instructions in the README before "
            "creating the bundle."
        )

    with zipfile.ZipFile(
        bundle_path, mode='w', compression=zipfile.ZIP_DEFLATED
    ) as bundle:
        for f in BUNDLE_FILES:
            print(f)
            bundle.write(root / f, arcname=f)
        for dirpath in BUNDLE_DIRS:
            for f in iter_bundle_files(root / dirpath):
                relpath = f.relative_to(root)
                print(relpath)
                bundle.write(f, arcname=relpath.as_posix())


if __name__ == "__main__":
    create_bundle()

tools/run_docker.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from pathlib import Path
2+
try:
3+
import docker
4+
except ImportError:
5+
raise ImportError(
6+
"The 'docker' package is required to run this script. "
7+
"Please install it using 'pip install docker'."
8+
)
9+
10+
REPO = Path(__file__).resolve().parent.parent

# Build context: the directory holding this script, which is also where the
# Dockerfile (and the requirements.txt it copies) is expected to live.
# Resolving it from __file__ makes the build independent of the directory
# the script is launched from.
TOOLS_DIR = Path(__file__).resolve().parent

# Tag given to the locally built image and used to start the containers.
IMAGE_TAG = "tommoral/template:v1"


def run_container(client, name, command, volumes):
    """Run ``command`` in a new container of ``IMAGE_TAG`` and print its logs.

    The container runs as root and is removed once it has finished.

    Parameters
    ----------
    client : docker.DockerClient
        Client used to start the container.
    name : str
        Name given to the container.
    command : str
        Command executed inside the container.
    volumes : list of str
        Bind mounts in the ``"host_path:container_path"`` form.
    """
    logs = client.containers.run(
        image=IMAGE_TAG,
        command=command,
        remove=True,
        name=name,
        user="root",
        volumes=volumes,
    )
    print(logs.decode("utf-8"))


if __name__ == "__main__":
    client = docker.from_env()
    print("Docker client initialized successfully.")

    print("Building Docker image...")
    client.images.build(path=str(TOOLS_DIR), tag=IMAGE_TAG)
    print(f"Docker image built successfully with tag '{IMAGE_TAG}'.")

    # Create the output directories up front rather than relying on the
    # Docker daemon to create missing bind-mount sources.
    for out_dir in (REPO / "ingestion_res", REPO / "scoring_res"):
        out_dir.mkdir(exist_ok=True)

    print("Running Docker container...")
    run_container(
        client,
        name="ingestion",
        command="python3 /app/ingestion_program/ingestion.py",
        volumes=[
            f"{REPO}/ingestion_program:/app/ingestion_program",
            f"{REPO}/dev_phase/input_data:/app/input_data",
            f"{REPO}/ingestion_res:/app/output",
            f"{REPO}/solution:/app/ingested_program",
        ],
    )
    run_container(
        client,
        name="scoring",
        command="python3 /app/scoring_program/scoring.py",
        volumes=[
            f"{REPO}/scoring_program:/app/scoring_program",
            f"{REPO}/dev_phase/reference_data:/app/input/ref",
            f"{REPO}/ingestion_res:/app/input/res",
            # NOTE(review): scoring_res is mounted on /app/ itself while the
            # other mounts are nested below it -- confirm this is where
            # scoring.py writes its results (e.g. versus /app/output).
            f"{REPO}/scoring_res:/app/",
        ],
    )
    print("Docker container ran successfully.")

tools/setup_data.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Script to download the data from a given source and create the splits
2+
# This is a mock version that generates fake problems
3+
from pathlib import Path
4+
5+
import numpy as np
6+
import pandas as pd
7+
from sklearn.datasets import make_classification
8+
from sklearn.model_selection import train_test_split
9+
10+
PHASE = 'dev_phase'
11+
12+
DATA_DIR = Path(PHASE) / 'input_data'
13+
REF_DIR = Path(PHASE) / 'reference_data'
14+
15+
16+
def make_csv(data, filepath):
    """Write ``data`` to ``filepath`` as a CSV file without the index column.

    Missing parent directories of ``filepath`` are created first.

    Parameters
    ----------
    data : array-like or dict
        Anything accepted by the ``pandas.DataFrame`` constructor.
    filepath : str or path-like
        Destination of the CSV file.
    """
    # Coerce to Path so plain strings are accepted as well as Path objects.
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(data).to_csv(filepath, index=False)
19+
20+
21+
if __name__ == "__main__":

    import argparse

    # Command line: only the seed of the random generator is configurable.
    cli = argparse.ArgumentParser(
        description='Load or generate data for the benchmark'
    )
    cli.add_argument(
        '--seed', type=int, default=42,
        help='Random seed for data generation',
    )
    seed = cli.parse_args().seed

    # One shared random state drives the generation and both splits, in
    # this order: generate, train/holdout split, public/private split.
    rng = np.random.RandomState(seed)
    X, y = make_classification(n_samples=500, n_features=5, random_state=rng)
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.4, random_state=rng
    )
    X_public, X_private, y_public, y_private = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=rng
    )

    # Layout on disk:
    # - input_data/<split>/ holds the features of every split, plus the
    #   labels of the train split only, so the test labels are kept secret
    # - reference_data/ holds the labels of the test splits for scoring
    splits = {
        'train': (X_train, y_train),
        'test': (X_public, y_public),
        'private_test': (X_private, y_private),
    }
    for split, (features, labels) in splits.items():
        features_dir = DATA_DIR / split
        make_csv(features, features_dir / f'{split}_features.csv')
        if split == "train":
            labels_dir = features_dir
        else:
            labels_dir = REF_DIR
        make_csv(labels, labels_dir / f'{split}_labels.csv')

0 commit comments

Comments
 (0)