# Script to download the data from a given source and create the splits.
# This is a mock version that generates fake problems.
3+ from pathlib import Path
4+
5+ import numpy as np
6+ import pandas as pd
7+ from sklearn .datasets import make_classification
8+ from sklearn .model_selection import train_test_split
9+
10+ PHASE = 'dev_phase'
11+
12+ DATA_DIR = Path (PHASE ) / 'input_data'
13+ REF_DIR = Path (PHASE ) / 'reference_data'
14+
15+
def make_csv(data, filepath):
    """Write ``data`` to ``filepath`` as a CSV file, without the index column.

    Parameters
    ----------
    data : array-like
        Anything ``pandas.DataFrame`` can be built from (e.g. a NumPy array
        or a list of rows).
    filepath : str or os.PathLike
        Destination file. Missing parent directories are created first.
    """
    # Normalise so plain string paths work as well as ``Path`` objects.
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(data).to_csv(filepath, index=False)
19+
20+
def main(argv=None):
    """Generate a mock classification problem and write the benchmark splits.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments to parse. ``None`` lets ``argparse`` read
        them from ``sys.argv``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Load or generate data for the benchmark'
    )
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed for data generation')
    args = parser.parse_args(argv)

    # Generate and split the data. The same RandomState instance is passed
    # to every call, so the whole pipeline is driven by the single seed.
    rng = np.random.RandomState(args.seed)
    X, y = make_classification(n_samples=500, n_features=5, random_state=rng)
    # First hold out 40% of the samples ...
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=rng
    )
    # ... then halve the held-out part into the test and private test sets.
    X_test, X_private_test, y_test, y_private_test = train_test_split(
        X_test, y_test, test_size=0.5, random_state=rng
    )

    # Store the data in the correct folders:
    # - input_data contains train data (both features and labels) and only
    #   test features so the test labels are kept secret
    # - reference_data contains the test labels for scoring
    for split, X_split, y_split in [
        ('train', X_train, y_train),
        ('test', X_test, y_test),
        ('private_test', X_private_test, y_private_test),
    ]:
        split_dir = DATA_DIR / split
        make_csv(X_split, split_dir / f'{split}_features.csv')
        # Only the train labels live next to their features.
        label_dir = split_dir if split == "train" else REF_DIR
        make_csv(y_split, label_dir / f'{split}_labels.csv')


if __name__ == "__main__":
    main()
0 commit comments