-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_exp.sh
87 lines (77 loc) · 2.58 KB
/
run_exp.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/bin/bash
# Experiment configuration for the trainer.py sweep driven by this script.

# ---- Cluster layout ---------------------------------------------------------
# Host-list files, read line by line further down in this script.
ps_names=servers
worker_names=workers
# Upper bound on the number of workers in any deployment.
num_wrk=1000
# Both are filled in from the server host list before the command is built.
num_ps=0
master=""
port=29400

# ---- Training hyper-parameters ----------------------------------------------
dataset="cifar10"      # alternatives noted by the author: tinyimagenet, mnist
iter=1050              # author's note: 100000
batch=128
optimizer="sgd"
loss="cross-entropy"
# Author's note: for resnet, lr=0.2 for cifar; reduced to 0.1 for tiny imagenet.
lr=0.2

# ---- Remaining options forwarded verbatim to trainer.py ---------------------
r_col=8
lambda=0
fps=0
augmenteddataset="none"
augmentedfolder="none"
savedir="batchSize128/"
#######################################
# Count the lines of a host-list file.
# Arguments:
#   $1 - path of the host-list file (one host per line)
#   $2 - optional cap; counting stops as soon as the count equals it.
#        Omitted, empty or 0 means "no cap".
# Outputs:
#   The number of lines counted, on stdout (0 if the file cannot be opened).
# Notes:
#   Every newline-terminated line counts, including blank ones. A final line
#   with no trailing newline is NOT counted, because `read` reports failure
#   for it. That matches how the launch loops of this script iterate over the
#   same files, so counts and launched ranks stay in agreement.
#######################################
count_hosts() {
  local hostfile=$1
  local cap=${2:-0}
  local total=0
  local host
  # -r: take each line literally instead of treating backslashes as escapes.
  while read -r host; do
    total=$((total + 1))
    if [[ $total -eq $cap ]]; then
      break
    fi
  done < "$hostfile"
  printf '%d\n' "$total"
}

# The first (newline-terminated) line of the server list is the master that
# every node is pointed at; an empty or missing list leaves $master untouched.
if read -r first_ps < "$ps_names"; then
  master=$first_ps
fi

# Deployment size: every listed server, and at most $num_wrk workers.
num_ps=$(count_hosts "$ps_names")
num_workers=$(count_hosts "$worker_names" "$num_wrk")
# Command line shared by every trainer.py invocation (servers and workers
# alike); the per-run options (model, attack, fw, gar, rank, seed) are
# appended when each node is launched.
#
# trainer.py is addressed through the shell-maintained $PWD, i.e. the
# directory this script is started from. The former `pwd` helper variable was
# never read, so it has been dropped.
#
# The single quotes around the --opt_args JSON are part of the string on
# purpose: the command is handed to a remote shell via ssh, and that shell is
# what strips them, keeping the JSON together as one argument.
common="python3 $PWD/trainer.py --master $master --num_iter $iter --dataset $dataset --batch $batch --loss $loss"
common+=" --optimizer $optimizer --opt_args '{\"lr\":\"$lr\",\"momentum\":\"0.9\",\"weight_decay\":\"0.0005\"}' --num_ps $num_ps --num_workers $num_workers --fps $fps"
common+=" --port $port --r_col $r_col --lambda $lambda --augmenteddataset $augmenteddataset --augmentedfolder $augmentedfolder --savedir $savedir"
#######################################
# Start one trainer.py process on a remote host, in the background.
# Globals (read): common, model, attack, fw, gar, seed, USER
# Arguments:
#   $1 - host to ssh into
#   $2 - rank to pass to trainer.py
# Outputs:
#   Echoes the full remote command and the target host to stdout.
#######################################
launch_node() {
  local host=$1
  local rank=$2
  # $USER is expanded locally, so the remote conda path uses the local name.
  local cmd="export PYTHONPATH=/home/miniconda3/envs/garfield/lib/python3.8/site-packages;source /home/evl/$USER/miniconda3/bin/activate;conda activate garfield;ulimit -n 102400;$common --model $model --attack $attack --fw $fw --gar $gar --rank $rank --seed $seed"
  # -n: ssh must not read stdin. The caller's stdin is the host list feeding
  # its `while read` loop, and ssh(1) requires -n for background use. Unlike
  # the previous `< /dev/tty`, this also works without a controlling terminal
  # (nohup, cron, detached sessions).
  ssh -n "$host" "$cmd" &
  echo "Running $cmd on $host"
}

# Sweep over every (model, attack, fw, gar, seed) combination. For each one,
# start a process on every server and worker host, then block until all of
# them have exited before moving on to the next combination.
# The trailing comments keep the value lists left behind by the author.
for model in "resnet18" "resnet50"; do #"resnet18" "resnet50"
  for attack in "random"; do # "random" "drop"
    for fw in 1 2 3; do #0 1 2 3
      for gar in "flag" "bulyan" "krum" "median" "meamed" "phocas" "trmean"; do #"flag" "bulyan" "krum" "median" "average" "trmean" "phocas" "meamed"
        for seed in 2021 2022 2023; do #2021 2022 2023
          # Short pause between consecutive combinations.
          sleep 5

          # Servers take ranks 0..N-1, in host-list order.
          i=0
          while read -r p; do
            launch_node "$p" "$i"
            i=$((i + 1))
          done < "$ps_names"

          # Workers continue the rank sequence; at most $num_wrk are started.
          count_workers=0
          while read -r p; do
            launch_node "$p" "$i"
            i=$((i + 1))
            count_workers=$((count_workers + 1))
            if [[ $count_workers -eq $num_wrk ]]; then
              break
            fi
          done < "$worker_names"

          echo "Waiting for the current combination to finish..."
          # Barrier: returns once every background ssh started above exits.
          wait
        done
      done
    done
  done
done