Skip to content

Commit 51c3f07

Browse files
committed
add code
1 parent a612cb6 commit 51c3f07

30 files changed

+1143
-124
lines changed

commands.txt

+8-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
# get credentials from shell for cluster
22
gcloud container clusters get-credentials cluster-3-nodes --zone us-central1-c --project dsd-demo-370521
33

4+
# submit a Cloud Build job that will be executed remotely via gcloud
5+
gcloud builds submit --config ogbl-cloudbuild.yaml .
6+
47
# add crd for pytorch and other training operators
5-
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0"
8+
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0"
69

710
# create disk
8-
gcloud compute disks create gke-pv --zone=us-central1-c --size=50G
11+
gcloud compute disks create gke-pv --zone=us-central1-c --size=50G
12+
13+
# Create a firewall rule to allow TCP traffic on your node port
14+
gcloud compute firewall-rules create test-node-port --allow tcp:NODE_PORT

mnist-cloudbuild.yaml mnist/mnist-cloudbuild.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ steps:
44
[
55
'build',
66
'-t',
7-
'gcr.io/dsd-demo-370521/pytorch-dis-mnist:4.0',
7+
'gcr.io/dsd-demo-370521/pytorch-dis-mnist:5.0',
88
'-f',
99
'mnist.Dockerfile',
1010
'.',
1111
]
1212
- name: 'gcr.io/cloud-builders/docker'
13-
args: ['push', 'gcr.io/dsd-demo-370521/pytorch-dis-mnist:4.0']
13+
args: ['push', 'gcr.io/dsd-demo-370521/pytorch-dis-mnist:5.0']

mnist-pytorch.yaml mnist/mnist-pytorch.yaml

+6-4
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,16 @@ spec:
1515
spec:
1616
containers:
1717
- name: pytorch
18-
image: gcr.io/dsd-demo-370521/pytorch-dis-mnist:4.0
18+
image: gcr.io/dsd-demo-370521/pytorch-dis-mnist:5.0
19+
imagePullPolicy: Always
1920
args: ['--backend', 'gloo', '--epochs', '5']
2021
volumeMounts:
2122
- name: savedmodel
22-
mountPath: /savedmodel/fds
23+
mountPath: /app/src/tensorboard
2324
volumes:
2425
- name: savedmodel
2526
persistentVolumeClaim:
26-
claimName: pv-ogbl-claim
27+
claimName: nfs
2728

2829
Worker:
2930
replicas: 4
@@ -35,5 +36,6 @@ spec:
3536
spec:
3637
containers:
3738
- name: pytorch
38-
image: gcr.io/dsd-demo-370521/pytorch-dis-mnist:4.0
39+
image: gcr.io/dsd-demo-370521/pytorch-dis-mnist:5.0
40+
imagePullPolicy: Always
3941
args: ['--backend', 'gloo', '--epochs', '5']

mnist.Dockerfile mnist/mnist.Dockerfile

+6-7
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,12 @@ RUN pip install torch-geometric -f https://data.pyg.org/whl/torch-1.13.0+cu116.h
99
RUN pip install ogb
1010
RUN pip install networkx
1111

12-
RUN mkdir -p /opt/mnist
12+
RUN mkdir -p /app
1313

14-
WORKDIR /opt/mnist/src
15-
ADD mnist.py /opt/mnist/src/mnist.py
16-
ADD train.py /opt/mnist/src/train.py
14+
WORKDIR /app/src
15+
ADD mnist.py /app/src/mnist.py
1716

18-
RUN chgrp -R 0 /opt/mnist \
19-
&& chmod -R g+rwX /opt/mnist
17+
RUN chgrp -R 0 /app/src \
18+
&& chmod -R g+rwX /app/src
2019

21-
ENTRYPOINT ["python", "/opt/mnist/src/mnist.py"]
20+
ENTRYPOINT ["python", "/app/src/mnist.py"]

mnist.py mnist/mnist.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import argparse
44
import os
5+
import pathlib
56

67
from tensorboardX import SummaryWriter
78
from torchvision import datasets, transforms
@@ -93,7 +94,7 @@ def main():
9394
help='random seed (default: 1)')
9495
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
9596
help='how many batches to wait before logging training status')
96-
parser.add_argument('--save-model', action='store_true', default=False,
97+
parser.add_argument('--save-model', action='store_true', default=True,
9798
help='For Saving the current Model')
9899
parser.add_argument('--dir', default='logs', metavar='L',
99100
help='directory where summary logs are stored')
@@ -106,7 +107,8 @@ def main():
106107
if use_cuda:
107108
print('Using CUDA')
108109

109-
writer = SummaryWriter(args.dir)
110+
pathlib.Path("./tensorboard/logs/").mkdir(parents=True, exist_ok=True)
111+
writer = SummaryWriter("./tensorboard/logs/")
110112

111113
torch.manual_seed(args.seed)
112114

@@ -147,8 +149,9 @@ def main():
147149
train(args, model, device, train_loader, optimizer, epoch, writer)
148150
test(args, model, device, test_loader, writer, epoch)
149151

152+
pathlib.Path("./tensorboard/models/").mkdir(parents=True, exist_ok=True)
150153
if (args.save_model):
151-
torch.save(model.state_dict(),"mnist_cnn.pt")
154+
torch.save(model.state_dict(),"./tensorboard/models/mnist_cnn.pt")
152155

153156
if __name__ == '__main__':
154157
main()

nfs/nfs-deployment.yaml

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
apiVersion: apps/v1  # Deployment is served by the apps API group; networking.k8s.io/v1 has no Deployment kind
2+
kind: Deployment
3+
metadata:
4+
name: nfs-server
5+
spec:
6+
replicas: 1
7+
selector:
8+
matchLabels:
9+
role: nfs-server
10+
template:
11+
metadata:
12+
labels:
13+
role: nfs-server
14+
spec:
15+
containers:
16+
- name: nfs-server
17+
image: gcr.io/google_containers/volume-nfs:latest
18+
imagePullPolicy: Always
19+
ports:
20+
- name: nfs
21+
containerPort: 2049
22+
- name: mountd
23+
containerPort: 20048
24+
- name: rpcbind
25+
containerPort: 111
26+
securityContext:
27+
privileged: true
28+
volumeMounts:
29+
- mountPath: /exports
30+
name: mypvc
31+
volumes:
32+
- name: mypvc
33+
gcePersistentDisk:
34+
pdName: gke-nfs
35+
fsType: ext4

nfs/nfs-service.yaml

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: nfs-server
5+
spec:
6+
# clusterIP: 10.3.240.20
7+
ports:
8+
- name: nfs
9+
port: 2049
10+
- name: mountd
11+
port: 20048
12+
- name: rpcbind
13+
port: 111
14+
selector:
15+
role: nfs-server

nfs/pv-nfs.yaml

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
apiVersion: v1
2+
kind: PersistentVolume
3+
metadata:
4+
name: nfs
5+
spec:
6+
capacity:
7+
storage: 20Gi
8+
accessModes:
9+
- ReadWriteMany
10+
nfs:
11+
server: 10.116.4.18
12+
path: '/'
13+
14+
---
15+
kind: PersistentVolumeClaim
16+
apiVersion: v1
17+
metadata:
18+
name: nfs
19+
spec:
20+
accessModes:
21+
- ReadWriteMany
22+
storageClassName: ''
23+
resources:
24+
requests:
25+
storage: 20Gi

nginx/nginx.yaml

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: clear-nginx-deployment
5+
spec:
6+
selector:
7+
matchLabels:
8+
app: clear-nginx
9+
template:
10+
metadata:
11+
labels:
12+
app: clear-nginx
13+
spec:
14+
containers:
15+
- name: clear-nginx
16+
image: clearlinux/nginx
17+
imagePullPolicy: Always
18+
volumeMounts:
19+
- mountPath: /var/www/html
20+
name: site-data
21+
ports:
22+
- containerPort: 80
23+
volumes:
24+
- name: site-data
25+
persistentVolumeClaim:
26+
claimName: pv-ogbl-claim
27+
---
28+
apiVersion: v1
29+
kind: Service
30+
metadata:
31+
name: clear-nginx-service
32+
spec:
33+
ports:
34+
- name: http
35+
port: 80
36+
protocol: TCP
37+
targetPort: 80
38+
selector:
39+
app: clear-nginx
40+
type: NodePort

ogbl-pytorch.yaml

-41
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
apiVersion: 'kubeflow.org/v1'
2+
kind: 'PyTorchJob'
3+
metadata:
4+
name: 'm-1-w-15-e-10-pytorch-dist-ogbl-gloo'
5+
namespace: 'default'
6+
spec:
7+
pytorchReplicaSpecs:
8+
Master:
9+
replicas: 1
10+
restartPolicy: OnFailure
11+
template:
12+
metadata:
13+
annotations:
14+
sidecar.istio.io/inject: 'false'
15+
spec:
16+
containers:
17+
- name: pytorch
18+
image: gcr.io/dsd-demo-370521/pytorch-distributed-ogbl:1.0
19+
imagePullPolicy: Always
20+
args:
21+
[
22+
'--backend',
23+
'gloo',
24+
'--epochs',
25+
'10',
26+
'--exp-name',
27+
'm-1-w-15-e-10',
28+
]
29+
volumeMounts:
30+
- name: savedmodel
31+
mountPath: /app/src/tensorboard
32+
volumes:
33+
- name: savedmodel
34+
persistentVolumeClaim:
35+
claimName: nfs
36+
37+
Worker:
38+
replicas: 15
39+
restartPolicy: OnFailure
40+
template:
41+
metadata:
42+
annotations:
43+
sidecar.istio.io/inject: 'false'
44+
spec:
45+
containers:
46+
- name: pytorch
47+
image: gcr.io/dsd-demo-370521/pytorch-distributed-ogbl:1.0
48+
imagePullPolicy: Always
49+
args:
50+
[
51+
'--backend',
52+
'gloo',
53+
'--epochs',
54+
'10',
55+
'--exp-name',
56+
'm-1-w-15-e-10',
57+
]
58+
volumeMounts:
59+
- name: savedmodel
60+
mountPath: /app/src/tensorboard
61+
volumes:
62+
- name: savedmodel
63+
persistentVolumeClaim:
64+
claimName: nfs

0 commit comments

Comments
 (0)