 MAINNET_SNAPSHOT_NAME = "mainnet-archive"


+def get_region_from_zone(zone):
+    return zone.rsplit("-", 1)[0]
+
+
 def get_kubectl_credentials(project_id, region, cluster_name):
     try:
         # Command to get kubectl credentials for the cluster
@@ -141,6 +145,8 @@ def create_snapshot_with_gcloud(
         source_disk_link,
         "--project",
         target_project,
+        "--storage-location",
+        get_region_from_zone(source_zone),
     ]

     try:
@@ -156,6 +162,22 @@ def create_snapshot_with_gcloud(
         raise Exception(f"Error creating snapshot: {e}")


+def delete_disk(disk_client, project, zone, disk_name):
+    # Check if the disk already exists
+
+    try:
+        disk = disk_client.get(project=project, zone=zone, disk=disk_name)
+        logger.info(f"Disk {disk_name} already exists. Deleting it.")
+        # Delete the existing disk
+        operation = disk_client.delete(project=project, zone=zone, disk=disk_name)
+        wait_for_operation(
+            project, zone, operation.name, compute_v1.ZoneOperationsClient()
+        )
+        logger.info(f"Disk {disk_name} deleted.")
+    except Exception as e:
+        logger.info(f"Disk {e} {disk_name} does not exist, no delete needed.")
+
+
 # Creating disk from import snapshots
 # require getting a hold of the kubectrl of the cluster
 # eg: gcloud container clusters get-credentials replay-on-archive --region us-central1 --project replay-verify
@@ -172,19 +194,7 @@ def create_disk_pv_pvc_from_snapshot(
 ):
     disk_client = compute_v1.DisksClient()
     snapshot_client = compute_v1.SnapshotsClient()
-
-    # Check if the disk already exists
-    try:
-        disk = disk_client.get(project=project, zone=zone, disk=disk_name)
-        logger.info(f"Disk {disk_name} already exists. Deleting it.")
-        # Delete the existing disk
-        operation = disk_client.delete(project=project, zone=zone, disk=disk_name)
-        wait_for_operation(
-            project, zone, operation.name, compute_v1.ZoneOperationsClient()
-        )
-        logger.info(f"Disk {disk_name} deleted.")
-    except Exception as e:
-        logger.info(f"Disk {e} {disk_name} does not exist. Creating a new one.")
+    delete_disk(disk_client, project, zone, disk_name)

     # Create a new disk from the snapshot
     logger.info(f"Creating disk {disk_name} from snapshot {og_snapshot_name}.")
@@ -199,14 +209,16 @@ def create_disk_pv_pvc_from_snapshot(
     wait_for_operation(project, zone, operation.name, compute_v1.ZoneOperationsClient())
     logger.info(f"Disk {disk_name} created from snapshot {og_snapshot_name}.")

-    region_name = zone.rsplit("-", 1)[0]
+    region_name = get_region_from_zone(zone)
     get_kubectl_credentials(project, region_name, cluster_name)
     # create_persistent_volume(disk_name, pv_name, pvc_name, namespace, True)
     # this is only for xfs replaying logs to repair the disk
     repair_pv = f"{pv_name}-repair"
     repair_pvc = f"{pvc_name}-repair"
     repair_job_name = f"xfs-repair-{pvc_name}"
-    create_persistent_volume(disk_name, repair_pv, repair_pvc, namespace, False)
+    create_persistent_volume(
+        project, zone, disk_name, repair_pv, repair_pvc, namespace, False
+    )
     # start a pod to mount the disk and run simple task
     with open("xfs-disk-repair.yaml", "r") as f:
         pod_manifest = yaml.safe_load(f)
@@ -228,6 +240,9 @@ def create_disk_pv_pvc_from_snapshot(
         time.sleep(10)
     logger.info(f"creating final snapshot")
     create_snapshot_with_gcloud(snapshot_name, project, disk_name, zone, project)
+    logger.info("deleting repair pvc and corresponding pv and disks")
+    # delete the disk used for repair
+    delete_disk(disk_client, project, zone, disk_name)


 def is_job_pod_cleanedup(namespace, job_name):
@@ -255,7 +270,9 @@ def wait_for_operation(project, zone, operation_name, zone_operations_client):
         time.sleep(20)


-def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only):
+def create_persistent_volume(
+    project, zone, disk_name, pv_name, pvc_name, namespace, read_only
+):
     config.load_kube_config()
     v1 = client.CoreV1Api()

@@ -286,20 +303,22 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only)
         raise

     # Create PersistentVolume
+    volume_handle = f"projects/{project}/zones/{zone}/disks/{disk_name}"
     pv = client.V1PersistentVolume(
         api_version="v1",
         kind="PersistentVolume",
         metadata=client.V1ObjectMeta(name=pv_name),
         spec=client.V1PersistentVolumeSpec(
             capacity={"storage": "10000Gi"},
-            access_modes=["ReadOnlyMany"],
-            gce_persistent_disk=client.V1GCEPersistentDiskVolumeSource(
-                pd_name=disk_name,
+            access_modes=["ReadWriteOnce"],
+            csi=client.V1CSIPersistentVolumeSource(
+                driver="pd.csi.storage.gke.io",
+                volume_handle=volume_handle,
                 fs_type="xfs",
                 read_only=read_only,
             ),
-            persistent_volume_reclaim_policy="Retain",
-            storage_class_name="standard",
+            persistent_volume_reclaim_policy="Retain",  # this is to delete the PV and disk separately to speed up pv deletion
+            storage_class_name="ssd-data-xfs",
         ),
     )

@@ -309,9 +328,9 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only)
         kind="PersistentVolumeClaim",
         metadata=client.V1ObjectMeta(name=pvc_name, namespace=namespace),
         spec=client.V1PersistentVolumeClaimSpec(
-            access_modes=["ReadOnlyMany"],
+            access_modes=["ReadWriteOnce"],
             resources=client.V1ResourceRequirements(requests={"storage": "10000Gi"}),
-            storage_class_name="standard",
+            storage_class_name="ssd-data-xfs",
             volume_name=pv_name,
         ),
     )
@@ -427,7 +446,7 @@ def create_pvcs_from_snapshot(run_id, snapshot_name, namespace, pvc_num, label):
     return res


-def create_disk_pv_pvc(
+def create_repair_disk_and_its_snapshot(
     project, zone, cluster_name, og_snapshot_name, snapshot_name, prefix, namespace
 ):
     tasks = []
@@ -462,8 +481,6 @@ def create_disk_pv_pvc(
         except Exception as e:
             logger.error(f"Task generated an exception: {e}")

-    # start a self deleteing job to mount the xfs disks for repairing
-

 def parse_args():
     parser = argparse.ArgumentParser(
@@ -506,7 +523,7 @@ def parse_args():
         source_namespace,
         project_id,
     )
-    create_disk_pv_pvc(
+    create_repair_disk_and_its_snapshot(
         project_id,
         zone,
         cluster_name,