
Commit 9455fc1

reduce cost

1 parent: fda6770

File tree: 2 files changed (+65 -37 lines)


testsuite/replay-verify/archive_disk_utils.py (+44 -27)
@@ -22,6 +22,10 @@
 MAINNET_SNAPSHOT_NAME = "mainnet-archive"


+def get_region_from_zone(zone):
+    return zone.rsplit("-", 1)[0]
+
+
 def get_kubectl_credentials(project_id, region, cluster_name):
     try:
         # Command to get kubectl credentials for the cluster
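
The new get_region_from_zone helper simply drops the zone suffix to obtain the region. A minimal sketch of the expected behaviour, with illustrative zone names:

# Minimal sketch; the zone strings are illustrative examples.
def get_region_from_zone(zone):
    # "us-central1-a" -> "us-central1"
    return zone.rsplit("-", 1)[0]

assert get_region_from_zone("us-central1-a") == "us-central1"
assert get_region_from_zone("europe-west1-b") == "europe-west1"
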
@@ -141,6 +145,8 @@ def create_snapshot_with_gcloud(
         source_disk_link,
         "--project",
         target_project,
+        "--storage-location",
+        get_region_from_zone(source_zone),
     ]

     try:
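
Pinning --storage-location to the source disk's region keeps the snapshot in regional rather than multi-regional storage, which appears to be part of the cost reduction. A hedged sketch of the resulting argument list; the command prefix and the example names are assumptions, only the two appended flags come from this hunk:

# Hypothetical illustration; the "gcloud compute snapshots create" prefix and the
# example names are assumptions, only the last two flags are taken from the diff.
source_zone = "us-central1-a"
cmd = [
    "gcloud", "compute", "snapshots", "create", "example-snapshot",
    "--source-disk", "example-source-disk-link",
    "--project", "example-project",
    "--storage-location", source_zone.rsplit("-", 1)[0],  # get_region_from_zone(source_zone)
]
print(" ".join(cmd))
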
@@ -156,6 +162,22 @@ def create_snapshot_with_gcloud(
         raise Exception(f"Error creating snapshot: {e}")


+def delete_disk(disk_client, project, zone, disk_name):
+    # Check if the disk already exists
+
+    try:
+        disk = disk_client.get(project=project, zone=zone, disk=disk_name)
+        logger.info(f"Disk {disk_name} already exists. Deleting it.")
+        # Delete the existing disk
+        operation = disk_client.delete(project=project, zone=zone, disk=disk_name)
+        wait_for_operation(
+            project, zone, operation.name, compute_v1.ZoneOperationsClient()
+        )
+        logger.info(f"Disk {disk_name} deleted.")
+    except Exception as e:
+        logger.info(f"Disk {e} {disk_name} does not exist, no delete needed.")
+
+
 # Creating disk from import snapshots
 # require getting a hold of the kubectrl of the cluster
 # eg: gcloud container clusters get-credentials replay-on-archive --region us-central1 --project replay-verify
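
The extracted delete_disk helper treats a missing disk as a no-op by catching the lookup error. A standalone usage sketch, assuming the google-cloud-compute v1 client already used in this file; the project, zone, and disk names are placeholders:

# Usage sketch; relies on delete_disk, wait_for_operation, and logger defined in
# this module. The placeholder names are not the real configuration.
from google.cloud import compute_v1

disk_client = compute_v1.DisksClient()
delete_disk(disk_client, "example-project", "us-central1-a", "example-disk")
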
@@ -172,19 +194,7 @@ def create_disk_pv_pvc_from_snapshot(
 ):
     disk_client = compute_v1.DisksClient()
     snapshot_client = compute_v1.SnapshotsClient()
-
-    # Check if the disk already exists
-    try:
-        disk = disk_client.get(project=project, zone=zone, disk=disk_name)
-        logger.info(f"Disk {disk_name} already exists. Deleting it.")
-        # Delete the existing disk
-        operation = disk_client.delete(project=project, zone=zone, disk=disk_name)
-        wait_for_operation(
-            project, zone, operation.name, compute_v1.ZoneOperationsClient()
-        )
-        logger.info(f"Disk {disk_name} deleted.")
-    except Exception as e:
-        logger.info(f"Disk {e} {disk_name} does not exist. Creating a new one.")
+    delete_disk(disk_client, project, zone, disk_name)

     # Create a new disk from the snapshot
     logger.info(f"Creating disk {disk_name} from snapshot {og_snapshot_name}.")
@@ -199,14 +209,16 @@ def create_disk_pv_pvc_from_snapshot(
     wait_for_operation(project, zone, operation.name, compute_v1.ZoneOperationsClient())
     logger.info(f"Disk {disk_name} created from snapshot {og_snapshot_name}.")

-    region_name = zone.rsplit("-", 1)[0]
+    region_name = get_region_from_zone(zone)
     get_kubectl_credentials(project, region_name, cluster_name)
     # create_persistent_volume(disk_name, pv_name, pvc_name, namespace, True)
     # this is only for xfs replaying logs to repair the disk
     repair_pv = f"{pv_name}-repair"
     repair_pvc = f"{pvc_name}-repair"
     repair_job_name = f"xfs-repair-{pvc_name}"
-    create_persistent_volume(disk_name, repair_pv, repair_pvc, namespace, False)
+    create_persistent_volume(
+        project, zone, disk_name, repair_pv, repair_pvc, namespace, False
+    )
     # start a pod to mount the disk and run simple task
     with open("xfs-disk-repair.yaml", "r") as f:
         pod_manifest = yaml.safe_load(f)
@@ -228,6 +240,9 @@ def create_disk_pv_pvc_from_snapshot(
         time.sleep(10)
     logger.info(f"creating final snapshot")
     create_snapshot_with_gcloud(snapshot_name, project, disk_name, zone, project)
+    logger.info("deleting repair pvc and corresponding pv and disks")
+    # delete the disk used for repair
+    delete_disk(disk_client, project, zone, disk_name)


 def is_job_pod_cleanedup(namespace, job_name):
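
With this addition the repair disk only lives for the duration of the repair: any stale disk is deleted up front, the repaired disk is snapshotted, and the disk itself is then deleted so it no longer accrues cost. A rough lifecycle sketch using the functions in this file; the argument values are placeholders and the middle steps are elided:

# Rough lifecycle sketch; placeholder names, and the middle steps
# (disk creation from the original snapshot, xfs repair job) are elided.
disk_client = compute_v1.DisksClient()
delete_disk(disk_client, "example-project", "us-central1-a", "example-disk")
# ... create the disk from the original snapshot and run the xfs repair job ...
create_snapshot_with_gcloud(
    "example-final-snapshot", "example-project", "example-disk", "us-central1-a", "example-project"
)
delete_disk(disk_client, "example-project", "us-central1-a", "example-disk")
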
@@ -255,7 +270,9 @@ def wait_for_operation(project, zone, operation_name, zone_operations_client):
         time.sleep(20)


-def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only):
+def create_persistent_volume(
+    project, zone, disk_name, pv_name, pvc_name, namespace, read_only
+):
     config.load_kube_config()
     v1 = client.CoreV1Api()

@@ -286,20 +303,22 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only)
         raise

     # Create PersistentVolume
+    volume_handle = f"projects/{project}/zones/{zone}/disks/{disk_name}"
     pv = client.V1PersistentVolume(
         api_version="v1",
         kind="PersistentVolume",
         metadata=client.V1ObjectMeta(name=pv_name),
         spec=client.V1PersistentVolumeSpec(
             capacity={"storage": "10000Gi"},
-            access_modes=["ReadOnlyMany"],
-            gce_persistent_disk=client.V1GCEPersistentDiskVolumeSource(
-                pd_name=disk_name,
+            access_modes=["ReadWriteOnce"],
+            csi=client.V1CSIPersistentVolumeSource(
+                driver="pd.csi.storage.gke.io",
+                volume_handle=volume_handle,
                 fs_type="xfs",
                 read_only=read_only,
             ),
-            persistent_volume_reclaim_policy="Retain",
-            storage_class_name="standard",
+            persistent_volume_reclaim_policy="Retain",  # this is to delete the PV and disk separately to speed up pv deletion
+            storage_class_name="ssd-data-xfs",
         ),
     )

@@ -309,9 +328,9 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only)
         kind="PersistentVolumeClaim",
         metadata=client.V1ObjectMeta(name=pvc_name, namespace=namespace),
         spec=client.V1PersistentVolumeClaimSpec(
-            access_modes=["ReadOnlyMany"],
+            access_modes=["ReadWriteOnce"],
             resources=client.V1ResourceRequirements(requests={"storage": "10000Gi"}),
-            storage_class_name="standard",
+            storage_class_name="ssd-data-xfs",
             volume_name=pv_name,
         ),
     )
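
The PV switches from the legacy in-tree gce_persistent_disk source to the GCE PD CSI driver, with the volume handle being the disk's full resource path, and both the PV and PVC now reference a "ssd-data-xfs" StorageClass that must already exist in the target cluster. A small sketch of the handle format plus an optional pre-flight check with the kubernetes client this module already uses; the names are placeholders:

# Handle format; illustrative names only.
project, zone, disk_name = "example-project", "us-central1-a", "example-disk"
volume_handle = f"projects/{project}/zones/{zone}/disks/{disk_name}"
assert volume_handle == "projects/example-project/zones/us-central1-a/disks/example-disk"

# Optional pre-flight check; assumes kubeconfig access to the target cluster.
from kubernetes import client, config

config.load_kube_config()
names = [sc.metadata.name for sc in client.StorageV1Api().list_storage_class().items]
assert "ssd-data-xfs" in names, "expected StorageClass ssd-data-xfs to exist in the cluster"
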
@@ -427,7 +446,7 @@ def create_pvcs_from_snapshot(run_id, snapshot_name, namespace, pvc_num, label):
     return res


-def create_disk_pv_pvc(
+def create_repair_disk_and_its_snapshot(
     project, zone, cluster_name, og_snapshot_name, snapshot_name, prefix, namespace
 ):
     tasks = []
@@ -462,8 +481,6 @@
         except Exception as e:
             logger.error(f"Task generated an exception: {e}")

-    # start a self deleteing job to mount the xfs disks for repairing
-

 def parse_args():
     parser = argparse.ArgumentParser(
@@ -506,7 +523,7 @@ def parse_args():
         source_namespace,
         project_id,
     )
-    create_disk_pv_pvc(
+    create_repair_disk_and_its_snapshot(
         project_id,
         zone,
         cluster_name,

testsuite/replay-verify/main.py (+21 -10)
@@ -28,6 +28,7 @@

 REPLAY_CONCURRENCY_LEVEL = 1

+
 class Network(Enum):
     TESTNET = 1
     MAINNET = 2
@@ -241,6 +242,7 @@ def get_pod_status(self):
     def get_humio_log_link(self):
         return construct_humio_url(self.label, self.name, self.start_time, time.time())

+
 class ReplayConfig:
     def __init__(self, network):
         if network == Network.TESTNET:
253255
self.concurrent_replayer = 18
254256
self.pvc_number = 8
255257
self.min_range_size = 10_000
256-
self.range_size = 2_000_000
258+
self.range_size = 2_000_000
257259
self.timeout_secs = 400
258260

261+
259262
class TaskStats:
260263
def __init__(self, name):
261264
self.name = name
@@ -308,7 +311,7 @@ def __init__(
308311
self.image = image
309312
self.pvcs = []
310313
self.config = replay_config
311-
314+
312315
def __str__(self):
313316
return f"""ReplayScheduler:
314317
id: {self.id}
@@ -360,7 +363,11 @@ def create_pvc_from_snapshot(self):
             else MAINNET_SNAPSHOT_NAME
         )
         pvcs = create_pvcs_from_snapshot(
-            self.id, snapshot_name, self.namespace, self.config.pvc_number, self.get_label()
+            self.id,
+            snapshot_name,
+            self.namespace,
+            self.config.pvc_number,
+            self.get_label(),
         )
         assert len(pvcs) == self.config.pvc_number, "failed to create all pvcs"
         self.pvcs = pvcs
@@ -504,12 +511,16 @@ def get_image(image_tag=None):
     shell = forge.LocalShell()
     git = forge.Git(shell)
     image_name = "tools"
-    default_latest_image = forge.find_recent_images(
-        shell,
-        git,
-        1,
-        image_name=image_name,
-    )[0] if image_tag is None else image_tag
+    default_latest_image = (
+        forge.find_recent_images(
+            shell,
+            git,
+            1,
+            image_name=image_name,
+        )[0]
+        if image_tag is None
+        else image_tag
+    )
     full_image = f"{forge.GAR_REPO_NAME}/{image_name}:{default_latest_image}"
     return full_image

@@ -546,7 +557,7 @@ def print_logs(failed_workpod_logs, txn_mismatch_logs):
         range_size=range_size,
         image=image,
         replay_config=config,
-        network= network,
+        network=network,
         namespace=args.namespace,
     )
     logger.info(f"scheduler: {scheduler}")