Skip to content

Commit

Permalink
reduce cost
Browse files Browse the repository at this point in the history
  • Loading branch information
areshand committed Dec 3, 2024
1 parent fda6770 commit 55b3cd5
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 32 deletions.
60 changes: 38 additions & 22 deletions testsuite/replay-verify/archive_disk_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
MAINNET_SNAPSHOT_NAME = "mainnet-archive"


def get_region_from_zone(zone):
return zone.rsplit("-", 1)[0]


def get_kubectl_credentials(project_id, region, cluster_name):
try:
# Command to get kubectl credentials for the cluster
Expand Down Expand Up @@ -141,6 +145,8 @@ def create_snapshot_with_gcloud(
source_disk_link,
"--project",
target_project,
"--storage-location",
get_region_from_zone(source_zone),
]

try:
Expand Down Expand Up @@ -199,7 +205,7 @@ def create_disk_pv_pvc_from_snapshot(
wait_for_operation(project, zone, operation.name, compute_v1.ZoneOperationsClient())
logger.info(f"Disk {disk_name} created from snapshot {og_snapshot_name}.")

region_name = zone.rsplit("-", 1)[0]
region_name = get_region_from_zone(zone)
get_kubectl_credentials(project, region_name, cluster_name)
# create_persistent_volume(disk_name, pv_name, pvc_name, namespace, True)
# this is only for xfs replaying logs to repair the disk
Expand Down Expand Up @@ -228,6 +234,17 @@ def create_disk_pv_pvc_from_snapshot(
time.sleep(10)
logger.info(f"creating final snapshot")
create_snapshot_with_gcloud(snapshot_name, project, disk_name, zone, project)
logger.info("deleting repair pvc and correpsonding pv and disks")
# delete the PVC used for repair
try:
client.CoreV1Api().delete_namespaced_persistent_volume_claim(
name=repair_pvc, namespace=namespace
)
except ApiException as e:
if e.status == 404:
print(f"PersistentVolumeClaim '{repair_pvc}' not found.")
else:
print(f"Exception when deleting PersistentVolumeClaim: {e}")


def is_job_pod_cleanedup(namespace, job_name):
Expand Down Expand Up @@ -287,20 +304,21 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only)

# Create PersistentVolume
pv = client.V1PersistentVolume(
api_version="v1",
kind="PersistentVolume",
metadata=client.V1ObjectMeta(name=pv_name),
spec=client.V1PersistentVolumeSpec(
capacity={"storage": "10000Gi"},
access_modes=["ReadOnlyMany"],
gce_persistent_disk=client.V1GCEPersistentDiskVolumeSource(
pd_name=disk_name,
fs_type="xfs",
read_only=read_only,
),
persistent_volume_reclaim_policy="Retain",
storage_class_name="standard",
),
api_version="v1",
kind="PersistentVolume",
metadata=client.V1ObjectMeta(name=pv_name),
spec=client.V1PersistentVolumeSpec(
capacity={"storage": "10000Gi"},
access_modes=["ReadWriteOnce"],
csi=client.V1CSIPersistentVolumeSource(
driver="pd.csi.storage.gke.io",
volume_handle=disk_name,
fs_type="xfs",
read_only=read_only,
),
persistent_volume_reclaim_policy="Delete",
storage_class_name="ssd-data-xfs",
),
)

# Create PersistentVolumeClaim
Expand All @@ -311,7 +329,7 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only)
spec=client.V1PersistentVolumeClaimSpec(
access_modes=["ReadOnlyMany"],
resources=client.V1ResourceRequirements(requests={"storage": "10000Gi"}),
storage_class_name="standard",
storage_class_name="ssd-data-xfs",
volume_name=pv_name,
),
)
Expand Down Expand Up @@ -427,7 +445,7 @@ def create_pvcs_from_snapshot(run_id, snapshot_name, namespace, pvc_num, label):
return res


def create_disk_pv_pvc(
def create_repair_disk_and_its_snapshot(
project, zone, cluster_name, og_snapshot_name, snapshot_name, prefix, namespace
):
tasks = []
Expand Down Expand Up @@ -462,8 +480,6 @@ def create_disk_pv_pvc(
except Exception as e:
logger.error(f"Task generated an exception: {e}")

# start a self deleteing job to mount the xfs disks for repairing


def parse_args():
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -498,15 +514,15 @@ def parse_args():
new_pv_prefix = MAINNET_SNAPSHOT_NAME
# create OG snapshot
og_snapshot_name = f"{snapshot_name}-og"
create_snapshot_from_backup_pods(
"""create_snapshot_from_backup_pods(
og_snapshot_name,
source_project_id,
source_cluster_id,
region,
source_namespace,
project_id,
)
create_disk_pv_pvc(
)"""
create_repair_disk_and_its_snapshot(
project_id,
zone,
cluster_name,
Expand Down
31 changes: 21 additions & 10 deletions testsuite/replay-verify/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

REPLAY_CONCURRENCY_LEVEL = 1


class Network(Enum):
TESTNET = 1
MAINNET = 2
Expand Down Expand Up @@ -241,6 +242,7 @@ def get_pod_status(self):
def get_humio_log_link(self):
return construct_humio_url(self.label, self.name, self.start_time, time.time())


class ReplayConfig:
def __init__(self, network):
if network == Network.TESTNET:
Expand All @@ -253,9 +255,10 @@ def __init__(self, network):
self.concurrent_replayer = 18
self.pvc_number = 8
self.min_range_size = 10_000
self.range_size = 2_000_000
self.range_size = 2_000_000
self.timeout_secs = 400


class TaskStats:
def __init__(self, name):
self.name = name
Expand Down Expand Up @@ -308,7 +311,7 @@ def __init__(
self.image = image
self.pvcs = []
self.config = replay_config

def __str__(self):
return f"""ReplayScheduler:
id: {self.id}
Expand Down Expand Up @@ -360,7 +363,11 @@ def create_pvc_from_snapshot(self):
else MAINNET_SNAPSHOT_NAME
)
pvcs = create_pvcs_from_snapshot(
self.id, snapshot_name, self.namespace, self.config.pvc_number, self.get_label()
self.id,
snapshot_name,
self.namespace,
self.config.pvc_number,
self.get_label(),
)
assert len(pvcs) == self.config.pvc_number, "failed to create all pvcs"
self.pvcs = pvcs
Expand Down Expand Up @@ -504,12 +511,16 @@ def get_image(image_tag=None):
shell = forge.LocalShell()
git = forge.Git(shell)
image_name = "tools"
default_latest_image = forge.find_recent_images(
shell,
git,
1,
image_name=image_name,
)[0] if image_tag is None else image_tag
default_latest_image = (
forge.find_recent_images(
shell,
git,
1,
image_name=image_name,
)[0]
if image_tag is None
else image_tag
)
full_image = f"{forge.GAR_REPO_NAME}/{image_name}:{default_latest_image}"
return full_image

Expand Down Expand Up @@ -546,7 +557,7 @@ def print_logs(failed_workpod_logs, txn_mismatch_logs):
range_size=range_size,
image=image,
replay_config=config,
network= network,
network=network,
namespace=args.namespace,
)
logger.info(f"scheduler: {scheduler}")
Expand Down

0 comments on commit 55b3cd5

Please sign in to comment.