Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions sky/utils/kubernetes/gpu_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,17 @@ def wait_for_jobs_completion(jobs_to_node_names: Dict[str, str],
batch_v1 = kubernetes.batch_api(context=context)
w = kubernetes.watch()
completed_jobs = []
# Use resource_version="0" to let the watch start from any resource version
# the serving replica already has (typically its cache), instead of
# requiring a specific version.
# In multi-replica API server environments, replicas may be at different
# resource versions due to replication lag. Without specifying this, the
# watch may get version X from one replica but connect to another replica
# that only has up to version Y < X, causing "Too large resource version"
# errors. Using "0" ensures all replicas can serve the request from
# whatever version they currently have, avoiding version mismatches.
for event in w.stream(func=batch_v1.list_namespaced_job,
namespace=namespace,
timeout_seconds=timeout):
timeout_seconds=timeout,
resource_version='0'):
job = event['object']
job_name = job.metadata.name
if job_name in jobs_to_node_names:
Expand All @@ -212,7 +220,7 @@ def wait_for_jobs_completion(jobs_to_node_names: Dict[str, str],
_format_string(
f'Timed out after waiting {timeout} seconds '
'for job to complete', colorama.Style.DIM))
return False #Timed out
return False # Timed out


def main():
Expand Down
Loading