From 208695372c656305ee8839b927fdb9fe362c6b3c Mon Sep 17 00:00:00 2001 From: Rohan Sonecha Date: Fri, 31 Oct 2025 16:04:59 -0700 Subject: [PATCH 1/6] add more debug logs around k8s list_namespaced_pod --- sky/provision/kubernetes/instance.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index dc0a864a08f..63332a6242b 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1697,13 +1697,11 @@ def query_instances( logger.debug( f'Querying k8s api for pods in context: {context} and ' f'namespace: {namespace} with ' - f'`skypilot-cluster-name={cluster_name_on_cloud}` label selector.') + f'`skypilot-cluster={cluster_name_on_cloud}` label selector.') - label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}=' - f'{cluster_name_on_cloud}') response = kubernetes.core_api(context).list_namespaced_pod( namespace, - label_selector=label_selector, + label_selector=f'skypilot-cluster={cluster_name_on_cloud}', _request_timeout=kubernetes.API_TIMEOUT) pods = response.items From 1b57b5ada63f51001bae8104ac4fd9dc2be4a647 Mon Sep 17 00:00:00 2001 From: Rohan Sonecha Date: Tue, 4 Nov 2025 10:43:15 -0800 Subject: [PATCH 2/6] more info --- sky/provision/kubernetes/instance.py | 29 +++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 63332a6242b..54f5da1d52f 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1694,6 +1694,7 @@ def query_instances( # Get all the pods with the label skypilot-cluster-name: try: + # log the query parameters we pass to the k8s api logger.debug( f'Querying k8s api for pods in context: {context} and ' f'namespace: {namespace} with ' @@ -1703,15 +1704,29 @@ def query_instances( namespace, label_selector=f'skypilot-cluster={cluster_name_on_cloud}', _request_timeout=kubernetes.API_TIMEOUT) + + # log PodList response info + logger.debug(f'k8s api response for skypilot-cluster=' + f'{cluster_name_on_cloud}: ' + f'apiVersion={response.api_version}, ' + f'kind={response.kind}, ' + f'metadata={response.metadata}') + pods = response.items - # Log response metadata - # pylint: disable=protected-access - logger.debug( - f'Query response for skypilot cluster {cluster_name_on_cloud}: ' - f'resource_version={response.metadata.resource_version}, ' - f'pod_count={len(pods)}, ' - f'continue_token={response.metadata._continue}') + logger.debug(f'k8s api response for skypilot-cluster=' + f'{cluster_name_on_cloud}: ' + f'len(pods)={len(pods)}') + + # log detailed Pod info + for pod in pods: + logger.debug( + f'k8s pod info for skypilot cluster={cluster_name_on_cloud}: ' + f'pod.apiVersion={pod.api_version}, ' + f'pod.kind={pod.kind}, ' + f'pod.metadata={pod.metadata}, ' + f'pod.status={pod.status}') + except kubernetes.max_retry_error(): with ux_utils.print_exception_no_traceback(): if is_ssh: From c199e399a5dab69da8281d1db2c44a8990386481 Mon Sep 17 00:00:00 2001 From: Rohan Sonecha Date: Tue, 4 Nov 2025 14:28:53 -0800 Subject: [PATCH 3/6] reduce overhead --- sky/provision/kubernetes/instance.py | 33 ++++++++++++++++++---------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 54f5da1d52f..1fecb7b6d26 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1706,11 +1706,12 @@ def query_instances( _request_timeout=kubernetes.API_TIMEOUT) # log PodList response info - logger.debug(f'k8s api response for skypilot-cluster=' - f'{cluster_name_on_cloud}: ' - f'apiVersion={response.api_version}, ' - f'kind={response.kind}, ' - f'metadata={response.metadata}') + if sky_logging.logging_enabled(logger, sky_logging.DEBUG): + logger.debug(f'k8s api response for skypilot-cluster=' + f'{cluster_name_on_cloud}: ' + f'apiVersion={response.api_version}, ' + f'kind={response.kind}, ' + f'metadata={response.metadata}') pods = response.items @@ -1719,13 +1720,21 @@ def query_instances( f'len(pods)={len(pods)}') # log detailed Pod info - for pod in pods: - logger.debug( - f'k8s pod info for skypilot cluster={cluster_name_on_cloud}: ' - f'pod.apiVersion={pod.api_version}, ' - f'pod.kind={pod.kind}, ' - f'pod.metadata={pod.metadata}, ' - f'pod.status={pod.status}') + if sky_logging.logging_enabled(logger, sky_logging.DEBUG): + for pod in pods: + logger.debug(f'k8s pod info for ' + f'skypilot cluster={cluster_name_on_cloud}: ' + f'pod.apiVersion={pod.api_version}, ' + f'pod.kind={pod.kind}, \n' + f'pod.name={pod.metadata.name}, ' + f'pod.namespace={pod.metadata.namespace}, \n' + f'pod.labels={pod.metadata.labels}, \n' + f'pod.annotations={pod.metadata.annotations}, \n' + 'pod.creationTimestamp=' + f'{pod.metadata.creation_timestamp}, ' + 'pod.deletionTimestamp=' + f'{pod.metadata.deletion_timestamp}, \n' + f'pod.status={pod.status}') except kubernetes.max_retry_error(): with ux_utils.print_exception_no_traceback(): From 15ae163ec543b349832e2be8749569e64db3f272 Mon Sep 17 00:00:00 2001 From: Rohan Sonecha Date: Tue, 4 Nov 2025 14:51:12 -0800 Subject: [PATCH 4/6] label selector --- sky/provision/kubernetes/instance.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 1fecb7b6d26..9d855694040 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1691,39 +1691,37 @@ def query_instances( context = kubernetes_utils.get_context_from_config(provider_config) is_ssh = context.startswith('ssh-') if context else False identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster' + label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}=' + f'{cluster_name_on_cloud}') # Get all the pods with the label skypilot-cluster-name: try: # log the query parameters we pass to the k8s api - logger.debug( - f'Querying k8s api for pods in context: {context} and ' - f'namespace: {namespace} with ' - f'`skypilot-cluster={cluster_name_on_cloud}` label selector.') + logger.debug(f'Querying k8s api for pods in context: {context} and ' + f'namespace: {namespace} with ' + f'label selector:`{label_selector}`.') response = kubernetes.core_api(context).list_namespaced_pod( namespace, - label_selector=f'skypilot-cluster={cluster_name_on_cloud}', + label_selector=label_selector, _request_timeout=kubernetes.API_TIMEOUT) # log PodList response info if sky_logging.logging_enabled(logger, sky_logging.DEBUG): - logger.debug(f'k8s api response for skypilot-cluster=' - f'{cluster_name_on_cloud}: ' + logger.debug(f'k8s api response for `{label_selector}`: ' f'apiVersion={response.api_version}, ' f'kind={response.kind}, ' f'metadata={response.metadata}') pods = response.items - logger.debug(f'k8s api response for skypilot-cluster=' - f'{cluster_name_on_cloud}: ' + logger.debug(f'k8s api response for `{label_selector}`: ' f'len(pods)={len(pods)}') # log detailed Pod info if sky_logging.logging_enabled(logger, sky_logging.DEBUG): for pod in pods: - logger.debug(f'k8s pod info for ' - f'skypilot cluster={cluster_name_on_cloud}: ' + logger.debug(f'k8s pod info for `{label_selector}`: ' f'pod.apiVersion={pod.api_version}, ' f'pod.kind={pod.kind}, \n' f'pod.name={pod.metadata.name}, ' From 04453b27dbd8f274141b38f478b7da7e02f7daf3 Mon Sep 17 00:00:00 2001 From: Rohan Sonecha Date: Tue, 4 Nov 2025 14:59:49 -0800 Subject: [PATCH 5/6] clean --- sky/provision/kubernetes/instance.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 9d855694040..a5706135f8f 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1697,8 +1697,9 @@ def query_instances( # Get all the pods with the label skypilot-cluster-name: try: # log the query parameters we pass to the k8s api - logger.debug(f'Querying k8s api for pods in context: {context} and ' - f'namespace: {namespace} with ' + logger.debug(f'Querying k8s api for pods:\n' + f'context: {context}\n' + f'namespace: {namespace}\n' f'label selector:`{label_selector}`.') response = kubernetes.core_api(context).list_namespaced_pod( @@ -1708,9 +1709,9 @@ def query_instances( # log PodList response info if sky_logging.logging_enabled(logger, sky_logging.DEBUG): - logger.debug(f'k8s api response for `{label_selector}`: ' + logger.debug(f'k8s api response for `{label_selector}`:\n' f'apiVersion={response.api_version}, ' - f'kind={response.kind}, ' + f'kind={response.kind},\n' f'metadata={response.metadata}') pods = response.items From b8c777287827ce2405b1e2900ed5459075fcd5ea Mon Sep 17 00:00:00 2001 From: Rohan Sonecha Date: Tue, 4 Nov 2025 15:06:36 -0800 Subject: [PATCH 6/6] only debug --- sky/provision/kubernetes/instance.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index a5706135f8f..ec34f6e3a0e 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -1716,11 +1716,10 @@ def query_instances( pods = response.items - logger.debug(f'k8s api response for `{label_selector}`: ' - f'len(pods)={len(pods)}') - # log detailed Pod info if sky_logging.logging_enabled(logger, sky_logging.DEBUG): + logger.debug(f'k8s api response for `{label_selector}`: ' + f'len(pods)={len(pods)}') for pod in pods: logger.debug(f'k8s pod info for `{label_selector}`: ' f'pod.apiVersion={pod.api_version}, '