diff --git a/charts/external-metrics/values.yaml b/charts/external-metrics/values.yaml index d16dc3d5cde..3949d8e8d53 100644 --- a/charts/external-metrics/values.yaml +++ b/charts/external-metrics/values.yaml @@ -11,8 +11,9 @@ prometheus: size: 10Gi kube-state-metrics: enabled: true + # TODO (kyuds): remove skypilot-cluster label in v0.11.0; deprecated in favor of skypilot-cluster-name. metricLabelsAllowlist: - - pods=[skypilot-cluster] + - pods=[skypilot-cluster,skypilot-cluster-name] prometheus-node-exporter: enabled: false prometheus-pushgateway: diff --git a/charts/skypilot/values.yaml b/charts/skypilot/values.yaml index 07f0295b409..3341af48b02 100644 --- a/charts/skypilot/values.yaml +++ b/charts/skypilot/values.yaml @@ -555,8 +555,9 @@ prometheus: replacement: $1 kube-state-metrics: enabled: true + # TODO (kyuds): remove skypilot-cluster label in v0.11.0; deprecated in favor of skypilot-cluster-name. metricLabelsAllowlist: - - pods=[skypilot-cluster] + - pods=[skypilot-cluster,skypilot-cluster-name] prometheus-node-exporter: enabled: false prometheus-pushgateway: diff --git a/docs/source/reference/api-server/helm-values-spec.rst b/docs/source/reference/api-server/helm-values-spec.rst index 9c674c33f6c..121085c26f1 100644 --- a/docs/source/reference/api-server/helm-values-spec.rst +++ b/docs/source/reference/api-server/helm-values-spec.rst @@ -2140,7 +2140,7 @@ SkyPilot provides a minimal Prometheus configuration by default. If you want to kube-state-metrics: enabled: true metricLabelsAllowlist: - - pods=[skypilot-cluster] + - pods=[skypilot-cluster-name] prometheus-node-exporter: enabled: false prometheus-pushgateway: diff --git a/sky/provision/kubernetes/constants.py b/sky/provision/kubernetes/constants.py index fabb835bf7e..fe1687449dc 100644 --- a/sky/provision/kubernetes/constants.py +++ b/sky/provision/kubernetes/constants.py @@ -18,7 +18,6 @@ # Labels for the Pods created by SkyPilot TAG_RAY_CLUSTER_NAME = 'ray-cluster-name' -TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name' TAG_POD_INITIALIZED = 'skypilot-initialized' TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name' diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index f6072b78baf..dc0a864a08f 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -82,7 +82,7 @@ def is_high_availability_cluster_by_kubectl( context).list_namespaced_deployment( namespace, label_selector= - f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}') + f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}') except kubernetes.api_exception(): return False # It is a high availability cluster if there is at least one deployment @@ -426,11 +426,11 @@ def _evaluate_timeout() -> bool: # Get all pods in a single API call using the cluster name label # which all pods in new_nodes should share cluster_name_on_cloud = new_nodes[0].metadata.labels[ - k8s_constants.TAG_SKYPILOT_CLUSTER_NAME] + constants.TAG_SKYPILOT_CLUSTER_NAME] pods = kubernetes.core_api(context).list_namespaced_pod( namespace, label_selector= - f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}' + f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}' ).items # Get the set of found pod names and check if we have all expected pods @@ -531,11 +531,11 @@ def _check_init_containers(pod): while True: # Get all pods in a single API call cluster_name_on_cloud = new_pods[0].metadata.labels[ - k8s_constants.TAG_SKYPILOT_CLUSTER_NAME] + constants.TAG_SKYPILOT_CLUSTER_NAME] all_pods = kubernetes.core_api(context).list_namespaced_pod( namespace, label_selector= - f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}' + f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}' ).items # Get the set of found pod names and check if we have all expected pods @@ -912,7 +912,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str, else: pod_spec['metadata']['labels'] = tags pod_spec['metadata']['labels'].update( - {k8s_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud}) + {constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud}) terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags, ['Terminating']) @@ -1052,7 +1052,7 @@ def _create_resource_thread(i: int): 'podAffinityTerm': { 'labelSelector': { 'matchExpressions': [{ - 'key': k8s_constants.TAG_SKYPILOT_CLUSTER_NAME, + 'key': constants.TAG_SKYPILOT_CLUSTER_NAME, 'operator': 'In', 'values': [cluster_name_on_cloud] }] @@ -1692,16 +1692,18 @@ def query_instances( is_ssh = context.startswith('ssh-') if context else False identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster' - # Get all the pods with the label skypilot-cluster: + # Get all the pods with the label skypilot-cluster-name: try: logger.debug( f'Querying k8s api for pods in context: {context} and ' f'namespace: {namespace} with ' - f'`skypilot-cluster={cluster_name_on_cloud}` label selector.') + f'`skypilot-cluster-name={cluster_name_on_cloud}` label selector.') + label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}=' + f'{cluster_name_on_cloud}') response = kubernetes.core_api(context).list_namespaced_pod( namespace, - label_selector=f'skypilot-cluster={cluster_name_on_cloud}', + label_selector=label_selector, _request_timeout=kubernetes.API_TIMEOUT) pods = response.items diff --git a/sky/provision/kubernetes/network.py b/sky/provision/kubernetes/network.py index a69d5ee1bb8..f03f40f3f98 100644 --- a/sky/provision/kubernetes/network.py +++ b/sky/provision/kubernetes/network.py @@ -4,6 +4,7 @@ from sky import sky_logging from sky.adaptors import kubernetes from sky.provision import common +from sky.provision import constants as provision_constants from sky.provision.kubernetes import network_utils from sky.provision.kubernetes import utils as kubernetes_utils from sky.utils import kubernetes_enums @@ -55,7 +56,7 @@ def _open_ports_using_loadbalancer( context=context, service_name=service_name, ports=ports, - selector_key='skypilot-cluster', + selector_key=provision_constants.TAG_SKYPILOT_CLUSTER_NAME, selector_value=cluster_name_on_cloud, ) @@ -109,7 +110,7 @@ def _open_ports_using_ingress( context=context, service_details=service_details, ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress', - selector_key='skypilot-cluster', + selector_key=provision_constants.TAG_SKYPILOT_CLUSTER_NAME, selector_value=cluster_name_on_cloud, ) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index a44db38a072..d7889909d1c 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -3301,13 +3301,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]: try: pods = kubernetes.core_api(context).list_pod_for_all_namespaces( - label_selector='skypilot-cluster', + label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME, _request_timeout=kubernetes.API_TIMEOUT).items except kubernetes.max_retry_error(): raise exceptions.ResourcesUnavailableError( 'Timed out trying to get SkyPilot pods from Kubernetes cluster. ' 'Please check if the cluster is healthy and retry. To debug, run: ' - 'kubectl get pods --selector=skypilot-cluster --all-namespaces' + 'kubectl get pods --selector=skypilot-cluster-name --all-namespaces' ) from None return pods @@ -3444,7 +3444,8 @@ def process_skypilot_pods( serve_controllers: List[KubernetesSkyPilotClusterInfo] = [] for pod in pods: - cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster') + cluster_name_on_cloud = pod.metadata.labels.get( + provision_constants.TAG_SKYPILOT_CLUSTER_NAME) cluster_name = cluster_name_on_cloud.rsplit( '-', 1 )[0] # Remove the user hash to get cluster name (e.g., mycluster-2ea4) diff --git a/sky/provision/kubernetes/volume.py b/sky/provision/kubernetes/volume.py index be449b1bd3e..12a18754d23 100644 --- a/sky/provision/kubernetes/volume.py +++ b/sky/provision/kubernetes/volume.py @@ -5,6 +5,7 @@ from sky import models from sky import sky_logging from sky.adaptors import kubernetes +from sky.provision import constants from sky.provision.kubernetes import config as config_lib from sky.provision.kubernetes import constants as k8s_constants from sky.provision.kubernetes import utils as kubernetes_utils @@ -128,7 +129,7 @@ def _get_volume_usedby( usedby_pods.append(pod.metadata.name) # Get the real cluster name cluster_name_on_cloud = pod.metadata.labels.get( - k8s_constants.TAG_SKYPILOT_CLUSTER_NAME) + constants.TAG_SKYPILOT_CLUSTER_NAME) if cluster_name_on_cloud is None: continue cluster_name = cloud_to_name_map.get(cluster_name_on_cloud) @@ -205,7 +206,7 @@ def get_all_volumes_usedby( used_by_pods[context][namespace][volume_name].append( pod.metadata.name) cluster_name_on_cloud = pod.metadata.labels.get( - k8s_constants.TAG_SKYPILOT_CLUSTER_NAME) + constants.TAG_SKYPILOT_CLUSTER_NAME) if cluster_name_on_cloud is None: continue cluster_name = cloud_to_name_map.get(cluster_name_on_cloud) diff --git a/sky/templates/kubernetes-ray.yml.j2 b/sky/templates/kubernetes-ray.yml.j2 index 95663983e20..16d2047e15a 100644 --- a/sky/templates/kubernetes-ray.yml.j2 +++ b/sky/templates/kubernetes-ray.yml.j2 @@ -209,7 +209,9 @@ provider: metadata: labels: parent: skypilot + # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name. skypilot-cluster: {{cluster_name_on_cloud}} + skypilot-cluster-name: {{cluster_name_on_cloud}} skypilot-user: {{ user }} name: {{cluster_name_on_cloud}}-head-ssh spec: @@ -227,7 +229,9 @@ provider: metadata: labels: parent: skypilot + # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name. skypilot-cluster: {{cluster_name_on_cloud}} + skypilot-cluster-name: {{cluster_name_on_cloud}} skypilot-user: {{ user }} # NOTE: If you're running multiple Ray clusters with services # on one Kubernetes cluster, they must have unique service @@ -247,7 +251,9 @@ provider: metadata: labels: parent: skypilot + # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name. skypilot-cluster: {{cluster_name_on_cloud}} + skypilot-cluster-name: {{cluster_name_on_cloud}} skypilot-user: {{ user }} name: {{cluster_name_on_cloud}}-worker{{ worker_id }} spec: @@ -272,6 +278,7 @@ available_node_types: labels: parent: skypilot # component will be set for the head node pod to be the same as the head node service selector above if a + # TODO (kyuds): remove this label for v0.11.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name. skypilot-cluster: {{cluster_name_on_cloud}} skypilot-user: {{ user }} # Custom tags for the pods diff --git a/tests/smoke_tests/backward_compat/test_backward_compat.py b/tests/smoke_tests/backward_compat/test_backward_compat.py index 5c2fde2e5d1..570a000c081 100644 --- a/tests/smoke_tests/backward_compat/test_backward_compat.py +++ b/tests/smoke_tests/backward_compat/test_backward_compat.py @@ -379,7 +379,7 @@ def wait_for_status(job_name: str, return smoke_tests_utils.get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=job_name, job_status=status, - timeout=600 if generic_cloud == 'kubernetes' else 300) + timeout=1200 if generic_cloud == 'kubernetes' else 300) blocking_seconds_for_cancel_job = 2000 if generic_cloud == 'kubernetes' else 1000