Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion charts/external-metrics/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ prometheus:
size: 10Gi
kube-state-metrics:
enabled: true
# TODO(kyuds): Remove the skypilot-cluster label in v0.11.0; it is deprecated in favor of skypilot-cluster-name.
metricLabelsAllowlist:
- pods=[skypilot-cluster]
- pods=[skypilot-cluster,skypilot-cluster-name]
prometheus-node-exporter:
enabled: false
prometheus-pushgateway:
Expand Down
3 changes: 2 additions & 1 deletion charts/skypilot/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -555,8 +555,9 @@ prometheus:
replacement: $1
kube-state-metrics:
enabled: true
# TODO(kyuds): Remove the skypilot-cluster label in v0.11.0; it is deprecated in favor of skypilot-cluster-name.
metricLabelsAllowlist:
- pods=[skypilot-cluster]
- pods=[skypilot-cluster,skypilot-cluster-name]
prometheus-node-exporter:
enabled: false
prometheus-pushgateway:
Expand Down
2 changes: 1 addition & 1 deletion docs/source/reference/api-server/helm-values-spec.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2140,7 +2140,7 @@ SkyPilot provides a minimal Prometheus configuration by default. If you want to
kube-state-metrics:
enabled: true
metricLabelsAllowlist:
- pods=[skypilot-cluster]
- pods=[skypilot-cluster-name]
prometheus-node-exporter:
enabled: false
prometheus-pushgateway:
Expand Down
6 changes: 4 additions & 2 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1692,11 +1692,13 @@ def query_instances(
is_ssh = context.startswith('ssh-') if context else False
identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'

# Get all the pods with the label skypilot-cluster: <cluster_name>
# Get all the pods with the label skypilot-cluster-name: <cluster_name>
try:
label_selector = (f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}='
f'{cluster_name_on_cloud}')
pods = kubernetes.core_api(context).list_namespaced_pod(
namespace,
label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
label_selector=label_selector,
_request_timeout=kubernetes.API_TIMEOUT).items
except kubernetes.max_retry_error():
with ux_utils.print_exception_no_traceback():
Expand Down
5 changes: 3 additions & 2 deletions sky/provision/kubernetes/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from sky import sky_logging
from sky.adaptors import kubernetes
from sky.provision import common
from sky.provision.kubernetes import constants
from sky.provision.kubernetes import network_utils
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import kubernetes_enums
Expand Down Expand Up @@ -55,7 +56,7 @@ def _open_ports_using_loadbalancer(
context=context,
service_name=service_name,
ports=ports,
selector_key='skypilot-cluster',
selector_key=constants.TAG_SKYPILOT_CLUSTER_NAME,
selector_value=cluster_name_on_cloud,
)

Expand Down Expand Up @@ -109,7 +110,7 @@ def _open_ports_using_ingress(
context=context,
service_details=service_details,
ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
selector_key='skypilot-cluster',
selector_key=constants.TAG_SKYPILOT_CLUSTER_NAME,
selector_value=cluster_name_on_cloud,
)

Expand Down
7 changes: 4 additions & 3 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3301,13 +3301,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:

try:
pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
label_selector='skypilot-cluster',
label_selector=kubernetes_constants.TAG_SKYPILOT_CLUSTER_NAME,
_request_timeout=kubernetes.API_TIMEOUT).items
except kubernetes.max_retry_error():
raise exceptions.ResourcesUnavailableError(
'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
'Please check if the cluster is healthy and retry. To debug, run: '
'kubectl get pods --selector=skypilot-cluster --all-namespaces'
'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
) from None
return pods

Expand Down Expand Up @@ -3444,7 +3444,8 @@ def process_skypilot_pods(
serve_controllers: List[KubernetesSkyPilotClusterInfo] = []

for pod in pods:
cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
cluster_name_on_cloud = pod.metadata.labels.get(
kubernetes_constants.TAG_SKYPILOT_CLUSTER_NAME)
cluster_name = cluster_name_on_cloud.rsplit(
'-', 1
)[0] # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
Expand Down
7 changes: 7 additions & 0 deletions sky/templates/kubernetes-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,9 @@ provider:
metadata:
labels:
parent: skypilot
# TODO(kyuds): Remove the skypilot-cluster label below in v0.11.0; it is deprecated in favor of skypilot-cluster-name.
skypilot-cluster: {{cluster_name_on_cloud}}
skypilot-cluster-name: {{cluster_name_on_cloud}}
skypilot-user: {{ user }}
name: {{cluster_name_on_cloud}}-head-ssh
spec:
Expand All @@ -227,7 +229,9 @@ provider:
metadata:
labels:
parent: skypilot
# TODO(kyuds): Remove the skypilot-cluster label below in v0.11.0; it is deprecated in favor of skypilot-cluster-name.
skypilot-cluster: {{cluster_name_on_cloud}}
skypilot-cluster-name: {{cluster_name_on_cloud}}
skypilot-user: {{ user }}
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
Expand All @@ -247,7 +251,9 @@ provider:
metadata:
labels:
parent: skypilot
# TODO(kyuds): Remove the skypilot-cluster label below in v0.11.0; it is deprecated in favor of skypilot-cluster-name.
skypilot-cluster: {{cluster_name_on_cloud}}
skypilot-cluster-name: {{cluster_name_on_cloud}}
skypilot-user: {{ user }}
name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
spec:
Expand All @@ -272,6 +278,7 @@ available_node_types:
labels:
parent: skypilot
# component will be set for the head node pod to be the same as the head node service selector above if a
# TODO(kyuds): Remove the skypilot-cluster label below in v0.11.0; it is deprecated in favor of skypilot-cluster-name.
skypilot-cluster: {{cluster_name_on_cloud}}
skypilot-user: {{ user }}
# Custom tags for the pods
Expand Down
Loading