9 changes: 9 additions & 0 deletions Makefile
@@ -302,6 +302,15 @@ install-prometheus:
uninstall-prometheus:
	kubectl delete -k config/prometheus

.PHONY: install-keda
install-keda:
	helm repo add kedacore https://kedacore.github.io/charts
	helm install keda kedacore/keda --namespace keda --create-namespace

.PHONY: uninstall-keda
uninstall-keda:
	helm uninstall keda -n keda

##@Release

.PHONY: artifacts
59 changes: 59 additions & 0 deletions docs/examples/serverless/README.md
@@ -0,0 +1,59 @@
# Serverless Examples

This directory contains example configurations for setting up serverless deployments with llmaz using KEDA for event-driven autoscaling.

> For detailed documentation on serverless concepts, architecture, and configuration, please refer to the [Serverless Features Documentation](../../../site/content/en/docs/features/serverless.md).

## Files

- **basic.yaml**: Example configuration showing a complete serverless setup including:
  - OpenModel definition for Qwen2-0.5B
  - Playground deployment with zero initial replicas
  - Gateway and AIGatewayRoute configuration
  - AIServiceBackend setup

- **service-monitor.yaml**: Prometheus ServiceMonitor for cross-namespace metric collection

- **scaled-object.yaml**: KEDA ScaledObject configuration for scaling based on Prometheus metrics

## Quick Start

1. Install prerequisites (llmaz, Prometheus, and KEDA):

```bash
helm install llmaz oci://registry-1.docker.io/inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.10
make install-prometheus
make install-keda
```
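Before continuing, it can help to confirm that the llmaz, Prometheus, and KEDA pods are all running. The namespaces below match the install commands above; adjust them if you customized the installation:

```bash
kubectl get pods -n llmaz-system
kubectl get pods -n keda
```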

2. Deploy the example configuration:

```bash
kubectl apply -f basic.yaml
```

3. Create ServiceMonitor and ScaledObject:

```bash
kubectl apply -f service-monitor.yaml
kubectl apply -f scaled-object.yaml
```
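To verify that KEDA picked up the ScaledObject, list it along with the HPA KEDA manages on its behalf (typically named `keda-hpa-<scaledobject-name>`):

```bash
kubectl get scaledobject qwen2-0--5b-scaler -n default
kubectl get hpa -n default
```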

4. Test cold start by sending a request:

```bash
kubectl exec -it -n kube-system deploy/activator -- wget -O- qwen2-0--5b-lb.default.svc:8080
```
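Since `minReplicaCount` is 0, this first request should trigger a scale-up from zero. You can watch the backend pod come up in a second terminal (assuming the Playground and its pods live in the `default` namespace, as in `basic.yaml`):

```bash
kubectl get pods -n default -w
```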

5. Monitor metrics and scaling activity:

```bash
kubectl port-forward services/prometheus-operated 9090:9090 --address 0.0.0.0 -n llmaz-system
```
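With the port-forward in place, you can also query the trigger metric directly through the Prometheus HTTP API; this is the same query the ScaledObject evaluates:

```bash
curl -s 'http://localhost:9090/api/v1/query?query=sum(llamacpp:requests_processing)'
```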

## Configuration Notes

- The example uses `minReplicaCount: 0` to enable scale-to-zero
- Scaling is triggered based on the `llamacpp:requests_processing` metric
- The activator component intercepts requests when replicas are at zero (the gateway request sketch below exercises this path)
- Adjust `pollingInterval` and `cooldownPeriod` in the ScaledObject to optimize scaling behavior
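Once the Playground is running, you can also send a request through the Envoy AI Gateway instead of hitting the `qwen2-0--5b-lb` Service directly. The sketch below is an assumption based on a default Envoy Gateway installation, which creates a Service with a generated name for each Gateway in the `envoy-gateway-system` namespace; look the name up and substitute it:

```bash
# Find the Service the Envoy Gateway controller created for the Gateway (name is generated).
kubectl get svc -n envoy-gateway-system

# Forward the HTTP listener locally; replace <gateway-svc> with the name found above.
kubectl port-forward -n envoy-gateway-system svc/<gateway-svc> 8080:80

# Send an OpenAI-style chat completion. With the OpenAI schema, the gateway should derive
# the x-ai-eg-model header (matched by the route in basic.yaml) from the "model" field.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen2-0--5b", "messages": [{"role": "user", "content": "Say hello"}]}'
```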
76 changes: 76 additions & 0 deletions docs/examples/serverless/basic.yaml
@@ -0,0 +1,76 @@
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: qwen2-0--5b
spec:
  familyName: qwen2
  source:
    modelHub:
      modelID: Qwen/Qwen2-0.5B-Instruct-GGUF
      filename: qwen2-0_5b-instruct-q5_k_m.gguf
---
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: qwen2-0--5b
spec:
  replicas: 0
  modelClaim:
    modelName: qwen2-0--5b
  backendRuntimeConfig:
    backendName: llamacpp
    configName: default
    args:
      - -fa # use flash attention
---
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
  name: default-envoy-ai-gateway
spec:
  controllerName: gateway.envoyproxy.io/gatewayclass-controller
---
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: default-envoy-ai-gateway
spec:
  gatewayClassName: default-envoy-ai-gateway
  listeners:
    - name: http
      protocol: HTTP
      port: 80
---
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIGatewayRoute
metadata:
  name: default-envoy-ai-gateway
spec:
  schema:
    name: OpenAI
  targetRefs:
    - name: default-envoy-ai-gateway
      kind: Gateway
      group: gateway.networking.k8s.io
  rules:
    - matches:
        - headers:
            - type: Exact
              name: x-ai-eg-model
              value: qwen2-0--5b
      backendRefs:
        - name: qwen2-0--5b
---
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIServiceBackend
metadata:
  name: qwen2-0--5b
spec:
  timeouts:
    request: 3m
  schema:
    name: OpenAI
  backendRef:
    name: qwen2-0--5b-lb
    kind: Service
    port: 8080
21 changes: 21 additions & 0 deletions docs/examples/serverless/scaled-object.yaml
@@ -0,0 +1,21 @@
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: qwen2-0--5b-scaler
  namespace: default
spec:
  scaleTargetRef:
    apiVersion: inference.llmaz.io/v1alpha1
    kind: Playground
    name: qwen2-0--5b
  pollingInterval: 30
  cooldownPeriod: 50
  minReplicaCount: 0
  maxReplicaCount: 3
  triggers:
    - type: prometheus
      metadata:
        serverAddress: http://prometheus-operated.llmaz-system.svc.cluster.local:9090
        metricName: llamacpp:requests_processing
        query: sum(llamacpp:requests_processing)
        threshold: "0.2"
18 changes: 18 additions & 0 deletions docs/examples/serverless/service-monitor.yaml
@@ -0,0 +1,18 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: qwen2-0--5b-lb-monitor
  namespace: llmaz-system
  labels:
    control-plane: controller-manager
    app.kubernetes.io/name: servicemonitor
spec:
  namespaceSelector:
    any: true
  selector:
    matchLabels:
      llmaz.io/model-name: qwen2-0--5b
  endpoints:
    - port: http
      path: /metrics
      scheme: http
5 changes: 3 additions & 2 deletions pkg/controller/inference/service_controller.go
@@ -131,7 +131,7 @@ func (r *ServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
	}

	// Create a service for the leader pods of the lws for loadbalancing.
	if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service); err != nil {
	if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service, models); err != nil {
		return ctrl.Result{}, err
	}

@@ -419,7 +419,7 @@ func setControllerReferenceForWorkload(owner metav1.Object, lws *applyconfigurat
	return nil
}

func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service) error {
func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service, model []*coreapi.OpenModel) error {
	log := ctrl.LoggerFrom(ctx)
	// The load balancing service name.
	svcName := service.Name + "-lb"
@@ -433,6 +433,7 @@ func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Sche
		ObjectMeta: metav1.ObjectMeta{
			Name:      svcName,
			Namespace: service.Namespace,
			Labels:    modelLabels(model[0]),
		},
		Spec: corev1.ServiceSpec{
			Ports: []corev1.ServicePort{