9 changes: 9 additions & 0 deletions Makefile
@@ -302,6 +302,15 @@ install-prometheus:
uninstall-prometheus:
	kubectl delete -k config/prometheus

.PHONY: install-keda
install-keda:
	helm repo add kedacore https://kedacore.github.io/charts
	helm install keda kedacore/keda --namespace keda --create-namespace

.PHONY: uninstall-keda
uninstall-keda:
	helm uninstall keda -n keda

##@Release

.PHONY: artifacts
59 changes: 59 additions & 0 deletions docs/examples/serverless/README.md
@@ -0,0 +1,59 @@
# Serverless Examples

This directory contains example configurations for setting up serverless deployments with llmaz using KEDA for event-driven autoscaling.

> For detailed documentation on serverless concepts, architecture, and configuration, please refer to the [Serverless Features Documentation](../../../site/content/en/docs/features/serverless.md).

## Files

- **basic.yaml**: Example configuration showing a complete serverless setup including:
  - OpenModel definition for Qwen2-0.5B
  - Playground deployment with zero initial replicas
  - Gateway and AIGatewayRoute configuration
  - AIServiceBackend setup

- **service-monitor.yaml**: Prometheus ServiceMonitor for cross-namespace metric collection

- **scaled-object.yaml**: KEDA ScaledObject configuration for scaling based on Prometheus metrics

## Quick Start

1. Install prerequisites (llmaz, Prometheus, and KEDA):

```bash
helm install llmaz oci://registry-1.docker.io/inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.10
make install-prometheus
make install-keda
```
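Before continuing, it can help to confirm that the llmaz, Prometheus, and KEDA pods are all running. The namespaces below match the install commands above; adjust them if you customized the installation:

```bash
kubectl get pods -n llmaz-system
kubectl get pods -n keda
```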

2. Deploy the example configuration:

```bash
kubectl apply -f basic.yaml
```

3. Create ServiceMonitor and ScaledObject:

```bash
kubectl apply -f service-monitor.yaml
kubectl apply -f scaled-object.yaml
```
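To verify that KEDA picked up the ScaledObject, list it along with the HPA KEDA manages on its behalf (typically named `keda-hpa-<scaledobject-name>`):

```bash
kubectl get scaledobject qwen2-0--5b-scaler -n default
kubectl get hpa -n default
```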

4. Test cold start by sending a request:

```bash
kubectl exec -it -n kube-system deploy/activator -- wget -O- qwen2-0--5b-lb.default.svc:8080
```
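Since `minReplicaCount` is 0, this first request should trigger a scale-up from zero. You can watch the backend pod come up in a second terminal (assuming the Playground and its pods live in the `default` namespace, as in `basic.yaml`):

```bash
kubectl get pods -n default -w
```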

5. Monitor metrics and scaling activity:

```bash
kubectl port-forward services/prometheus-operated 9090:9090 --address 0.0.0.0 -n llmaz-system
```
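With the port-forward in place, you can also query the trigger metric directly through the Prometheus HTTP API; this is the same query the ScaledObject evaluates:

```bash
curl -s 'http://localhost:9090/api/v1/query?query=sum(llamacpp:requests_processing)'
```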

## Configuration Notes

- The example uses `minReplicaCount: 0` to enable scale-to-zero
- Scaling is triggered based on the `llamacpp:requests_processing` metric
- The activator component intercepts requests when replicas are at zero (the gateway request sketch below exercises this path)
- Adjust `pollingInterval` and `cooldownPeriod` in the ScaledObject to optimize scaling behavior
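Once the Playground is running, you can also send a request through the Envoy AI Gateway instead of hitting the `qwen2-0--5b-lb` Service directly. The sketch below is an assumption based on a default Envoy Gateway installation, which creates a Service with a generated name for each Gateway in the `envoy-gateway-system` namespace; look the name up and substitute it:

```bash
# Find the Service the Envoy Gateway controller created for the Gateway (name is generated).
kubectl get svc -n envoy-gateway-system

# Forward the HTTP listener locally; replace <gateway-svc> with the name found above.
kubectl port-forward -n envoy-gateway-system svc/<gateway-svc> 8080:80

# Send an OpenAI-style chat completion. With the OpenAI schema, the gateway should derive
# the x-ai-eg-model header (matched by the route in basic.yaml) from the "model" field.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen2-0--5b", "messages": [{"role": "user", "content": "Say hello"}]}'
```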
76 changes: 76 additions & 0 deletions docs/examples/serverless/basic.yaml
@@ -0,0 +1,76 @@
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
  name: qwen2-0--5b
spec:
  familyName: qwen2
  source:
    modelHub:
      modelID: Qwen/Qwen2-0.5B-Instruct-GGUF
      filename: qwen2-0_5b-instruct-q5_k_m.gguf
---
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: qwen2-0--5b
spec:
  replicas: 0
  modelClaim:
    modelName: qwen2-0--5b
  backendRuntimeConfig:
    backendName: llamacpp
    configName: default
    args:
      - -fa # use flash attention
---
apiVersion: gateway.networking.k8s.io/v1
kind: GatewayClass
metadata:
  name: default-envoy-ai-gateway
spec:
  controllerName: gateway.envoyproxy.io/gatewayclass-controller
---
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
  name: default-envoy-ai-gateway
spec:
  gatewayClassName: default-envoy-ai-gateway
  listeners:
    - name: http
      protocol: HTTP
      port: 80
---
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIGatewayRoute
metadata:
  name: default-envoy-ai-gateway
spec:
  schema:
    name: OpenAI
  targetRefs:
    - name: default-envoy-ai-gateway
      kind: Gateway
      group: gateway.networking.k8s.io
  rules:
    - matches:
        - headers:
            - type: Exact
              name: x-ai-eg-model
              value: qwen2-0--5b
      backendRefs:
        - name: qwen2-0--5b
---
apiVersion: aigateway.envoyproxy.io/v1alpha1
kind: AIServiceBackend
metadata:
  name: qwen2-0--5b
spec:
  timeouts:
    request: 3m
  schema:
    name: OpenAI
  backendRef:
    name: qwen2-0--5b-lb
    kind: Service
    port: 8080
21 changes: 21 additions & 0 deletions docs/examples/serverless/scaled-object.yaml
@@ -0,0 +1,21 @@
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: qwen2-0--5b-scaler
  namespace: default
spec:
  scaleTargetRef:
    apiVersion: inference.llmaz.io/v1alpha1
    kind: Playground
    name: qwen2-0--5b
  pollingInterval: 30
  cooldownPeriod: 50
  minReplicaCount: 0
  maxReplicaCount: 3
  triggers:
    - type: prometheus
      metadata:
        serverAddress: http://prometheus-operated.llmaz-system.svc.cluster.local:9090
        metricName: llamacpp:requests_processing
        query: sum(llamacpp:requests_processing)
        threshold: "0.2"
18 changes: 18 additions & 0 deletions docs/examples/serverless/service-monitor.yaml
@@ -0,0 +1,18 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: qwen2-0--5b-lb-monitor
  namespace: llmaz-system
  labels:
    control-plane: controller-manager
    app.kubernetes.io/name: servicemonitor
spec:
  namespaceSelector:
    any: true
  selector:
    matchLabels:
      llmaz.io/model-name: qwen2-0--5b
  endpoints:
    - port: http
      path: /metrics
      scheme: http
5 changes: 3 additions & 2 deletions pkg/controller/inference/service_controller.go
@@ -131,7 +131,7 @@ func (r *ServiceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
	}

	// Create a service for the leader pods of the lws for loadbalancing.
	if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service); err != nil {
	if err := CreateServiceIfNotExists(ctx, r.Client, r.Scheme, service, models); err != nil {
		return ctrl.Result{}, err
	}

@@ -419,7 +419,7 @@ func setControllerReferenceForWorkload(owner metav1.Object, lws *applyconfigurat
	return nil
}

func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service) error {
func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Scheme *runtime.Scheme, service *inferenceapi.Service, model []*coreapi.OpenModel) error {
	log := ctrl.LoggerFrom(ctx)
	// The load balancing service name.
	svcName := service.Name + "-lb"
@@ -433,6 +433,7 @@ func CreateServiceIfNotExists(ctx context.Context, k8sClient client.Client, Sche
		ObjectMeta: metav1.ObjectMeta{
			Name:      svcName,
			Namespace: service.Namespace,
			Labels:    modelLabels(model[0]),
		},
		Spec: corev1.ServiceSpec{
			Ports: []corev1.ServicePort{