Diffstat (limited to 'charts/llm-app')
-rw-r--r--  charts/llm-app/Chart.yaml                      6
-rw-r--r--  charts/llm-app/templates/_helpers.tpl          8
-rw-r--r--  charts/llm-app/templates/deployment.yaml      76
-rw-r--r--  charts/llm-app/templates/ingress.yaml         19
-rw-r--r--  charts/llm-app/templates/service.yaml         13
-rw-r--r--  charts/llm-app/templates/servicemonitor.yaml  19
-rw-r--r--  charts/llm-app/templates/smoketest-job.yaml   32
-rw-r--r--  charts/llm-app/values.yaml                    51
8 files changed, 224 insertions, 0 deletions
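
A minimal install sketch for context (assuming a local cluster that already has an nginx ingress controller and the Prometheus Operator CRDs; the release name "llm" and namespace "llm" are illustrative):

    helm install llm charts/llm-app --namespace llm --create-namespace
    kubectl -n llm get pods -w    # first boot downloads weights; expect minutes, not seconds
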
diff --git a/charts/llm-app/Chart.yaml b/charts/llm-app/Chart.yaml
new file mode 100644
index 0000000..e0747df
--- /dev/null
+++ b/charts/llm-app/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: llm-app
+description: OpenAI-compatible LLM server (vLLM CPU) with Prometheus metrics
+type: application
+version: 0.1.0
+appVersion: "latest"
diff --git a/charts/llm-app/templates/_helpers.tpl b/charts/llm-app/templates/_helpers.tpl
new file mode 100644
index 0000000..8b104de
--- /dev/null
+++ b/charts/llm-app/templates/_helpers.tpl
@@ -0,0 +1,8 @@
+{{- define "llm-app.fullname" -}}
+{{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{- define "llm-app.selectorLabels" -}}
+app.kubernetes.io/name: {{ .Chart.Name }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end -}}
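
A quick render check for the helpers above, as a sketch assuming Helm 3 and an illustrative release name "demo":

    helm template demo charts/llm-app | grep 'demo-llm-app' | head
    # "llm-app.fullname" renders as <release>-<chart>, truncated to 63 chars for the DNS label limit
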
diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml
new file mode 100644
index 0000000..12677b5
--- /dev/null
+++ b/charts/llm-app/templates/deployment.yaml
@@ -0,0 +1,76 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+spec:
+  replicas: {{ .Values.replicaCount }}
+  selector:
+    matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels: {{- include "llm-app.selectorLabels" . | nindent 8 }}
+    spec:
+      containers:
+        - name: vllm-server
+          # Image entrypoint is already `vllm serve`; args start with the model tag.
+          image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          args:
+            - {{ .Values.model.name | quote }}
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - {{ .Values.server.port | quote }}
+            - "--served-model-name"
+            - {{ .Values.model.alias | quote }}
+            - "--max-model-len"
+            - {{ .Values.model.maxModelLen | quote }}
+            - "--dtype"
+            - {{ .Values.model.dtype | quote }}
+            {{- with .Values.server.extraArgs }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          env:
+            - name: HF_HOME
+              value: /cache/huggingface
+            - name: VLLM_CPU_KVCACHE_SPACE
+              value: "2"
+            {{- if gt (int .Values.server.ompThreads) 0 }}
+            - name: OMP_NUM_THREADS
+              value: {{ .Values.server.ompThreads | quote }}
+            {{- end }}
+          ports:
+            - name: http
+              containerPort: {{ .Values.server.port }}
+              protocol: TCP
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            # vLLM CPU cold-start is ~2 min + HF download on first boot.
+            initialDelaySeconds: 60
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 180
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 600
+            periodSeconds: 30
+            timeoutSeconds: 5
+            failureThreshold: 6
+          resources: {{- toYaml .Values.resources | nindent 12 }}
+          volumeMounts:
+            - name: cache
+              mountPath: /cache
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: cache
+          emptyDir:
+            sizeLimit: {{ .Values.modelCache.sizeLimit }}
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
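
The readiness budget above is generous (60s delay + 180 probes * 10s, roughly 31 min) to cover the first-boot weight download. A sketch for watching a cold start, assuming release "llm" in namespace "llm":

    kubectl -n llm rollout status deploy/llm-llm-app --timeout=35m
    kubectl -n llm logs deploy/llm-llm-app -f    # HF download and engine init log here before /health returns 200
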
diff --git a/charts/llm-app/templates/ingress.yaml b/charts/llm-app/templates/ingress.yaml
new file mode 100644
index 0000000..f3a6ded
--- /dev/null
+++ b/charts/llm-app/templates/ingress.yaml
@@ -0,0 +1,19 @@
+{{- if .Values.ingress.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+spec:
+  ingressClassName: {{ .Values.ingress.className }}
+  rules:
+    - host: {{ .Values.ingress.host | quote }}
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: {{ include "llm-app.fullname" . }}
+                port:
+                  number: {{ .Values.service.port }}
+{{- end }}
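
llm.localtest.me resolves to 127.0.0.1, so with ingress-nginx published on localhost the API is reachable without /etc/hosts edits (sketch; plain HTTP on port 80 assumed):

    curl -s http://llm.localtest.me/v1/models
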
diff --git a/charts/llm-app/templates/service.yaml b/charts/llm-app/templates/service.yaml
new file mode 100644
index 0000000..6350996
--- /dev/null
+++ b/charts/llm-app/templates/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+  labels: {{- include "llm-app.selectorLabels" . | nindent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - name: http
+      port: {{ .Values.service.port }}
+      targetPort: http
+      protocol: TCP
+  selector: {{- include "llm-app.selectorLabels" . | nindent 4 }}
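
Without an ingress, a port-forward against this Service works as well (sketch; release "llm" in namespace "llm" assumed):

    kubectl -n llm port-forward svc/llm-llm-app 8000:8000 &
    curl -s http://127.0.0.1:8000/health    # empty 200 once the engine is up
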
diff --git a/charts/llm-app/templates/servicemonitor.yaml b/charts/llm-app/templates/servicemonitor.yaml
new file mode 100644
index 0000000..264e766
--- /dev/null
+++ b/charts/llm-app/templates/servicemonitor.yaml
@@ -0,0 +1,19 @@
+{{- if .Values.monitoring.serviceMonitor.enabled -}}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+  {{- with .Values.monitoring.serviceMonitor.labels }}
+  labels: {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  selector:
+    matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
+  endpoints:
+    - port: http
+      path: /metrics
+      interval: {{ .Values.monitoring.serviceMonitor.interval }}
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace }}
+{{- end }}
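
Once Prometheus scrapes the target, vLLM's own metrics become queryable. A sketch: metric names such as vllm:num_requests_running match recent vLLM releases, but verify against the live /metrics output:

    curl -s http://llm.localtest.me/metrics | grep '^vllm:' | head
    # example PromQL: rate(vllm:request_success_total[5m])
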
diff --git a/charts/llm-app/templates/smoketest-job.yaml b/charts/llm-app/templates/smoketest-job.yaml
new file mode 100644
index 0000000..ac97f33
--- /dev/null
+++ b/charts/llm-app/templates/smoketest-job.yaml
@@ -0,0 +1,32 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "llm-app.fullname" . }}-smoketest
+  annotations:
+    "helm.sh/hook": post-install,post-upgrade
+    "helm.sh/hook-weight": "10"
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+spec:
+  backoffLimit: 2
+  activeDeadlineSeconds: 240
+  ttlSecondsAfterFinished: 600
+  template:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: curl
+          image: curlimages/curl:8.10.1
+          command: ["/bin/sh", "-euc"]
+          args:
+            - |
+              ENDPOINT="http://{{ include "llm-app.fullname" . }}:{{ .Values.service.port }}"
+              MODEL={{ .Values.model.alias | quote }}
+              echo "smoketest: GET $ENDPOINT/v1/models"
+              out=$(curl -fsS --max-time 60 "$ENDPOINT/v1/models")
+              echo "$out" | grep -q "\"$MODEL\"" || { echo "FAIL: $MODEL not listed in /v1/models"; echo "$out"; exit 1; }
+              echo "smoketest: POST $ENDPOINT/v1/chat/completions"
+              resp=$(curl -fsS --max-time 90 "$ENDPOINT/v1/chat/completions" \
+                -H "Content-Type: application/json" \
+                -d "{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Reply with just: pong\"}],\"max_tokens\":8,\"temperature\":0}")
+              echo "$resp" | grep -q '"content"' || { echo "FAIL: no content in response"; echo "$resp"; exit 1; }
+              echo "OK"
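
The before-hook-creation delete policy keeps a failed hook Job around for inspection; a sketch for debugging it (release "llm" assumed):

    kubectl -n llm logs job/llm-llm-app-smoketest
    helm upgrade llm charts/llm-app -n llm    # re-running the hook replaces the old Job
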
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
+replicaCount: 1
+
+image:
+  # vLLM CPU-only image (no CUDA, works on AVX2+).
+  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+  tag: latest
+  # Optional. If set, used in place of `tag` to pin the image by content.
+  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
+  digest: ""
+  pullPolicy: IfNotPresent
+
+# vLLM pulls model weights from HuggingFace into the cache volume on first boot.
+# `name` is the HF repo id; `alias` is the client-facing model id, passed as `--served-model-name`.
+model:
+  name: "Qwen/Qwen2.5-0.5B-Instruct"
+  alias: "Qwen2.5-0.5B-Instruct"
+  maxModelLen: 2048
+  dtype: "bfloat16"
+
+server:
+  port: 8000
+  # OMP threads for the CPU backend; 0 = autodetect.
+  ompThreads: 0
+  extraArgs: []
+
+resources:
+  requests:
+    cpu: "500m"
+    memory: "1Gi"
+  limits:
+    cpu: "2"
+    memory: "3Gi"
+
+service:
+  type: ClusterIP
+  port: 8000
+
+ingress:
+  enabled: true
+  className: nginx
+  host: llm.localtest.me
+
+monitoring:
+  serviceMonitor:
+    enabled: true
+    interval: 15s
+    labels:
+      release: kube-prometheus-stack
+
+modelCache:
+  sizeLimit: 10Gi
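
Typical overrides, sketched: swapping the model and pinning the image by digest (crane is one way to resolve a digest; the model names below are illustrative):

    crane digest public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
    helm upgrade llm charts/llm-app -n llm \
      --set model.name=Qwen/Qwen2.5-1.5B-Instruct \
      --set model.alias=Qwen2.5-1.5B-Instruct \
      --set image.digest=sha256:<paste-resolved-digest>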