Diffstat (limited to 'charts/llm-app')
-rw-r--r--  charts/llm-app/Chart.yaml                      6
-rw-r--r--  charts/llm-app/templates/_helpers.tpl          8
-rw-r--r--  charts/llm-app/templates/deployment.yaml      76
-rw-r--r--  charts/llm-app/templates/ingress.yaml         19
-rw-r--r--  charts/llm-app/templates/service.yaml         13
-rw-r--r--  charts/llm-app/templates/servicemonitor.yaml  19
-rw-r--r--  charts/llm-app/templates/smoketest-job.yaml   32
-rw-r--r--  charts/llm-app/values.yaml                    51
8 files changed, 224 insertions, 0 deletions
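
A minimal install sketch for context (assuming a local cluster that already has an nginx ingress controller and the Prometheus Operator CRDs; the release name "llm" and namespace "llm" are illustrative):

    helm install llm charts/llm-app --namespace llm --create-namespace
    kubectl -n llm get pods -w    # first boot downloads weights; expect minutes, not seconds
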
diff --git a/charts/llm-app/Chart.yaml b/charts/llm-app/Chart.yaml
new file mode 100644
index 0000000..e0747df
--- /dev/null
+++ b/charts/llm-app/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: llm-app
+description: OpenAI-compatible LLM server (vLLM CPU) with Prometheus metrics
+type: application
+version: 0.1.0
+appVersion: "latest"
diff --git a/charts/llm-app/templates/_helpers.tpl b/charts/llm-app/templates/_helpers.tpl
new file mode 100644
index 0000000..8b104de
--- /dev/null
+++ b/charts/llm-app/templates/_helpers.tpl
@@ -0,0 +1,8 @@
+{{- define "llm-app.fullname" -}}
+{{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{- define "llm-app.selectorLabels" -}}
+app.kubernetes.io/name: {{ .Chart.Name }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end -}}
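
A quick render check for the helpers above, as a sketch assuming Helm 3 and an illustrative release name "demo":

    helm template demo charts/llm-app | grep 'demo-llm-app' | head
    # "llm-app.fullname" renders as <release>-<chart>, truncated to 63 chars for the DNS label limit
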
diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml
new file mode 100644
index 0000000..12677b5
--- /dev/null
+++ b/charts/llm-app/templates/deployment.yaml
@@ -0,0 +1,76 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+spec:
+  replicas: {{ .Values.replicaCount }}
+  selector:
+    matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels: {{- include "llm-app.selectorLabels" . | nindent 8 }}
+    spec:
+      containers:
+        - name: vllm-server
+          # Image entrypoint is already `vllm serve`; args start with the model tag.
+          image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          args:
+            - {{ .Values.model.name | quote }}
+            - "--host"
+            - "0.0.0.0"
+            - "--port"
+            - {{ .Values.server.port | quote }}
+            - "--served-model-name"
+            - {{ .Values.model.alias | quote }}
+            - "--max-model-len"
+            - {{ .Values.model.maxModelLen | quote }}
+            - "--dtype"
+            - {{ .Values.model.dtype | quote }}
+            {{- with .Values.server.extraArgs }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          env:
+            - name: HF_HOME
+              value: /cache/huggingface
+            - name: VLLM_CPU_KVCACHE_SPACE
+              value: "2"
+            {{- if gt (int .Values.server.ompThreads) 0 }}
+            - name: OMP_NUM_THREADS
+              value: {{ .Values.server.ompThreads | quote }}
+            {{- end }}
+          ports:
+            - name: http
+              containerPort: {{ .Values.server.port }}
+              protocol: TCP
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            # vLLM CPU cold-start is ~2 min + HF download on first boot.
+            initialDelaySeconds: 60
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 180
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 600
+            periodSeconds: 30
+            timeoutSeconds: 5
+            failureThreshold: 6
+          resources: {{- toYaml .Values.resources | nindent 12 }}
+          volumeMounts:
+            - name: cache
+              mountPath: /cache
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: cache
+          emptyDir:
+            sizeLimit: {{ .Values.modelCache.sizeLimit }}
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
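
The readiness budget above is generous (60s delay + 180 probes * 10s, roughly 31 min) to cover the first-boot weight download. A sketch for watching a cold start, assuming release "llm" in namespace "llm":

    kubectl -n llm rollout status deploy/llm-llm-app --timeout=35m
    kubectl -n llm logs deploy/llm-llm-app -f    # HF download and engine init log here before /health returns 200
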
diff --git a/charts/llm-app/templates/ingress.yaml b/charts/llm-app/templates/ingress.yaml
new file mode 100644
index 0000000..f3a6ded
--- /dev/null
+++ b/charts/llm-app/templates/ingress.yaml
@@ -0,0 +1,19 @@
+{{- if .Values.ingress.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+spec:
+  ingressClassName: {{ .Values.ingress.className }}
+  rules:
+    - host: {{ .Values.ingress.host | quote }}
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: {{ include "llm-app.fullname" . }}
+                port:
+                  number: {{ .Values.service.port }}
+{{- end }}
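
llm.localtest.me resolves to 127.0.0.1, so with ingress-nginx published on localhost the API is reachable without /etc/hosts edits (sketch; plain HTTP on port 80 assumed):

    curl -s http://llm.localtest.me/v1/models
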
diff --git a/charts/llm-app/templates/service.yaml b/charts/llm-app/templates/service.yaml
new file mode 100644
index 0000000..6350996
--- /dev/null
+++ b/charts/llm-app/templates/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+  labels: {{- include "llm-app.selectorLabels" . | nindent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - name: http
+      port: {{ .Values.service.port }}
+      targetPort: http
+      protocol: TCP
+  selector: {{- include "llm-app.selectorLabels" . | nindent 4 }}
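
Without an ingress, a port-forward against this Service works as well (sketch; release "llm" in namespace "llm" assumed):

    kubectl -n llm port-forward svc/llm-llm-app 8000:8000 &
    curl -s http://127.0.0.1:8000/health    # empty 200 once the engine is up
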
diff --git a/charts/llm-app/templates/servicemonitor.yaml b/charts/llm-app/templates/servicemonitor.yaml
new file mode 100644
index 0000000..264e766
--- /dev/null
+++ b/charts/llm-app/templates/servicemonitor.yaml
@@ -0,0 +1,19 @@
+{{- if .Values.monitoring.serviceMonitor.enabled -}}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "llm-app.fullname" . }}
+  {{- with .Values.monitoring.serviceMonitor.labels }}
+  labels: {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  selector:
+    matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
+  endpoints:
+    - port: http
+      path: /metrics
+      interval: {{ .Values.monitoring.serviceMonitor.interval }}
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace }}
+{{- end }}
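
Once Prometheus scrapes the target, vLLM's own metrics become queryable. A sketch: metric names such as vllm:num_requests_running match recent vLLM releases, but verify against the live /metrics output:

    curl -s http://llm.localtest.me/metrics | grep '^vllm:' | head
    # example PromQL: rate(vllm:request_success_total[5m])
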
diff --git a/charts/llm-app/templates/smoketest-job.yaml b/charts/llm-app/templates/smoketest-job.yaml
new file mode 100644
index 0000000..ac97f33
--- /dev/null
+++ b/charts/llm-app/templates/smoketest-job.yaml
@@ -0,0 +1,32 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "llm-app.fullname" . }}-smoketest
+  annotations:
+    "helm.sh/hook": post-install,post-upgrade
+    "helm.sh/hook-weight": "10"
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+spec:
+  backoffLimit: 2
+  activeDeadlineSeconds: 240
+  ttlSecondsAfterFinished: 600
+  template:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: curl
+          image: curlimages/curl:8.10.1
+          command: ["/bin/sh", "-euc"]
+          args:
+            - |
+              ENDPOINT="http://{{ include "llm-app.fullname" . }}:{{ .Values.service.port }}"
+              MODEL={{ .Values.model.alias | quote }}
+              echo "smoketest: GET $ENDPOINT/v1/models"
+              out=$(curl -fsS --max-time 60 "$ENDPOINT/v1/models")
+              echo "$out" | grep -q "\"$MODEL\"" || { echo "FAIL: $MODEL not listed in /v1/models"; echo "$out"; exit 1; }
+              echo "smoketest: POST $ENDPOINT/v1/chat/completions"
+              resp=$(curl -fsS --max-time 90 "$ENDPOINT/v1/chat/completions" \
+                -H "Content-Type: application/json" \
+                -d "{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Reply with just: pong\"}],\"max_tokens\":8,\"temperature\":0}")
+              echo "$resp" | grep -q '"content"' || { echo "FAIL: no content in response"; echo "$resp"; exit 1; }
+              echo "OK"
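
The before-hook-creation delete policy keeps a failed hook Job around for inspection; a sketch for debugging it (release "llm" assumed):

    kubectl -n llm logs job/llm-llm-app-smoketest
    helm upgrade llm charts/llm-app -n llm    # re-running the hook replaces the old Job
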
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
+replicaCount: 1
+
+image:
+  # vLLM CPU-only image (no CUDA, works on AVX2+).
+  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+  tag: latest
+  # Optional. If set, used in place of `tag` to pin the image by content.
+  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
+  digest: ""
+  pullPolicy: IfNotPresent
+
+# vLLM pulls model weights from HuggingFace into the cache volume on first boot.
+# `name` is the HF repo id; `alias` is the client-facing model id, passed as `--served-model-name`.
+model:
+  name: "Qwen/Qwen2.5-0.5B-Instruct"
+  alias: "Qwen2.5-0.5B-Instruct"
+  maxModelLen: 2048
+  dtype: "bfloat16"
+
+server:
+  port: 8000
+  # OMP threads for the CPU backend; 0 = autodetect.
+  ompThreads: 0
+  extraArgs: []
+
+resources:
+  requests:
+    cpu: "500m"
+    memory: "1Gi"
+  limits:
+    cpu: "2"
+    memory: "3Gi"
+
+service:
+  type: ClusterIP
+  port: 8000
+
+ingress:
+  enabled: true
+  className: nginx
+  host: llm.localtest.me
+
+monitoring:
+  serviceMonitor:
+    enabled: true
+    interval: 15s
+    labels:
+      release: kube-prometheus-stack
+
+modelCache:
+  sizeLimit: 10Gi
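
Typical overrides, sketched: swapping the model and pinning the image by digest (crane is one way to resolve a digest; the model names below are illustrative):

    crane digest public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
    helm upgrade llm charts/llm-app -n llm \
      --set model.name=Qwen/Qwen2.5-1.5B-Instruct \
      --set model.alias=Qwen2.5-1.5B-Instruct \
      --set image.digest=sha256:<paste-resolved-digest>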