diff options
| author | Your Name <you@example.com> | 2026-04-26 21:02:47 +0800 |
|---|---|---|
| committer | Your Name <you@example.com> | 2026-04-26 21:02:47 +0800 |
| commit | d3e770254de0bb301815ca87257c8b1a357d06c4 (patch) | |
| tree | 358c814be2a06b9e2009905f14938243286b8d82 /charts | |
Diffstat (limited to 'charts')
| -rw-r--r-- | charts/llm-app/Chart.yaml | 6 | ||||
| -rw-r--r-- | charts/llm-app/templates/_helpers.tpl | 8 | ||||
| -rw-r--r-- | charts/llm-app/templates/deployment.yaml | 76 | ||||
| -rw-r--r-- | charts/llm-app/templates/ingress.yaml | 19 | ||||
| -rw-r--r-- | charts/llm-app/templates/service.yaml | 13 | ||||
| -rw-r--r-- | charts/llm-app/templates/servicemonitor.yaml | 19 | ||||
| -rw-r--r-- | charts/llm-app/templates/smoketest-job.yaml | 32 | ||||
| -rw-r--r-- | charts/llm-app/values.yaml | 51 |
8 files changed, 224 insertions, 0 deletions
diff --git a/charts/llm-app/Chart.yaml b/charts/llm-app/Chart.yaml new file mode 100644 index 0000000..e0747df --- /dev/null +++ b/charts/llm-app/Chart.yaml | |||
| @@ -0,0 +1,6 @@ | |||
apiVersion: v2
name: llm-app
description: OpenAI-compatible LLM server (vLLM CPU) with Prometheus metrics
type: application
# Chart version (SemVer) — bump on every change to templates or values.
version: 0.1.0
# Version of the deployed app; "latest" mirrors the floating image tag in values.yaml.
appVersion: "latest"
diff --git a/charts/llm-app/templates/_helpers.tpl b/charts/llm-app/templates/_helpers.tpl new file mode 100644 index 0000000..8b104de --- /dev/null +++ b/charts/llm-app/templates/_helpers.tpl | |||
| @@ -0,0 +1,8 @@ | |||
{{/*
Fully-qualified resource name: "<release>-<chart>", truncated to 63
characters (the DNS-label limit) with any trailing "-" stripped.
An optional .Values.fullnameOverride, when set, replaces the computed
name; when unset or empty the original behavior is unchanged.
*/}}
{{- define "llm-app.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}

{{/*
Selector labels shared by Deployment, Service and ServiceMonitor.
Keep these stable across releases: Deployment selectors are immutable.
*/}}
{{- define "llm-app.selectorLabels" -}}
app.kubernetes.io/name: {{ .Chart.Name }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end -}}
diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml new file mode 100644 index 0000000..12677b5 --- /dev/null +++ b/charts/llm-app/templates/deployment.yaml | |||
| @@ -0,0 +1,76 @@ | |||
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "llm-app.fullname" . }}
  # Label the Deployment object itself (the pod template is already
  # labeled), consistent with service.yaml, so label-based selectors
  # (`kubectl get -l app.kubernetes.io/instance=...`) also find it.
  labels: {{- include "llm-app.selectorLabels" . | nindent 4 }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels: {{- include "llm-app.selectorLabels" . | nindent 8 }}
    spec:
      containers:
        - name: vllm-server
          # Image entrypoint is already `vllm serve`; args start with the model tag.
          # When image.digest is set, the image is pinned by content and the tag is ignored.
          image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          args:
            - {{ .Values.model.name | quote }}
            - "--host"
            - "0.0.0.0"
            - "--port"
            - {{ .Values.server.port | quote }}
            - "--served-model-name"
            - {{ .Values.model.alias | quote }}
            - "--max-model-len"
            - {{ .Values.model.maxModelLen | quote }}
            - "--dtype"
            - {{ .Values.model.dtype | quote }}
            {{- with .Values.server.extraArgs }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          env:
            # Weights download into the `cache` emptyDir; restarts of the same
            # pod (same node) reuse them instead of re-downloading.
            - name: HF_HOME
              value: /cache/huggingface
            # KV-cache budget for the vLLM CPU backend, in GiB.
            - name: VLLM_CPU_KVCACHE_SPACE
              value: "2"
            {{- if gt (int .Values.server.ompThreads) 0 }}
            # Only set when > 0; 0 (the default) lets the runtime autodetect.
            - name: OMP_NUM_THREADS
              value: {{ .Values.server.ompThreads | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: {{ .Values.server.port }}
              protocol: TCP
          readinessProbe:
            httpGet:
              path: /health
              port: http
            # vLLM CPU cold-start is ~2 min + HF download on first boot.
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 180
          livenessProbe:
            httpGet:
              path: /health
              port: http
            # Generous delay so liveness never kills a pod that is still
            # downloading weights on first boot.
            initialDelaySeconds: 600
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 6
          resources: {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            - name: cache
              mountPath: /cache
            - name: shm
              mountPath: /dev/shm
      volumes:
        - name: cache
          emptyDir:
            sizeLimit: {{ .Values.modelCache.sizeLimit }}
        # NOTE(review): tmpfs mounted at /dev/shm — presumably sized up from
        # the small container default for torch shared memory; confirm 1Gi
        # suffices for the chosen model.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
diff --git a/charts/llm-app/templates/ingress.yaml b/charts/llm-app/templates/ingress.yaml new file mode 100644 index 0000000..f3a6ded --- /dev/null +++ b/charts/llm-app/templates/ingress.yaml | |||
| @@ -0,0 +1,19 @@ | |||
{{- if .Values.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "llm-app.fullname" . }}
  # Consistent with service.yaml: label every chart-owned object.
  labels: {{- include "llm-app.selectorLabels" . | nindent 4 }}
spec:
  # Omit ingressClassName entirely when className is empty instead of
  # rendering a null value — lets the cluster's default class apply.
  {{- with .Values.ingress.className }}
  ingressClassName: {{ . }}
  {{- end }}
  rules:
    - host: {{ .Values.ingress.host | quote }}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: {{ include "llm-app.fullname" . }}
                port:
                  number: {{ .Values.service.port }}
{{- end }}
diff --git a/charts/llm-app/templates/service.yaml b/charts/llm-app/templates/service.yaml new file mode 100644 index 0000000..6350996 --- /dev/null +++ b/charts/llm-app/templates/service.yaml | |||
| @@ -0,0 +1,13 @@ | |||
apiVersion: v1
kind: Service
metadata:
  name: {{ include "llm-app.fullname" . }}
  labels:
    {{- include "llm-app.selectorLabels" . | nindent 4 }}
spec:
  type: {{ .Values.service.type }}
  # Route to pods carrying the shared selector labels (see _helpers.tpl).
  selector:
    {{- include "llm-app.selectorLabels" . | nindent 4 }}
  ports:
    # Named port; the Deployment's probes and the ServiceMonitor refer
    # to it as "http" rather than by number.
    - name: http
      protocol: TCP
      port: {{ .Values.service.port }}
      targetPort: http
diff --git a/charts/llm-app/templates/servicemonitor.yaml b/charts/llm-app/templates/servicemonitor.yaml new file mode 100644 index 0000000..264e766 --- /dev/null +++ b/charts/llm-app/templates/servicemonitor.yaml | |||
| @@ -0,0 +1,19 @@ | |||
{{- if .Values.monitoring.serviceMonitor.enabled -}}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "llm-app.fullname" . }}
  {{- if .Values.monitoring.serviceMonitor.labels }}
  # Extra labels so the Prometheus Operator instance selects this monitor.
  labels:
    {{- toYaml .Values.monitoring.serviceMonitor.labels | nindent 4 }}
  {{- end }}
spec:
  # Restrict discovery to the release namespace.
  namespaceSelector:
    matchNames:
      - {{ .Release.Namespace }}
  # Match the chart's Service by its shared selector labels.
  selector:
    matchLabels:
      {{- include "llm-app.selectorLabels" . | nindent 6 }}
  endpoints:
    - port: http
      path: /metrics
      interval: {{ .Values.monitoring.serviceMonitor.interval }}
{{- end }}
diff --git a/charts/llm-app/templates/smoketest-job.yaml b/charts/llm-app/templates/smoketest-job.yaml new file mode 100644 index 0000000..ac97f33 --- /dev/null +++ b/charts/llm-app/templates/smoketest-job.yaml | |||
| @@ -0,0 +1,32 @@ | |||
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "llm-app.fullname" . }}-smoketest
  annotations:
    # Post-install/upgrade Helm hook; weight 10 runs it after default-weight hooks.
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "10"
    # Replace any leftover Job before re-running; delete it once it succeeds.
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  backoffLimit: 2
  # Hard cap on total runtime so a hung server cannot stall the release.
  activeDeadlineSeconds: 240
  ttlSecondsAfterFinished: 600
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: curl
          image: curlimages/curl:8.10.1
          # -e: exit on error, -u: unset vars are errors, -c: run the args script.
          command: ["/bin/sh", "-euc"]
          # Script: 1) verify the served model alias appears in /v1/models,
          # 2) issue one tiny deterministic chat completion and check that a
          # "content" field comes back. Talks to the chart's Service by name.
          args:
            - |
              ENDPOINT="http://{{ include "llm-app.fullname" . }}:{{ .Values.service.port }}"
              MODEL={{ .Values.model.alias | quote }}
              echo "smoketest: GET $ENDPOINT/v1/models"
              out=$(curl -fsS --max-time 60 "$ENDPOINT/v1/models")
              echo "$out" | grep -q "\"$MODEL\"" || { echo "FAIL: $MODEL not listed in /v1/models"; echo "$out"; exit 1; }
              echo "smoketest: POST $ENDPOINT/v1/chat/completions"
              resp=$(curl -fsS --max-time 90 "$ENDPOINT/v1/chat/completions" \
                -H "Content-Type: application/json" \
                -d "{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Reply with just: pong\"}],\"max_tokens\":8,\"temperature\":0}")
              echo "$resp" | grep -q '"content"' || { echo "FAIL: no content in response"; echo "$resp"; exit 1; }
              echo "OK"
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml new file mode 100644 index 0000000..96c5c9a --- /dev/null +++ b/charts/llm-app/values.yaml | |||
| @@ -0,0 +1,51 @@ | |||
replicaCount: 1

image:
  # vLLM CPU-only image (no CUDA, works on AVX2+).
  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
  tag: latest
  # Optional. If set, used in place of `tag` to pin the image by content.
  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
  digest: ""
  pullPolicy: IfNotPresent

# vLLM pulls model weights from HuggingFace at first boot into the cache volume.
# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides).
model:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  # Name clients use in API requests; the smoketest Job checks for it in /v1/models.
  alias: "Qwen2.5-0.5B-Instruct"
  # Context window cap passed as --max-model-len.
  maxModelLen: 2048
  dtype: "bfloat16"

server:
  # Container/listen port; also the default Service port below.
  port: 8000
  # OMP threads for the CPU backend; 0 = autodetect.
  ompThreads: 0
  # Additional raw CLI args appended to the vLLM command line.
  extraArgs: []

resources:
  requests:
    cpu: "500m"
    memory: "1Gi"
  limits:
    cpu: "2"
    memory: "3Gi"

service:
  type: ClusterIP
  port: 8000

ingress:
  enabled: true
  className: nginx
  # llm.localtest.me resolves to 127.0.0.1 — convenient for local clusters.
  host: llm.localtest.me

monitoring:
  serviceMonitor:
    enabled: true
    interval: 15s
    # Must match the Prometheus Operator's serviceMonitorSelector
    # (kube-prometheus-stack selects on its release label by default).
    labels:
      release: kube-prometheus-stack

modelCache:
  # Upper bound on the emptyDir holding downloaded HF weights.
  sizeLimit: 10Gi
