From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: hehe

---
 charts/llm-app/values.yaml | 51 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 charts/llm-app/values.yaml

(limited to 'charts/llm-app/values.yaml')

diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
+replicaCount: 1
+
+image:
+  # vLLM CPU-only image (no CUDA; runs on CPUs with AVX2 or newer).
+  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+  tag: latest
+  # Optional. When non-empty, pins the image by content digest and is used instead of `tag`.
+  # Example: "sha256:abc123...". Populate it with scripts/resolve-digests.sh.
+  digest: ""
+  pullPolicy: IfNotPresent
+
+# On first start, vLLM downloads the model weights from Hugging Face into the cache volume.
+# `name` is the Hugging Face repo ID (also the `--served-model-name` unless `alias` overrides it).
+model:
+  name: "Qwen/Qwen2.5-0.5B-Instruct"
+  alias: "Qwen2.5-0.5B-Instruct"
+  maxModelLen: 2048
+  dtype: "bfloat16"
+
+server:
+  port: 8000
+  # Number of OpenMP threads for the CPU backend; 0 = autodetect.
+  ompThreads: 0
+  extraArgs: []
+
+resources:
+  requests:
+    cpu: "500m"
+    memory: "1Gi"
+  limits:
+    cpu: "2"
+    memory: "3Gi"
+
+service:
+  type: ClusterIP
+  port: 8000
+
+ingress:
+  enabled: true
+  className: nginx
+  host: llm.localtest.me
+
+monitoring:
+  serviceMonitor:
+    enabled: true
+    interval: 15s
+    labels:
+      release: kube-prometheus-stack
+
+modelCache:
+  sizeLimit: 10Gi
-- 
cgit