summaryrefslogtreecommitdiff
path: root/charts/llm-app/values.yaml
diff options
context:
space:
mode:
author    Your Name <you@example.com>  2026-04-26 21:02:47 +0800
committer Your Name <you@example.com>  2026-04-26 21:02:47 +0800
commitd3e770254de0bb301815ca87257c8b1a357d06c4 (patch)
tree358c814be2a06b9e2009905f14938243286b8d82 /charts/llm-app/values.yaml
Diffstat (limited to 'charts/llm-app/values.yaml')
-rw-r--r--  charts/llm-app/values.yaml  51
1 file changed, 51 insertions, 0 deletions
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
# Number of pod replicas for the vLLM deployment.
replicaCount: 1

image:
  # vLLM CPU-only image (no CUDA, works on AVX2+).
  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
  # Quoted so numeric-looking tags (e.g. "0.10") stay strings instead of
  # parsing as YAML floats. Prefer pinning via `digest` over "latest".
  tag: "latest"
  # Optional. If set, used in place of `tag` to pin the image by content.
  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
  digest: ""
  pullPolicy: IfNotPresent
11
# Model configuration. vLLM downloads the weights from HuggingFace on first
# boot into the cache volume; `name` is the HF repo id, and it doubles as
# `--served-model-name` unless `alias` overrides it.
model:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  alias: "Qwen2.5-0.5B-Instruct"
  # Maximum context length (tokens) passed to the server.
  maxModelLen: 2048
  dtype: "bfloat16"
19
server:
  # Container port the vLLM HTTP server listens on.
  port: 8000
  # OMP thread count for the CPU backend; 0 = autodetect.
  ompThreads: 0
  # Additional CLI flags appended to the vLLM server command line.
  extraArgs: []
25
# Pod resource requests/limits. CPU values are quoted so millicore strings
# ("500m") and whole-core strings ("2") are never retyped by tooling.
resources:
  requests:
    cpu: "500m"
    memory: "1Gi"
  limits:
    cpu: "2"
    memory: "3Gi"
33
# In-cluster Service exposing the server port.
service:
  type: ClusterIP
  port: 8000
37
ingress:
  enabled: true
  className: nginx
  # localtest.me subdomains resolve to 127.0.0.1, convenient for local dev.
  host: llm.localtest.me
42
monitoring:
  serviceMonitor:
    enabled: true
    # Scrape interval for the ServiceMonitor.
    interval: 15s
    # Labels the Prometheus operator uses to select this ServiceMonitor;
    # must match the selector of the installed Prometheus (here, the
    # kube-prometheus-stack release).
    labels:
      release: kube-prometheus-stack
49
# Volume holding downloaded model weights (see `model` above).
modelCache:
  sizeLimit: 10Gi