summaryrefslogtreecommitdiff
path: root/charts/llm-app/values.yaml
diff options
context:
space:
mode:
author    Your Name <you@example.com>  2026-04-26 21:02:47 +0800
committer Your Name <you@example.com>  2026-04-26 21:02:47 +0800
commitd3e770254de0bb301815ca87257c8b1a357d06c4 (patch)
tree358c814be2a06b9e2009905f14938243286b8d82 /charts/llm-app/values.yaml
Diffstat (limited to 'charts/llm-app/values.yaml')
-rw-r--r--  charts/llm-app/values.yaml  51
1 file changed, 51 insertions, 0 deletions
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
# Number of pod replicas for the vLLM deployment.
replicaCount: 1

image:
  # vLLM CPU-only image (no CUDA, works on AVX2+).
  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
  # Quoted so numeric-looking tags (e.g. "0.10") stay strings instead of
  # parsing as YAML floats. Prefer pinning via `digest` over "latest".
  tag: "latest"
  # Optional. If set, used in place of `tag` to pin the image by content.
  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
  digest: ""
  pullPolicy: IfNotPresent
11
# Model configuration. vLLM downloads the weights from HuggingFace on first
# boot into the cache volume; `name` is the HF repo id, and it doubles as
# `--served-model-name` unless `alias` overrides it.
model:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  alias: "Qwen2.5-0.5B-Instruct"
  # Maximum context length (tokens) passed to the server.
  maxModelLen: 2048
  dtype: "bfloat16"
19
server:
  # Container port the vLLM HTTP server listens on.
  port: 8000
  # OMP thread count for the CPU backend; 0 = autodetect.
  ompThreads: 0
  # Additional CLI flags appended to the vLLM server command line.
  extraArgs: []
25
# Pod resource requests/limits. CPU values are quoted so millicore strings
# ("500m") and whole-core strings ("2") are never retyped by tooling.
resources:
  requests:
    cpu: "500m"
    memory: "1Gi"
  limits:
    cpu: "2"
    memory: "3Gi"
33
# In-cluster Service exposing the server port.
service:
  type: ClusterIP
  port: 8000
37
ingress:
  enabled: true
  className: nginx
  # localtest.me subdomains resolve to 127.0.0.1, convenient for local dev.
  host: llm.localtest.me
42
monitoring:
  serviceMonitor:
    enabled: true
    # Scrape interval for the ServiceMonitor.
    interval: 15s
    # Labels the Prometheus operator uses to select this ServiceMonitor;
    # must match the selector of the installed Prometheus (here, the
    # kube-prometheus-stack release).
    labels:
      release: kube-prometheus-stack
49
# Volume holding downloaded model weights (see `model` above).
modelCache:
  sizeLimit: 10Gi