From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: hehe

---
Review notes (placed after the three-dash marker, so they are not part of
the commit message or of the file content below):

* image.tag is "latest" while image.pullPolicy is IfNotPresent: a node that
  already has the image cached will keep using it after "latest" moves.
  Filling image.digest (see the comment in the hunk) pins the image.
* ingress.enabled and monitoring.serviceMonitor.enabled default to true.
  The templates are not part of this patch; presumably they render an
  Ingress and a ServiceMonitor, so confirm the target cluster provides the
  "nginx" ingress class and the ServiceMonitor CRD.
* The subject line "hehe" does not describe the change.

 charts/llm-app/values.yaml | 51 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 charts/llm-app/values.yaml

(limited to 'charts/llm-app/values.yaml')

diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
+replicaCount: 1
+
+image:
+  # vLLM CPU-only image (no CUDA, works on AVX2+).
+  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+  tag: latest
+  # Optional. If set, used in place of `tag` to pin the image by content.
+  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
+  digest: ""
+  pullPolicy: IfNotPresent
+
+# vLLM pulls model weights from HuggingFace at first boot into the cache volume.
+# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides).
+model:
+  name: "Qwen/Qwen2.5-0.5B-Instruct"
+  alias: "Qwen2.5-0.5B-Instruct"
+  maxModelLen: 2048
+  dtype: "bfloat16"
+
+server:
+  port: 8000
+  # OMP threads for the CPU backend; 0 = autodetect.
+  ompThreads: 0
+  extraArgs: []
+
+resources:
+  requests:
+    cpu: "500m"
+    memory: "1Gi"
+  limits:
+    cpu: "2"
+    memory: "3Gi"
+
+service:
+  type: ClusterIP
+  port: 8000
+
+ingress:
+  enabled: true
+  className: nginx
+  host: llm.localtest.me
+
+monitoring:
+  serviceMonitor:
+    enabled: true
+    interval: 15s
+    labels:
+      release: kube-prometheus-stack
+
+modelCache:
+  sizeLimit: 10Gi
-- 
cgit