---
# Helm values for a CPU-only vLLM deployment serving a small HF model.

replicaCount: 1

image:
  # vLLM CPU-only image (no CUDA, works on AVX2+).
  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
  tag: "latest"
  # Optional. If set, used in place of `tag` to pin the image by content.
  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
  digest: ""
  pullPolicy: IfNotPresent

# vLLM pulls model weights from HuggingFace at first boot into the cache volume.
# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides).
model:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  alias: "Qwen2.5-0.5B-Instruct"
  maxModelLen: 2048
  dtype: "bfloat16"

server:
  port: 8000
  # OMP threads for the CPU backend; 0 = autodetect.
  ompThreads: 0
  extraArgs: []

resources:
  requests:
    cpu: "500m"
    memory: "1Gi"
  limits:
    cpu: "2"
    memory: "3Gi"

service:
  type: ClusterIP
  port: 8000

ingress:
  enabled: true
  className: nginx
  host: llm.localtest.me

monitoring:
  serviceMonitor:
    enabled: true
    interval: "15s"
    labels:
      release: kube-prometheus-stack

modelCache:
  sizeLimit: "10Gi"