blob: 96c5c9ab13170d5fc3fa11a796dab8b29dea0c91 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
replicaCount: 1

image:
  # vLLM CPU-only image (no CUDA, works on AVX2+).
  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
  tag: latest
  # Optional. If set, used in place of `tag` to pin the image by content.
  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
  digest: ""
  pullPolicy: IfNotPresent

# vLLM pulls model weights from HuggingFace at first boot into the cache volume.
# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides).
model:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  alias: "Qwen2.5-0.5B-Instruct"
  maxModelLen: 2048
  dtype: "bfloat16"

server:
  port: 8000
  # OMP threads for the CPU backend; 0 = autodetect.
  ompThreads: 0
  extraArgs: []

resources:
  requests:
    cpu: "500m"
    memory: "1Gi"
  limits:
    cpu: "2"
    memory: "3Gi"

service:
  type: ClusterIP
  port: 8000

ingress:
  enabled: true
  className: nginx
  host: llm.localtest.me

monitoring:
  serviceMonitor:
    enabled: true
    interval: 15s
    labels:
      release: kube-prometheus-stack

modelCache:
  sizeLimit: 10Gi
|