blob: 96c5c9ab13170d5fc3fa11a796dab8b29dea0c91 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
replicaCount: 1

image:
  # vLLM CPU-only image (no CUDA, works on AVX2+).
  repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
  tag: latest
  # Optional. If set, used in place of `tag` to pin the image by content.
  # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
  digest: ""
  pullPolicy: IfNotPresent

# vLLM pulls model weights from HuggingFace at first boot into the cache volume.
# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides).
model:
  name: "Qwen/Qwen2.5-0.5B-Instruct"
  alias: "Qwen2.5-0.5B-Instruct"
  maxModelLen: 2048
  dtype: "bfloat16"

server:
  port: 8000
  # OMP threads for the CPU backend; 0 = autodetect.
  ompThreads: 0
  extraArgs: []

resources:
  requests:
    cpu: "500m"
    memory: "1Gi"
  limits:
    cpu: "2"
    memory: "3Gi"

service:
  type: ClusterIP
  port: 8000

ingress:
  enabled: true
  className: nginx
  host: llm.localtest.me

monitoring:
  serviceMonitor:
    enabled: true
    interval: 15s
    labels:
      release: kube-prometheus-stack

modelCache:
  sizeLimit: 10Gi
|