blob: 12677b51370eb7c0086d6d84b2f6358dcb6bdd26 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
# Deployment for the vLLM inference server. The resource name and selector
# labels come from the chart's "llm-app" helper templates; everything tunable
# is read from .Values.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "llm-app.fullname" . }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      # Must stay identical to spec.selector.matchLabels above.
      labels: {{- include "llm-app.selectorLabels" . | nindent 8 }}
    spec:
      containers:
        - name: vllm-server
          # Image entrypoint is already `vllm serve`; args start with the model tag.
          # A digest, when set, takes precedence over the tag.
          image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          args:
            - {{ .Values.model.name | quote }}
            - "--host"
            - "0.0.0.0"
            - "--port"
            - {{ .Values.server.port | quote }}
            - "--served-model-name"
            - {{ .Values.model.alias | quote }}
            - "--max-model-len"
            - {{ .Values.model.maxModelLen | quote }}
            - "--dtype"
            - {{ .Values.model.dtype | quote }}
            # Optional extra CLI arguments, appended verbatim as list items.
            {{- with .Values.server.extraArgs }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          env:
            # Keep the Hugging Face cache on the "cache" volume mounted at /cache.
            - name: HF_HOME
              value: /cache/huggingface
            # Env values must be strings, hence the quotes around the number.
            - name: VLLM_CPU_KVCACHE_SPACE
              value: "2"
            # Only set OMP_NUM_THREADS when a positive thread count is configured.
            {{- if gt (int .Values.server.ompThreads) 0 }}
            - name: OMP_NUM_THREADS
              value: {{ .Values.server.ompThreads | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: {{ .Values.server.port }}
              protocol: TCP
          # vLLM CPU cold-start is ~2 min + HF download on first boot.
          # The startup probe owns that budget: 180 failures x 10s = up to
          # 30 min before the container is restarted. Liveness and readiness
          # probes do not run until the startup probe has succeeded, so they
          # need no initialDelaySeconds of their own.
          startupProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 180
          # Steady state: drop the pod from Service endpoints after ~30s of
          # consecutive failed health checks.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          # Steady state: restart the container after ~3 min of consecutive
          # failed health checks (6 x 30s).
          livenessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 6
          resources: {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            - name: cache
              mountPath: /cache
            - name: shm
              mountPath: /dev/shm
      volumes:
        # Scratch space for downloaded model files; an emptyDir lives as long
        # as the pod, so it survives container restarts but not rescheduling.
        - name: cache
          emptyDir:
            sizeLimit: {{ .Values.modelCache.sizeLimit }}
        # Memory-backed /dev/shm, capped at 1Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
|