summary | refs | log | tree | commit | diff
path: root/charts/llm-app/templates/deployment.yaml
diff options
context:
space:
mode:
Diffstat (limited to 'charts/llm-app/templates/deployment.yaml')
-rw-r--r-- charts/llm-app/templates/deployment.yaml 76
1 files changed, 76 insertions, 0 deletions
diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml
new file mode 100644
index 0000000..12677b5
--- /dev/null
+++ b/charts/llm-app/templates/deployment.yaml
@@ -0,0 +1,76 @@
# Deployment for the llm-app chart: a single "vllm-server" container that
# serves the model named in .Values.model over HTTP on .Values.server.port.
# Model files are cached under /cache (pod-scoped emptyDir), so the cache
# lasts only as long as the pod does.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "llm-app.fullname" . }}
  # Same label set as the selector so the Deployment object itself can be
  # found with the label query that matches its pods.
  labels: {{- include "llm-app.selectorLabels" . | nindent 4 }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels: {{- include "llm-app.selectorLabels" . | nindent 8 }}
    spec:
      containers:
        - name: vllm-server
          # Image entrypoint is already `vllm serve`; args start with the model tag.
          # A digest, when set, takes precedence over the tag.
          image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          args:
            - {{ .Values.model.name | quote }}
            - "--host"
            - "0.0.0.0"
            - "--port"
            - {{ .Values.server.port | quote }}
            - "--served-model-name"
            - {{ .Values.model.alias | quote }}
            - "--max-model-len"
            - {{ .Values.model.maxModelLen | quote }}
            - "--dtype"
            - {{ .Values.model.dtype | quote }}
            # Optional extra CLI flags, appended verbatim as list items.
            {{- with .Values.server.extraArgs }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          env:
            # Point the Hugging Face cache at the mounted cache volume.
            - name: HF_HOME
              value: /cache/huggingface
            - name: VLLM_CPU_KVCACHE_SPACE
              value: "2"
            # Only exported when server.ompThreads is a positive integer.
            {{- if gt (int .Values.server.ompThreads) 0 }}
            - name: OMP_NUM_THREADS
              value: {{ .Values.server.ompThreads | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: {{ .Values.server.port }}
              protocol: TCP
          # vLLM CPU cold-start is ~2 min + HF download on first boot.
          # The startup probe owns that budget: 60s delay + 180 * 10s (~31 min)
          # before the kubelet restarts the container. Readiness and liveness
          # checks are held off until it succeeds, so they can use tight
          # thresholds without killing a slow first boot.
          startupProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 180
          # After startup: remove the pod from Service endpoints after ~30s of
          # failed health checks.
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          # After startup: restart the container after 6 * 30s (~3 min) of
          # failed health checks.
          livenessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 6
          resources: {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            - name: cache
              mountPath: /cache
            - name: shm
              mountPath: /dev/shm
      volumes:
        # Scratch space for downloaded model files (backs HF_HOME above).
        - name: cache
          emptyDir:
            sizeLimit: {{ .Values.modelCache.sizeLimit }}
        # Memory-backed /dev/shm, capped at 1Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi