---
# Deployment for a CPU vLLM OpenAI-compatible server.
# Model weights are pulled from Hugging Face on first boot into an emptyDir
# cache; /dev/shm is backed by memory for vLLM's shared-memory usage.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "llm-app.fullname" . }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      {{- include "llm-app.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      labels:
        {{- include "llm-app.selectorLabels" . | nindent 8 }}
    spec:
      containers:
        - name: vllm-server
          # Image entrypoint is already `vllm serve`; args start with the model tag.
          # Prefer a pinned digest over a mutable tag when one is provided.
          image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
          imagePullPolicy: "{{ .Values.image.pullPolicy }}"
          args:
            - {{ .Values.model.name | quote }}
            - "--host"
            - "0.0.0.0"
            - "--port"
            - {{ .Values.server.port | quote }}
            - "--served-model-name"
            - {{ .Values.model.alias | quote }}
            - "--max-model-len"
            - {{ .Values.model.maxModelLen | quote }}
            - "--dtype"
            - {{ .Values.model.dtype | quote }}
            {{- with .Values.server.extraArgs }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          env:
            # Redirect the HF model cache onto the emptyDir volume below.
            - name: HF_HOME
              value: /cache/huggingface
            # GiB of CPU RAM reserved for the KV cache (vLLM CPU backend).
            - name: VLLM_CPU_KVCACHE_SPACE
              value: "2"
            {{- if gt (int .Values.server.ompThreads) 0 }}
            - name: OMP_NUM_THREADS
              value: {{ .Values.server.ompThreads | quote }}
            {{- end }}
          ports:
            - name: http
              containerPort: {{ .Values.server.port }}
              protocol: TCP
          readinessProbe:
            httpGet:
              path: /health
              port: http
            # vLLM CPU cold-start is ~2 min + HF download on first boot.
            # NOTE(review): the 30-min budget (180 * 10s) is a cold-start
            # workaround; a startupProbe would express this more idiomatically
            # and let readiness use a tight threshold — confirm before changing.
            initialDelaySeconds: 60
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 180
          livenessProbe:
            httpGet:
              path: /health
              port: http
            # Delayed well past the worst-case cold start so the kubelet does
            # not kill the pod mid-download.
            initialDelaySeconds: 600
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 6
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            - name: cache
              mountPath: /cache
            - name: shm
              mountPath: /dev/shm
      volumes:
        # Scratch space for downloaded model weights (lost on pod restart).
        - name: cache
          emptyDir:
            sizeLimit: "{{ .Values.modelCache.sizeLimit }}"
        # Memory-backed /dev/shm; the container runtime default (64Mi) is too
        # small for vLLM's shared-memory transfers.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi