blob: 3a7d8f728df3c6dfce2851001351816ad5cea6dc (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
# Required input (no default): the Helm release name.
variable "release_name" {
  description = "Helm release name."
  type        = string
}
# Required input (no default): the Kubernetes namespace targeted by the deployment.
variable "namespace" {
  description = "Kubernetes namespace to deploy into."
  type        = string
}
# Required input (no default): filesystem path of the local llm-app chart.
variable "chart_path" {
  description = "Path to the local llm-app chart."
  type        = string
}
# Replica count; defaults to a single replica.
# NOTE(review): presumably the workload replica count rendered by the chart —
# confirm how it interacts with var.hpa when autoscaling is enabled.
variable "replicas" {
  type        = number
  default     = 1
  description = "Desired number of replicas. Must be a non-negative whole number."

  # `number` alone would accept values such as -1 or 1.5; reject them at plan
  # time instead of letting them surface later as an apply-time failure.
  validation {
    condition     = var.replicas >= 0 && floor(var.replicas) == var.replicas
    error_message = "The replicas value must be a whole number greater than or equal to 0."
  }
}
# Required input (no default): the model identifier handed to vLLM.
variable "model_name" {
  description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
  type        = string
}
# Required input (no default): the client-facing model name.
variable "model_alias" {
  description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
  type        = string
}
# Optional numeric setting; falls back to 2048 when the caller omits it.
# NOTE(review): presumably forwarded to vLLM as its maximum model length —
# confirm against the chart values before documenting units or limits.
variable "max_model_len" {
  default = 2048
  type    = number
}
# Optional string setting; falls back to "bfloat16" when the caller omits it.
# NOTE(review): presumably forwarded to vLLM as the model data type — confirm
# against the chart values.
variable "dtype" {
  default = "bfloat16"
  type    = string
}
# Thread count exported as OMP_NUM_THREADS; the default of 0 means autodetect.
variable "omp_threads" {
  type        = number
  default     = 0
  description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."

  # 0 is the documented autodetect sentinel, so only zero or a positive whole
  # number is meaningful; negative or fractional counts are rejected up front.
  validation {
    condition     = var.omp_threads >= 0 && floor(var.omp_threads) == var.omp_threads
    error_message = "The omp_threads value must be a whole number: 0 for autodetect, or a positive thread count."
  }
}
# Optional list of additional command-line arguments; empty by default.
variable "extra_args" {
  description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
  default     = []
  type        = list(string)
}
# Required input (no default): CPU and memory strings for both the
# `requests` and the `limits` section.
variable "resources" {
  type = object({
    requests = object({
      cpu    = string
      memory = string
    })
    limits = object({
      cpu    = string
      memory = string
    })
  })
}
# Required string input (no default).
# NOTE(review): presumably the hostname used by the chart's Ingress rule —
# confirm against the chart values.
variable "ingress_host" {
  type = string
}
# Optional string input; falls back to "nginx" when the caller omits it.
# NOTE(review): presumably the Ingress class name set by the chart — confirm.
variable "ingress_class" {
  default = "nginx"
  type    = string
}
# Optional string input; defaults to a public ECR repository path.
# NOTE(review): presumably combined with image_tag / image_digest to form the
# container image reference — confirm in the chart.
variable "image_repository" {
  default = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
  type    = string
}
# Optional image tag; defaults to "latest". Per its description it is only
# consulted when image_digest is empty.
variable "image_tag" {
  description = "Used only when image_digest is empty."
  default     = "latest"
  type        = string
}
# Optional content-addressable image digest; the empty default means
# "not set". Per its description, a digest takes precedence over image_tag.
variable "image_digest" {
  type        = string
  default     = ""
  description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."

  # Catch malformed digests (missing "sha256:" prefix, wrong length, uppercase
  # hex) at plan time. Empty and null stay accepted so existing callers that
  # leave the digest unset keep working. `can()` absorbs the regex error for
  # non-matching input, so the condition never raises.
  validation {
    condition     = var.image_digest == null || var.image_digest == "" || can(regex("^sha256:[a-f0-9]{64}$", var.image_digest))
    error_message = "The image_digest value must be empty or a digest of the form sha256:<64 lowercase hex characters>."
  }
}
# Optional release label value; defaults to "kube-prometheus-stack".
variable "service_monitor_release_label" {
  description = "Must match the release label the Prometheus Operator selects on."
  default     = "kube-prometheus-stack"
  type        = string
}
# Optional string input; falls back to "10Gi" when the caller omits it.
# NOTE(review): presumably the storage size of the model cache volume,
# expressed as a Kubernetes quantity — confirm against the chart values.
variable "model_cache_size" {
  default = "10Gi"
  type    = string
}
# Autoscaling settings bundled as one object. Autoscaling is off by default;
# the default bounds are 1..3 replicas, scaling on the
# "vllm:num_requests_running" metric with a target average value of "500m".
# NOTE(review): how the chart maps metric_name / target_average_value onto the
# HorizontalPodAutoscaler spec is not visible here — confirm in the chart.
variable "hpa" {
  description = "Autoscaling (HPA) settings: enable flag, replica bounds, and the metric name and target average value to scale on."

  type = object({
    enabled              = bool
    min_replicas         = number
    max_replicas         = number
    metric_name          = string
    target_average_value = string
  })

  default = {
    enabled              = false
    min_replicas         = 1
    max_replicas         = 3
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }

  # Only enforced when autoscaling is enabled, so callers that disable it and
  # pass placeholder bounds are unaffected. Rejects inverted, negative, or
  # fractional replica bounds before they reach the cluster.
  validation {
    condition = !var.hpa.enabled || (
      var.hpa.min_replicas >= 0 &&
      var.hpa.max_replicas >= var.hpa.min_replicas &&
      floor(var.hpa.min_replicas) == var.hpa.min_replicas &&
      floor(var.hpa.max_replicas) == var.hpa.max_replicas
    )
    error_message = "When hpa.enabled is true, min_replicas and max_replicas must be non-negative whole numbers with max_replicas >= min_replicas."
  }
}
|