1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
# Namespace that the monitoring Helm releases in this file are installed into.
# The name is not fixed here; it is taken from var.namespace.
resource "kubernetes_namespace_v1" "monitoring" {
  metadata {
    name = var.namespace
  }
}
# Dedicated namespace for the ingress-nginx Helm release.
resource "kubernetes_namespace_v1" "ingress" {
  metadata {
    name = "ingress-nginx"
  }
}
# ingress-nginx controller release, tuned for a kind cluster: the controller
# binds host ports 80/443 and is pinned to nodes labelled ingress-ready=true.
resource "helm_release" "ingress_nginx" {
  name       = "ingress-nginx"
  namespace  = kubernetes_namespace_v1.ingress.metadata[0].name
  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = var.ingress_nginx_version

  # Wait for the release's resources to become ready, up to 300 seconds.
  wait    = true
  timeout = 300

  values = [
    yamlencode({
      controller = {
        # --- Scheduling ---------------------------------------------------
        # Run only on nodes carrying the ingress-ready=true label, and
        # tolerate the NoSchedule taints on control-plane / master nodes.
        nodeSelector = {
          "ingress-ready" = "true"
        }
        tolerations = [
          {
            key      = "node-role.kubernetes.io/control-plane"
            operator = "Equal"
            effect   = "NoSchedule"
          },
          {
            key      = "node-role.kubernetes.io/master"
            operator = "Equal"
            effect   = "NoSchedule"
          },
        ]

        # --- Networking ---------------------------------------------------
        # Expose HTTP/HTTPS through host ports on the node; the Service is
        # a NodePort and its address is not published on Ingress objects.
        hostPort = {
          enabled = true
          ports = {
            http  = 80
            https = 443
          }
        }
        service = {
          type = "NodePort"
        }
        publishService = {
          enabled = false
        }

        # Leaving the admission webhook out speeds up installs on kind.
        admissionWebhooks = {
          enabled = false
        }

        # --- nginx tuning -------------------------------------------------
        # Pin the number of nginx worker processes instead of the "auto"
        # setting (one worker per CPU). Under CPU pressure from vLLM
        # cold-starts, "auto" sometimes ran into pthread EAGAIN and the
        # workers died without being respawned.
        config = {
          "worker-processes" = "4"
        }
      }
    }),
  ]
}
# kube-prometheus-stack release: Prometheus and Grafana, each exposed through
# an nginx Ingress, with Alertmanager switched off.
resource "helm_release" "kps" {
  name       = "kube-prometheus-stack"
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = var.kps_version

  # Wait for the release's resources to become ready, up to 600 seconds.
  wait    = true
  timeout = 600

  values = [
    yamlencode({
      # Short prefix for the chart's generated resource names.
      fullnameOverride = "kps"

      alertmanager = {
        enabled = false
      }

      prometheus = {
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["prom.localtest.me"]
        }

        prometheusSpec = {
          # With these flags set to false the chart no longer restricts the
          # selectors to objects labelled with this Helm release, so
          # Prometheus picks up ServiceMonitors, PodMonitors and rules
          # regardless of their release label.
          serviceMonitorSelectorNilUsesHelmValues = false
          podMonitorSelectorNilUsesHelmValues     = false
          ruleSelectorNilUsesHelmValues           = false

          # Keep two days of metrics.
          retention = "2d"

          # CPU is requested but not limited; memory is capped at 1Gi.
          resources = {
            requests = {
              cpu    = "100m"
              memory = "400Mi"
            }
            limits = {
              memory = "1Gi"
            }
          }
        }
      }

      grafana = {
        # NOTE(review): the password ends up inside the rendered values
        # document; presumably var.grafana_admin_password is declared
        # sensitive - confirm, and consider passing it via set_sensitive.
        adminPassword = var.grafana_admin_password

        service = {
          type = "ClusterIP"
        }

        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["grafana.localtest.me"]
        }

        # Dashboard sidecar: load dashboards from resources labelled
        # grafana_dashboard=1, searching every namespace.
        sidecar = {
          dashboards = {
            enabled         = true
            label           = "grafana_dashboard"
            labelValue      = "1"
            searchNamespace = "ALL"
          }
        }
      }
    }),
  ]
}
# prometheus-adapter release: publishes two vLLM per-pod gauges through the
# custom metrics API so they can be used as autoscaling signals.
resource "helm_release" "prometheus_adapter" {
  name       = "prometheus-adapter"
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus-adapter"
  version    = var.prometheus_adapter_version

  # Wait for the release's resources to become ready, up to 300 seconds.
  wait    = true
  timeout = 300

  # Install only after the kube-prometheus-stack release, which provides the
  # Prometheus instance queried below.
  depends_on = [helm_release.kps]

  values = [
    yamlencode({
      # In-cluster Prometheus endpoint. The "kps-prometheus" service name
      # presumably follows from fullnameOverride = "kps" on the
      # kube-prometheus-stack release - verify if that override changes.
      prometheus = {
        port = 9090
        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
      }

      rules = {
        # Turn off the chart's default rule set; only the custom rules
        # listed here are served.
        default = false

        custom = [
          # Rule 1: requests currently running, per pod. This is the basis
          # for autoscaling; vLLM exposes it as a gauge per model engine.
          {
            name = {
              matches = "^vllm:num_requests_running$"
              as      = "vllm:num_requests_running"
            }
            seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
            # Map the series' namespace/pod labels onto the Kubernetes
            # namespace and pod resources.
            resources = {
              overrides = {
                namespace = {
                  resource = "namespace"
                }
                pod = {
                  resource = "pod"
                }
              }
            }
            # Average the matched series, grouped by the requested resource.
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },

          # Rule 2: requests waiting in the queue, per pod. An alternative
          # scale signal to the running-request count.
          {
            name = {
              matches = "^vllm:num_requests_waiting$"
              as      = "vllm:num_requests_waiting"
            }
            seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}"
            resources = {
              overrides = {
                namespace = {
                  resource = "namespace"
                }
                pod = {
                  resource = "pod"
                }
              }
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },
        ]
      }
    }),
  ]
}
|