summaryrefslogtreecommitdiff
path: root/terraform/modules/observability
diff options
context:
space:
mode:
authorYour Name <you@example.com>2026-04-26 21:02:47 +0800
committerYour Name <you@example.com>2026-04-26 21:02:47 +0800
commitd3e770254de0bb301815ca87257c8b1a357d06c4 (patch)
tree358c814be2a06b9e2009905f14938243286b8d82 /terraform/modules/observability
Diffstat (limited to 'terraform/modules/observability')
-rw-r--r--terraform/modules/observability/main.tf156
-rw-r--r--terraform/modules/observability/outputs.tf11
-rw-r--r--terraform/modules/observability/variables.tf27
3 files changed, 194 insertions, 0 deletions
diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf
new file mode 100644
index 0000000..2f88f2e
--- /dev/null
+++ b/terraform/modules/observability/main.tf
@@ -0,0 +1,156 @@
# Namespace for the monitoring stack. The kube-prometheus-stack and
# prometheus-adapter releases in this module are installed into it; its name
# is supplied by var.namespace.
resource "kubernetes_namespace_v1" "monitoring" {
  metadata {
    name = var.namespace
  }
}
6
# Dedicated namespace for the ingress-nginx controller release. The name is
# fixed to "ingress-nginx" rather than being driven by a variable.
resource "kubernetes_namespace_v1" "ingress" {
  metadata {
    name = "ingress-nginx"
  }
}
12
# Installs the ingress-nginx controller chart into the ingress namespace.
# The controller listens on host ports 80/443 and is pinned to nodes labelled
# ingress-ready=true.
resource "helm_release" "ingress_nginx" {
  name      = "ingress-nginx"
  namespace = kubernetes_namespace_v1.ingress.metadata[0].name

  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = var.ingress_nginx_version

  # Wait for the release to become ready, giving up after 300 seconds.
  wait    = true
  timeout = 300

  values = [
    yamlencode({
      controller = {
        # Admission webhooks are turned off: speeds up kind cluster installs.
        admissionWebhooks = {
          enabled = false
        }

        # Pin nginx to a fixed number of workers. With "auto" (one per CPU)
        # nginx tries to start 14 of them; under CPU pressure from vLLM
        # cold-starts that sometimes hits pthread EAGAIN and the workers die
        # without being respawned.
        config = {
          "worker-processes" = "4"
        }

        # Bind the controller directly to the node's ports 80 and 443.
        hostPort = {
          enabled = true
          ports = {
            http  = 80
            https = 443
          }
        }

        # Only schedule onto nodes carrying the ingress-ready=true label.
        nodeSelector = {
          "ingress-ready" = "true"
        }

        publishService = {
          enabled = false
        }

        service = {
          type = "NodePort"
        }

        # Tolerate the NoSchedule taints keyed on the control-plane / master
        # node roles so the controller may run on those nodes.
        tolerations = [
          {
            key      = "node-role.kubernetes.io/control-plane"
            operator = "Equal"
            effect   = "NoSchedule"
          },
          {
            key      = "node-role.kubernetes.io/master"
            operator = "Equal"
            effect   = "NoSchedule"
          },
        ]
      }
    }),
  ]
}
46
# Installs the kube-prometheus-stack chart (Prometheus + Grafana, with
# Alertmanager disabled) into the monitoring namespace and exposes both UIs
# through the "nginx" ingress class.
resource "helm_release" "kps" {
  name      = "kube-prometheus-stack"
  namespace = kubernetes_namespace_v1.monitoring.metadata[0].name

  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = var.kps_version

  # Wait for the release to become ready, giving up after 600 seconds.
  wait    = true
  timeout = 600

  values = [
    yamlencode({
      # Short name prefix for the chart's resources. Other parts of this
      # module address Prometheus as "kps-prometheus", which relies on this
      # override.
      fullnameOverride = "kps"

      alertmanager = {
        enabled = false
      }

      grafana = {
        adminPassword = var.grafana_admin_password

        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["grafana.localtest.me"]
        }

        service = {
          type = "ClusterIP"
        }

        # Dashboard sidecar: load dashboards labelled grafana_dashboard=1,
        # searching all namespaces.
        sidecar = {
          dashboards = {
            enabled         = true
            label           = "grafana_dashboard"
            labelValue      = "1"
            searchNamespace = "ALL"
          }
        }
      }

      prometheus = {
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["prom.localtest.me"]
        }

        prometheusSpec = {
          # Turning the *SelectorNilUsesHelmValues flags off stops the chart
          # from injecting its default release-label selector, so Prometheus
          # picks up ServiceMonitors, PodMonitors and rules regardless of
          # their `release` label (see the chart's values.yaml).
          serviceMonitorSelectorNilUsesHelmValues = false
          podMonitorSelectorNilUsesHelmValues     = false
          ruleSelectorNilUsesHelmValues           = false

          retention = "2d"

          # CPU is requested but deliberately left without a limit; only
          # memory is capped.
          resources = {
            requests = {
              cpu    = "100m"
              memory = "400Mi"
            }
            limits = {
              memory = "1Gi"
            }
          }
        }
      }
    }),
  ]
}
99
# Installs prometheus-adapter into the monitoring namespace and configures it
# with custom-metric rules for two vLLM series, so they can serve as
# autoscaling signals. Installed only after the kube-prometheus-stack release.
resource "helm_release" "prometheus_adapter" {
  name      = "prometheus-adapter"
  namespace = kubernetes_namespace_v1.monitoring.metadata[0].name

  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus-adapter"
  version    = var.prometheus_adapter_version

  # Wait for the release to become ready, giving up after 300 seconds.
  wait    = true
  timeout = 300

  values = [
    yamlencode({
      # Prometheus endpoint the adapter queries (the kps-prometheus Service
      # in the monitoring namespace, port 9090).
      prometheus = {
        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
        port = 9090
      }

      rules = {
        # Disable the chart's default rule set; only the rules below apply.
        default = false

        # One identical rule per vLLM series, each exposed under its
        # original name:
        #   - vllm:num_requests_running: in-flight request count per pod,
        #     the basis for autoscaling (vLLM exposes it as a gauge per
        #     model-engine).
        #   - vllm:num_requests_waiting: waiting (queued) requests per pod,
        #     an alternative scale signal.
        custom = [
          for series in ["vllm:num_requests_running", "vllm:num_requests_waiting"] : {
            seriesQuery = "${series}{namespace!=\"\",pod!=\"\"}"

            # Map the series' namespace/pod labels onto the corresponding
            # Kubernetes resources.
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }

            name = {
              matches = "^${series}$"
              as      = series
            }

            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          }
        ]
      }
    }),
  ]

  depends_on = [helm_release.kps]
}
156
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf
new file mode 100644
index 0000000..06a507d
--- /dev/null
+++ b/terraform/modules/observability/outputs.tf
@@ -0,0 +1,11 @@
# Exposes the monitoring namespace name. It is read from the namespace
# resource rather than from var.namespace, so consumers implicitly depend on
# the namespace having been created.
output "namespace" {
  description = "Name of the Kubernetes namespace created for the monitoring stack."
  value       = kubernetes_namespace_v1.monitoring.metadata[0].name
}
4
# NOTE(review): the "kube-prometheus-stack-grafana" host part is hard-coded;
# it presumably tracks the Helm release name "kube-prometheus-stack" — confirm
# and keep in sync if the release is ever renamed.
output "grafana_service" {
  description = "Cluster-internal DNS name of the Grafana Service, in the form <service>.<namespace>.svc.cluster.local."
  value       = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
}
8
# NOTE(review): the "kps-prometheus" host part is hard-coded; it presumably
# follows from fullnameOverride = "kps" on the kube-prometheus-stack release —
# confirm and keep in sync if that override changes.
output "prometheus_service" {
  description = "Cluster-internal DNS name of the Prometheus Service, in the form <service>.<namespace>.svc.cluster.local."
  value       = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf
new file mode 100644
index 0000000..6aeaca3
--- /dev/null
+++ b/terraform/modules/observability/variables.tf
@@ -0,0 +1,27 @@
variable "namespace" {
  type        = string
  default     = "monitoring"
  description = "Name of the Kubernetes namespace to create for the monitoring stack."

  # Kubernetes namespace names must be RFC 1123 DNS labels. Checking here
  # surfaces a bad name at plan time instead of as an API error during apply.
  validation {
    condition     = length(var.namespace) <= 63 && can(regex("^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", var.namespace))
    error_message = "The namespace must be a valid DNS label: at most 63 characters, lowercase alphanumerics or '-', starting and ending with an alphanumeric."
  }
}
5
# Pins the chart version used by the kube-prometheus-stack Helm release.
variable "kps_version" {
  description = "kube-prometheus-stack chart version."
  type        = string
  default     = "65.5.1"
}
11
# Pins the chart version used by the ingress-nginx Helm release.
variable "ingress_nginx_version" {
  description = "ingress-nginx chart version."
  type        = string
  default     = "4.11.3"
}
17
# Marked sensitive so Terraform redacts the value in plan/apply output.
variable "grafana_admin_password" {
  type        = string
  default     = "admin"
  sensitive   = true
  description = "Grafana admin password passed to the kube-prometheus-stack chart. The default is a well-known value; override it for anything other than a throwaway local cluster."
}
23
# Pins the chart version used by the prometheus-adapter Helm release.
variable "prometheus_adapter_version" {
  type        = string
  default     = "4.11.0"
  description = "prometheus-adapter chart version."
}