diff options
Diffstat (limited to 'terraform/modules/observability')
| -rw-r--r-- | terraform/modules/observability/main.tf | 156 | ||||
| -rw-r--r-- | terraform/modules/observability/outputs.tf | 11 | ||||
| -rw-r--r-- | terraform/modules/observability/variables.tf | 27 |
3 files changed, 194 insertions, 0 deletions
diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf new file mode 100644 index 0000000..2f88f2e --- /dev/null +++ b/terraform/modules/observability/main.tf | |||
| @@ -0,0 +1,156 @@ | |||
# Namespace that hosts the monitoring stack (kube-prometheus-stack and
# prometheus-adapter). The name is configurable via var.namespace.
resource "kubernetes_namespace_v1" "monitoring" {
  metadata {
    name = var.namespace
  }
}
| 6 | |||
# Dedicated namespace for the ingress-nginx controller. Hard-coded to the
# upstream-conventional "ingress-nginx" name (unlike the monitoring
# namespace, this one is not parameterised).
resource "kubernetes_namespace_v1" "ingress" {
  metadata {
    name = "ingress-nginx"
  }
}
| 12 | |||
# ingress-nginx controller, installed into its own namespace. Tuned for a
# local/kind-style cluster: hostPort binding on 80/443 instead of a cloud
# LoadBalancer, and scheduling restricted to nodes labelled
# ingress-ready=true.
resource "helm_release" "ingress_nginx" {
  name       = "ingress-nginx"
  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = var.ingress_nginx_version
  namespace  = kubernetes_namespace_v1.ingress.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      controller = {
        # Bind directly to the node's ports so HTTP(S) traffic reaches the
        # controller without a LoadBalancer service.
        hostPort = { enabled = true, ports = { http = 80, https = 443 } }
        service  = { type = "NodePort" }
        nodeSelector = {
          "ingress-ready" = "true"
        }
        # Tolerate control-plane taints so the controller can run on
        # single-node clusters.
        # NOTE(review): operator "Equal" with no value only matches taints
        # whose value is empty — true for kind's default control-plane
        # taint, but "Exists" would be the more permissive choice. Confirm
        # this is intentional.
        tolerations = [
          { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" },
          { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" },
        ]
        # No cloud LB exists to publish an address from.
        publishService    = { enabled = false }
        admissionWebhooks = { enabled = false } # speeds up kind cluster installs
        # Cap worker_processes so nginx doesn't try to spawn 14 threads under
        # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it
        # sometimes hits pthread EAGAIN and workers die without respawn.
        config = {
          "worker-processes" = "4"
        }
      }
    }),
  ]
}
| 46 | |||
# kube-prometheus-stack: Prometheus Operator + Prometheus + Grafana.
# fullnameOverride = "kps" shortens generated resource names to "kps-*"
# (e.g. the kps-prometheus service that prometheus-adapter points at
# below). NOTE(review): Helm does not propagate a parent chart's
# fullnameOverride to subcharts, so Grafana's resources keep the
# release-derived "kube-prometheus-stack-grafana" name — confirm against
# the deployed services.
resource "helm_release" "kps" {
  name       = "kube-prometheus-stack"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = var.kps_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 600

  values = [
    yamlencode({
      fullnameOverride = "kps"
      prometheus = {
        prometheusSpec = {
          # Let Prometheus pick up ServiceMonitors from any namespace matching
          # the release=kube-prometheus-stack label (the chart's default).
          # Setting the *NilUsesHelmValues flags to false removes the
          # release-label restriction on monitor/rule discovery.
          serviceMonitorSelectorNilUsesHelmValues = false
          podMonitorSelectorNilUsesHelmValues     = false
          ruleSelectorNilUsesHelmValues           = false
          # Short retention: this stack exists for live autoscaling signals,
          # not long-term storage.
          retention = "2d"
          resources = {
            requests = { cpu = "100m", memory = "400Mi" }
            limits   = { memory = "1Gi" }
          }
        }
        # Expose the Prometheus UI through ingress-nginx; *.localtest.me
        # resolves to 127.0.0.1 for local access.
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["prom.localtest.me"]
        }
      }
      alertmanager = { enabled = false }
      grafana = {
        adminPassword = var.grafana_admin_password
        # Sidecar auto-loads dashboards from any ConfigMap (any namespace)
        # labelled grafana_dashboard=1.
        sidecar = {
          dashboards = {
            enabled         = true
            label           = "grafana_dashboard"
            labelValue      = "1"
            searchNamespace = "ALL"
          }
        }
        service = { type = "ClusterIP" }
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["grafana.localtest.me"]
        }
      }
    }),
  ]
}
| 99 | |||
# prometheus-adapter: exposes selected Prometheus series through the
# Kubernetes custom-metrics API so HPAs can scale on vLLM queue depth.
# Depends on the kps release both for ordering and for the kps-prometheus
# service name (derived from kps's fullnameOverride).
resource "helm_release" "prometheus_adapter" {
  name       = "prometheus-adapter"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus-adapter"
  version    = var.prometheus_adapter_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      prometheus = {
        # Service name "kps-prometheus" comes from the kps release's
        # fullnameOverride; changing that override breaks this URL.
        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
        port = 9090
      }
      rules = {
        # Disable the adapter's default rule set; only the two vLLM series
        # below are exported.
        default = false
        custom = [
          {
            # In-flight request count per pod; basis for autoscaling.
            # vLLM exposes this as a gauge per model-engine.
            seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
            # Map the namespace/pod labels onto Kubernetes resources so the
            # metric is addressable per pod via the custom-metrics API.
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }
            # Identity rename: expose under the original series name.
            name = {
              matches = "^vllm:num_requests_running$"
              as      = "vllm:num_requests_running"
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },
          {
            # Waiting (queued) requests per pod — an alternative scale signal.
            seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}"
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }
            name = {
              matches = "^vllm:num_requests_waiting$"
              as      = "vllm:num_requests_waiting"
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },
        ]
      }
    }),
  ]

  depends_on = [helm_release.kps]
}
| 156 | |||
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf new file mode 100644 index 0000000..06a507d --- /dev/null +++ b/terraform/modules/observability/outputs.tf | |||
| @@ -0,0 +1,11 @@ | |||
# Namespace the monitoring stack is installed into.
output "namespace" {
  value = kubernetes_namespace_v1.monitoring.metadata[0].name
}
| 4 | |||
# Cluster-internal DNS name of the Grafana service.
# NOTE(review): the long "kube-prometheus-stack-grafana" prefix looks
# inconsistent with the chart's fullnameOverride = "kps", but Helm does not
# propagate a parent fullnameOverride to subcharts — Grafana derives its
# name from the release name. Confirm against the deployed service if the
# release name ever changes.
output "grafana_service" {
  value = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
}
| 8 | |||
# Cluster-internal DNS name of the Prometheus service; the "kps-" prefix
# comes from the kube-prometheus-stack release's fullnameOverride.
output "prometheus_service" {
  value = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf new file mode 100644 index 0000000..6aeaca3 --- /dev/null +++ b/terraform/modules/observability/variables.tf | |||
| @@ -0,0 +1,27 @@ | |||
# Adds the description missing here but present on the sibling version
# variables, keeping the module's variable docs consistent.
variable "namespace" {
  type        = string
  default     = "monitoring"
  description = "Namespace the monitoring stack is installed into."
}
| 5 | |||
# Chart version is pinned so installs are reproducible; bump deliberately.
variable "kps_version" {
  type        = string
  default     = "65.5.1"
  description = "kube-prometheus-stack chart version."
}
| 11 | |||
# Chart version is pinned so installs are reproducible; bump deliberately.
variable "ingress_nginx_version" {
  type        = string
  default     = "4.11.3"
  description = "ingress-nginx chart version."
}
| 17 | |||
# Adds a description (missing relative to the sibling version variables)
# that makes the weak default explicit. The default value itself is kept
# for backward compatibility with existing callers.
variable "grafana_admin_password" {
  type        = string
  default     = "admin"
  sensitive   = true
  description = "Grafana admin password. The \"admin\" default is for local development only; override it for any shared environment."
}
| 23 | |||
# Adds the description missing here but present on kps_version and
# ingress_nginx_version, keeping the module's variable docs consistent.
variable "prometheus_adapter_version" {
  type        = string
  default     = "4.11.0"
  description = "prometheus-adapter chart version."
}
