# Namespace for kube-prometheus-stack and prometheus-adapter (name supplied by var).
resource "kubernetes_namespace_v1" "monitoring" {
  metadata {
    name = var.namespace
  }
}

# Namespace for the ingress-nginx controller.
resource "kubernetes_namespace_v1" "ingress" {
  metadata {
    name = "ingress-nginx"
  }
}

# ingress-nginx configured for a local kind-style cluster: the controller binds
# hostPorts 80/443 on a node labelled ingress-ready=true and tolerates the
# control-plane taints, instead of relying on a cloud LoadBalancer.
resource "helm_release" "ingress_nginx" {
  name       = "ingress-nginx"
  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = var.ingress_nginx_version
  namespace  = kubernetes_namespace_v1.ingress.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      controller = {
        hostPort = {
          enabled = true
          ports = {
            http  = 80
            https = 443
          }
        }
        service      = { type = "NodePort" }
        nodeSelector = { "ingress-ready" = "true" }
        tolerations = [
          { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" },
          { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" },
        ]
        # No cloud LB to publish; status is derived locally.
        publishService    = { enabled = false }
        admissionWebhooks = { enabled = false } # speeds up kind cluster installs

        # Cap worker_processes so nginx doesn't try to spawn 14 threads under
        # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it
        # sometimes hits pthread EAGAIN and workers die without respawn.
        config = { "worker-processes" = "4" }
      }
    }),
  ]
}

# kube-prometheus-stack: Prometheus + Grafana (Alertmanager disabled).
# fullnameOverride = "kps" keeps service names short and predictable
# (e.g. kps-prometheus), which prometheus_adapter below depends on.
resource "helm_release" "kps" {
  name       = "kube-prometheus-stack"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = var.kps_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 600

  values = [
    yamlencode({
      fullnameOverride = "kps"

      prometheus = {
        prometheusSpec = {
          # Let Prometheus pick up ServiceMonitors from any namespace matching
          # the release=kube-prometheus-stack label (the chart's default).
          serviceMonitorSelectorNilUsesHelmValues = false
          podMonitorSelectorNilUsesHelmValues     = false
          ruleSelectorNilUsesHelmValues           = false
          retention                               = "2d"
          resources = {
            requests = { cpu = "100m", memory = "400Mi" }
            limits   = { memory = "1Gi" }
          }
        }
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["prom.localtest.me"]
        }
      }

      alertmanager = { enabled = false }

      grafana = {
        adminPassword = var.grafana_admin_password
        # Sidecar watches all namespaces for ConfigMaps labelled
        # grafana_dashboard=1 and loads them as dashboards.
        sidecar = {
          dashboards = {
            enabled         = true
            label           = "grafana_dashboard"
            labelValue      = "1"
            searchNamespace = "ALL"
          }
        }
        service = { type = "ClusterIP" }
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["grafana.localtest.me"]
        }
      }
    }),
  ]
}

# prometheus-adapter exposes the vLLM gauges below through the custom metrics
# API so HPAs can scale on them. Points at the kps-prometheus service created
# by the release above (hence depends_on).
resource "helm_release" "prometheus_adapter" {
  name       = "prometheus-adapter"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus-adapter"
  version    = var.prometheus_adapter_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      prometheus = {
        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
        port = 9090
      }
      # NOTE(review): the exposed names keep the "vllm:" prefix; some tooling
      # is picky about ':' in custom-metric names when referenced from an HPA
      # spec — confirm against the HPA manifests that consume these.
      rules = {
        default = false
        custom = [
          {
            # In-flight request count per pod; basis for autoscaling.
            # vLLM exposes this as a gauge per model-engine.
            seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }
            name = {
              matches = "^vllm:num_requests_running$"
              as      = "vllm:num_requests_running"
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },
          {
            # Waiting (queued) requests per pod — an alternative scale signal.
            seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}"
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }
            name = {
              matches = "^vllm:num_requests_waiting$"
              as      = "vllm:num_requests_waiting"
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },
        ]
      }
    }),
  ]

  depends_on = [helm_release.kps]
}