diff options
Diffstat (limited to 'terraform/modules')
| -rw-r--r-- | terraform/modules/agent/main.tf | 114 | ||||
| -rw-r--r-- | terraform/modules/agent/outputs.tf | 11 | ||||
| -rw-r--r-- | terraform/modules/agent/variables.tf | 33 | ||||
| -rw-r--r-- | terraform/modules/agent/versions.tf | 5 | ||||
| -rw-r--r-- | terraform/modules/llm/main.tf | 99 | ||||
| -rw-r--r-- | terraform/modules/llm/outputs.tf | 12 | ||||
| -rw-r--r-- | terraform/modules/llm/variables.tf | 112 | ||||
| -rw-r--r-- | terraform/modules/observability/main.tf | 156 | ||||
| -rw-r--r-- | terraform/modules/observability/outputs.tf | 11 | ||||
| -rw-r--r-- | terraform/modules/observability/variables.tf | 27 |
10 files changed, 580 insertions, 0 deletions
diff --git a/terraform/modules/agent/main.tf b/terraform/modules/agent/main.tf new file mode 100644 index 0000000..f53acdc --- /dev/null +++ b/terraform/modules/agent/main.tf | |||
| @@ -0,0 +1,114 @@ | |||
# Namespace that holds the agent workload. The deployment, service and ingress
# in this file all take their namespace from this resource's name.
resource "kubernetes_namespace_v1" "agent" {
  metadata {
    labels = {
      "app.kubernetes.io/part-of" = "llm-platform"
    }

    name = var.namespace
  }
}
| 9 | |||
# Deployment for the agent: a single replica of var.agent_image that is pointed
# at the LLM endpoint through the OPENAI_BASE_URL and MODEL environment
# variables and serves HTTP on container port 8001.
resource "kubernetes_deployment_v1" "agent" {
  metadata {
    name      = "agent"
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
    labels    = { app = "agent" }
  }

  spec {
    replicas = 1

    selector {
      match_labels = { app = "agent" }
    }

    template {
      metadata {
        labels = { app = "agent" }

        annotations = {
          # Bounce the pod when agent.py changes on disk, even if image tag is unchanged.
          # filesha256() hashes the file directly, so the source does not have to
          # be loaded as a UTF-8 string first (which sha256(file(...)) requires).
          "checksum/code" = substr(filesha256(var.agent_source_path), 0, 16)
        }
      }

      spec {
        container {
          name              = "agent"
          image             = var.agent_image
          image_pull_policy = "IfNotPresent"

          env {
            name  = "OPENAI_BASE_URL"
            value = var.llm_service_url
          }

          env {
            name  = "MODEL"
            value = var.model_alias
          }

          port {
            name           = "http"
            container_port = 8001
          }

          # Readiness: first check after 3s, then every 5s; up to 10 consecutive
          # failures are tolerated before the pod is marked unready.
          readiness_probe {
            http_get {
              path = "/health"
              port = "http"
            }
            initial_delay_seconds = 3
            period_seconds        = 5
            failure_threshold     = 10
          }

          # Liveness: same endpoint, checked every 30s after a 30s grace period.
          liveness_probe {
            http_get {
              path = "/health"
              port = "http"
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }

          resources {
            requests = { cpu = "100m", memory = "128Mi" }
            limits   = { cpu = "1", memory = "512Mi" }
          }
        }
      }
    }
  }
}
| 72 | |||
# Service in front of the agent pods: selects pods labelled app=agent and
# forwards port 8001 to the container port named "http".
resource "kubernetes_service_v1" "agent" {
  metadata {
    labels    = { app = "agent" }
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
    name      = "agent"
  }

  spec {
    selector = {
      app = "agent"
    }

    port {
      target_port = "http"
      port        = 8001
      name        = "http"
    }
  }
}
| 88 | |||
# Ingress for the agent: every path under "/" on var.ingress_host is routed to
# the agent Service on port 8001, using the ingress class in var.ingress_class.
resource "kubernetes_ingress_v1" "agent" {
  metadata {
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
    name      = "agent"
  }

  spec {
    ingress_class_name = var.ingress_class

    rule {
      host = var.ingress_host

      http {
        path {
          path_type = "Prefix"
          path      = "/"

          backend {
            service {
              # Taken from the Service resource so a rename there is followed here.
              name = kubernetes_service_v1.agent.metadata[0].name

              port {
                number = 8001
              }
            }
          }
        }
      }
    }
  }
}
diff --git a/terraform/modules/agent/outputs.tf b/terraform/modules/agent/outputs.tf new file mode 100644 index 0000000..ac9932b --- /dev/null +++ b/terraform/modules/agent/outputs.tf | |||
| @@ -0,0 +1,11 @@ | |||
# In-cluster DNS name of the agent Service: <service>.<namespace>.svc.cluster.local
output "service_dns" {
  value = format(
    "%s.%s.svc.cluster.local",
    kubernetes_service_v1.agent.metadata[0].name,
    kubernetes_namespace_v1.agent.metadata[0].name,
  )
}

# Hostname the agent Ingress answers on (passed through from the input variable).
output "ingress_host" {
  value = var.ingress_host
}

# Name of the namespace created by this module.
output "namespace" {
  value = kubernetes_namespace_v1.agent.metadata[0].name
}
diff --git a/terraform/modules/agent/variables.tf b/terraform/modules/agent/variables.tf new file mode 100644 index 0000000..6f525ee --- /dev/null +++ b/terraform/modules/agent/variables.tf | |||
| @@ -0,0 +1,33 @@ | |||
# Name of the namespace this module creates for the agent.
variable "namespace" {
  type = string
}

# Only the file's hash is used (as a pod annotation); the file is not mounted.
variable "agent_source_path" {
  description = "Absolute path to agent/agent.py. Used only to bounce pods on code change."
  type        = string
}

variable "agent_image" {
  description = "Pre-built agent image. Must be loaded into kind with `make agent-build`."
  type        = string
  default     = "localhost/agent:0.1.0"
}

# Exposed to the container as OPENAI_BASE_URL.
variable "llm_service_url" {
  description = "OpenAI-compatible base URL, e.g. http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
  type        = string
}

# Exposed to the container as MODEL.
variable "model_alias" {
  type    = string
  default = "Qwen2.5-1.5B-Instruct"
}

# Host of the single rule on the agent Ingress.
variable "ingress_host" {
  type = string
}

# Ingress class name set on the agent Ingress.
variable "ingress_class" {
  type    = string
  default = "nginx"
}
diff --git a/terraform/modules/agent/versions.tf b/terraform/modules/agent/versions.tf new file mode 100644 index 0000000..4242705 --- /dev/null +++ b/terraform/modules/agent/versions.tf | |||
| @@ -0,0 +1,5 @@ | |||
# Provider requirements for the agent module: only the Kubernetes provider,
# constrained to 2.x releases at or above 2.31.
terraform {
  required_providers {
    kubernetes = {
      version = "~> 2.31"
      source  = "hashicorp/kubernetes"
    }
  }
}
diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf new file mode 100644 index 0000000..cd22019 --- /dev/null +++ b/terraform/modules/llm/main.tf | |||
| @@ -0,0 +1,99 @@ | |||
# Namespace for the LLM release. The Helm release and the optional HPA in this
# file are both placed in it by referencing this resource's name.
resource "kubernetes_namespace_v1" "this" {
  metadata {
    labels = {
      "app.kubernetes.io/part-of" = "llm-platform"
    }

    name = var.namespace
  }
}
| 9 | |||
# Optional HorizontalPodAutoscaler for the LLM Deployment, created only when
# var.hpa.enabled is true. It scales on a per-pod custom metric
# (var.hpa.metric_name) against an average-value target.
resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
  count = var.hpa.enabled ? 1 : 0

  # The target Deployment is not declared in this file; presumably it is created
  # by the Helm release, hence the explicit ordering. TODO confirm against the chart.
  depends_on = [helm_release.llm]

  metadata {
    namespace = kubernetes_namespace_v1.this.metadata[0].name
    name      = "${var.release_name}-llm-app"
  }

  spec {
    max_replicas = var.hpa.max_replicas
    min_replicas = var.hpa.min_replicas

    scale_target_ref {
      kind        = "Deployment"
      api_version = "apps/v1"
      name        = "${var.release_name}-llm-app"
    }

    metric {
      type = "Pods"

      pods {
        metric {
          name = var.hpa.metric_name
        }

        target {
          average_value = var.hpa.target_average_value
          type          = "AverageValue"
        }
      }
    }
  }
}
| 42 | |||
# Installs the local llm-app chart into the namespace created by this module.
# All chart configuration is passed as a single YAML document built from the
# module's input variables.
resource "helm_release" "llm" {
  name      = var.release_name
  namespace = kubernetes_namespace_v1.this.metadata[0].name
  chart     = var.chart_path

  # The namespace is managed by kubernetes_namespace_v1.this, not by Helm.
  create_namespace = false

  # Block until the release's resources are ready, for up to 1800 seconds.
  wait    = true
  timeout = 1800
  atomic  = false

  values = [
    yamlencode({
      replicaCount = var.replicas
      resources    = var.resources

      image = {
        pullPolicy = "IfNotPresent"
        repository = var.image_repository
        tag        = var.image_tag
        digest     = var.image_digest
      }

      model = {
        alias       = var.model_alias
        name        = var.model_name
        dtype       = var.dtype
        maxModelLen = var.max_model_len
      }

      server = {
        port       = 8000
        extraArgs  = var.extra_args
        ompThreads = var.omp_threads
      }

      ingress = {
        enabled   = true
        host      = var.ingress_host
        className = var.ingress_class
      }

      modelCache = {
        sizeLimit = var.model_cache_size
      }

      monitoring = {
        serviceMonitor = {
          enabled  = true
          interval = "15s"
          labels = {
            release = var.service_monitor_release_label
          }
        }
      }
    }),
  ]
}
diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf new file mode 100644 index 0000000..a953e73 --- /dev/null +++ b/terraform/modules/llm/outputs.tf | |||
| @@ -0,0 +1,12 @@ | |||
output "service_dns" {
  description = "In-cluster DNS name for the LLM Service."

  # The namespace comes from the namespace resource rather than var.namespace,
  # matching the "namespace" output in this file. The string is the same, but
  # consumers of this output now implicitly depend on the namespace existing.
  value = "${var.release_name}-llm-app.${kubernetes_namespace_v1.this.metadata[0].name}.svc.cluster.local"
}

output "ingress_host" {
  description = "Ingress hostname passed to the chart (echo of var.ingress_host)."
  value       = var.ingress_host
}

output "namespace" {
  description = "Name of the namespace created for the LLM release."
  value       = kubernetes_namespace_v1.this.metadata[0].name
}
diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf new file mode 100644 index 0000000..3a7d8f7 --- /dev/null +++ b/terraform/modules/llm/variables.tf | |||
| @@ -0,0 +1,112 @@ | |||
variable "release_name" {
  type        = string
  description = "Helm release name."
}

variable "namespace" {
  type        = string
  description = "Kubernetes namespace to deploy into."
}

variable "chart_path" {
  type        = string
  description = "Path to the local llm-app chart."
}

variable "replicas" {
  type        = number
  default     = 1
  description = "Passed to the chart as replicaCount."
}

variable "model_name" {
  type        = string
  description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
}

variable "model_alias" {
  type        = string
  description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
}

variable "max_model_len" {
  type        = number
  default     = 2048
  description = "Passed to the chart as model.maxModelLen."
}

variable "dtype" {
  type        = string
  default     = "bfloat16"
  description = "Passed to the chart as model.dtype."
}

variable "omp_threads" {
  type        = number
  default     = 0
  description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."
}

variable "extra_args" {
  type        = list(string)
  default     = []
  description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
}

variable "resources" {
  type = object({
    requests = object({ cpu = string, memory = string })
    limits   = object({ cpu = string, memory = string })
  })
  description = "CPU/memory requests and limits, passed to the chart as resources."
}

variable "ingress_host" {
  type        = string
  description = "Passed to the chart as ingress.host."
}

variable "ingress_class" {
  type        = string
  default     = "nginx"
  description = "Passed to the chart as ingress.className."
}

variable "image_repository" {
  type        = string
  default     = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
  description = "Passed to the chart as image.repository."
}

variable "image_tag" {
  type        = string
  default     = "latest"
  description = "Used only when image_digest is empty."
}

variable "image_digest" {
  type        = string
  default     = ""
  description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."

  # Reject malformed digests at plan time instead of letting the image pull fail
  # after the release has been applied. null and "" both mean "no digest".
  validation {
    condition = (
      var.image_digest == null
      ? true
      : (var.image_digest == "" || can(regex("^sha256:[0-9a-f]{64}$", var.image_digest)))
    )
    error_message = "The image_digest value must be empty or have the form sha256:<64 lowercase hex characters>."
  }
}

variable "service_monitor_release_label" {
  type        = string
  default     = "kube-prometheus-stack"
  description = "Must match the release label the Prometheus Operator selects on."
}

variable "model_cache_size" {
  type        = string
  default     = "10Gi"
  description = "Passed to the chart as modelCache.sizeLimit."
}

variable "hpa" {
  type = object({
    enabled              = bool
    min_replicas         = number
    max_replicas         = number
    metric_name          = string
    target_average_value = string
  })
  default = {
    enabled              = false
    min_replicas         = 1
    max_replicas         = 3
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }
  description = "Settings for the optional HorizontalPodAutoscaler; no HPA is created unless enabled is true."

  # Only checked when the HPA is actually going to be created, so callers that
  # leave it disabled are never rejected for the replica bounds they pass.
  validation {
    condition = (
      !var.hpa.enabled
      || (var.hpa.min_replicas >= 1 && var.hpa.max_replicas >= var.hpa.min_replicas)
    )
    error_message = "When hpa.enabled is true, min_replicas must be at least 1 and max_replicas must not be less than min_replicas."
  }
}
diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf new file mode 100644 index 0000000..2f88f2e --- /dev/null +++ b/terraform/modules/observability/main.tf | |||
| @@ -0,0 +1,156 @@ | |||
# Namespace that the kube-prometheus-stack and prometheus-adapter releases in
# this file are installed into.
resource "kubernetes_namespace_v1" "monitoring" {
  metadata {
    name = var.namespace
  }
}
| 6 | |||
# Fixed-name namespace for the ingress-nginx release in this file.
resource "kubernetes_namespace_v1" "ingress" {
  metadata {
    name = "ingress-nginx"
  }
}
| 12 | |||
# ingress-nginx controller, configured for a kind cluster: it binds host ports
# 80/443 and is pinned to nodes labelled ingress-ready=true.
resource "helm_release" "ingress_nginx" {
  name       = "ingress-nginx"
  namespace  = kubernetes_namespace_v1.ingress.metadata[0].name
  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = var.ingress_nginx_version

  # Wait up to 300 seconds for the controller to become ready.
  wait    = true
  timeout = 300

  values = [
    yamlencode({
      controller = {
        service        = { type = "NodePort" }
        publishService = { enabled = false }

        hostPort = {
          enabled = true
          ports = {
            http  = 80
            https = 443
          }
        }

        nodeSelector = {
          "ingress-ready" = "true"
        }

        # Allow scheduling onto control-plane nodes (both the current and the
        # legacy "master" taint key are tolerated).
        tolerations = [
          {
            key      = "node-role.kubernetes.io/control-plane"
            operator = "Equal"
            effect   = "NoSchedule"
          },
          {
            key      = "node-role.kubernetes.io/master"
            operator = "Equal"
            effect   = "NoSchedule"
          },
        ]

        # Disabled because it speeds up kind cluster installs.
        admissionWebhooks = { enabled = false }

        # Cap worker_processes so nginx doesn't try to spawn 14 threads under
        # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it
        # sometimes hits pthread EAGAIN and workers die without respawn.
        config = {
          "worker-processes" = "4"
        }
      }
    }),
  ]
}
| 46 | |||
# kube-prometheus-stack: Prometheus and Grafana, each exposed through an nginx
# Ingress, with Alertmanager switched off.
resource "helm_release" "kps" {
  name       = "kube-prometheus-stack"
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = var.kps_version

  # Wait up to 600 seconds for the stack to become ready.
  wait    = true
  timeout = 600

  values = [
    yamlencode({
      # Shortens the chart's own resource names; the prometheus-adapter release
      # and the prometheus_service output rely on the resulting "kps-prometheus".
      fullnameOverride = "kps"

      alertmanager = { enabled = false }

      prometheus = {
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["prom.localtest.me"]
        }

        prometheusSpec = {
          retention = "2d"

          # With these set to false the chart no longer injects its default
          # release=<release name> label selector, so Prometheus picks up
          # ServiceMonitors, PodMonitors and rules regardless of their labels.
          serviceMonitorSelectorNilUsesHelmValues = false
          podMonitorSelectorNilUsesHelmValues     = false
          ruleSelectorNilUsesHelmValues           = false

          resources = {
            limits   = { memory = "1Gi" }
            requests = { cpu = "100m", memory = "400Mi" }
          }
        }
      }

      grafana = {
        adminPassword = var.grafana_admin_password
        service       = { type = "ClusterIP" }

        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["grafana.localtest.me"]
        }

        # Dashboard sidecar: loads dashboards labelled grafana_dashboard=1,
        # searching all namespaces.
        sidecar = {
          dashboards = {
            enabled         = true
            searchNamespace = "ALL"
            label           = "grafana_dashboard"
            labelValue      = "1"
          }
        }
      }
    }),
  ]
}
| 99 | |||
# prometheus-adapter: exposes two vLLM gauges through the custom metrics API.
# The adapter's default rules are disabled; only the rules built below apply.
resource "helm_release" "prometheus_adapter" {
  # Installed after kube-prometheus-stack, which this release queries.
  depends_on = [helm_release.kps]

  name       = "prometheus-adapter"
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus-adapter"
  version    = var.prometheus_adapter_version

  wait    = true
  timeout = 300

  values = [
    yamlencode({
      prometheus = {
        port = 9090
        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
      }

      rules = {
        default = false

        # One rule per metric, identical apart from the metric name:
        #   - vllm:num_requests_running: in-flight request count per pod; basis
        #     for autoscaling. vLLM exposes this as a gauge per model-engine.
        #   - vllm:num_requests_waiting: waiting (queued) requests per pod, an
        #     alternative scale signal.
        custom = [
          for vllm_metric in ["vllm:num_requests_running", "vllm:num_requests_waiting"] : {
            seriesQuery  = "${vllm_metric}{namespace!=\"\",pod!=\"\"}"
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"

            name = {
              as      = vllm_metric
              matches = "^${vllm_metric}$"
            }

            resources = {
              overrides = {
                pod       = { resource = "pod" }
                namespace = { resource = "namespace" }
              }
            }
          }
        ]
      }
    }),
  ]
}
| 156 | |||
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf new file mode 100644 index 0000000..06a507d --- /dev/null +++ b/terraform/modules/observability/outputs.tf | |||
| @@ -0,0 +1,11 @@ | |||
# Name of the monitoring namespace created by this module.
output "namespace" {
  value = kubernetes_namespace_v1.monitoring.metadata[0].name
}

# In-cluster DNS name used for Grafana; the service name is hard-coded here.
output "grafana_service" {
  value = format(
    "kube-prometheus-stack-grafana.%s.svc.cluster.local",
    kubernetes_namespace_v1.monitoring.metadata[0].name,
  )
}

# In-cluster DNS name used for Prometheus; the service name is hard-coded here.
output "prometheus_service" {
  value = format(
    "kps-prometheus.%s.svc.cluster.local",
    kubernetes_namespace_v1.monitoring.metadata[0].name,
  )
}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf new file mode 100644 index 0000000..6aeaca3 --- /dev/null +++ b/terraform/modules/observability/variables.tf | |||
| @@ -0,0 +1,27 @@ | |||
# Namespace created for the monitoring releases.
variable "namespace" {
  default = "monitoring"
  type    = string
}

variable "kps_version" {
  description = "kube-prometheus-stack chart version."
  default     = "65.5.1"
  type        = string
}

variable "ingress_nginx_version" {
  description = "ingress-nginx chart version."
  default     = "4.11.3"
  type        = string
}

# Passed to the chart as grafana.adminPassword.
# NOTE(review): the default is the well-known value "admin"; presumably only
# acceptable for local clusters, so confirm it is overridden elsewhere.
variable "grafana_admin_password" {
  sensitive = true
  default   = "admin"
  type      = string
}

# Chart version for the prometheus-adapter release.
variable "prometheus_adapter_version" {
  default = "4.11.0"
  type    = string
}
