summaryrefslogtreecommitdiff
path: root/terraform/modules
diff options
context:
space:
mode:
Diffstat (limited to 'terraform/modules')
-rw-r--r--terraform/modules/agent/main.tf114
-rw-r--r--terraform/modules/agent/outputs.tf11
-rw-r--r--terraform/modules/agent/variables.tf33
-rw-r--r--terraform/modules/agent/versions.tf5
-rw-r--r--terraform/modules/llm/main.tf99
-rw-r--r--terraform/modules/llm/outputs.tf12
-rw-r--r--terraform/modules/llm/variables.tf112
-rw-r--r--terraform/modules/observability/main.tf156
-rw-r--r--terraform/modules/observability/outputs.tf11
-rw-r--r--terraform/modules/observability/variables.tf27
10 files changed, 580 insertions, 0 deletions
diff --git a/terraform/modules/agent/main.tf b/terraform/modules/agent/main.tf
new file mode 100644
index 0000000..f53acdc
--- /dev/null
+++ b/terraform/modules/agent/main.tf
@@ -0,0 +1,114 @@
# Dedicated namespace for the agent workload. The part-of label lets
# cluster-wide queries select every namespace belonging to the platform.
resource "kubernetes_namespace_v1" "agent" {
  metadata {
    name = var.namespace
    labels = {
      "app.kubernetes.io/part-of" = "llm-platform"
    }
  }
}
9
# Single-replica Deployment for the agent HTTP service.
# The agent talks to the LLM backend via OPENAI_BASE_URL and serves on :8001.
resource "kubernetes_deployment_v1" "agent" {
  metadata {
    name      = "agent"
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
    labels    = { app = "agent" }
  }
  spec {
    replicas = 1
    selector {
      match_labels = { app = "agent" }
    }
    template {
      metadata {
        labels = { app = "agent" }
        annotations = {
          # Bounce the pod when agent.py changes on disk, even if image tag is unchanged.
          # Truncated sha256 of the source file; any code edit changes the pod
          # template hash and triggers a rolling restart.
          "checksum/code" = substr(sha256(file(var.agent_source_path)), 0, 16)
        }
      }
      spec {
        container {
          name = "agent"
          # Pre-built local image; IfNotPresent avoids pulling from a registry
          # (image is side-loaded into the cluster, e.g. `kind load`).
          image             = var.agent_image
          image_pull_policy = "IfNotPresent"
          env {
            name  = "OPENAI_BASE_URL"
            value = var.llm_service_url
          }
          env {
            name  = "MODEL"
            value = var.model_alias
          }
          port {
            name           = "http"
            container_port = 8001
          }
          # Readiness: poll /health frequently at startup; up to 10 failures
          # (~50s) tolerated before the pod is marked unready.
          readiness_probe {
            http_get {
              path = "/health"
              port = "http"
            }
            initial_delay_seconds = 3
            period_seconds        = 5
            failure_threshold     = 10
          }
          # Liveness: slower cadence; restarts the container if /health stops
          # answering after the initial grace period.
          liveness_probe {
            http_get {
              path = "/health"
              port = "http"
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }
          resources {
            requests = { cpu = "100m", memory = "128Mi" }
            limits   = { cpu = "1", memory = "512Mi" }
          }
        }
      }
    }
  }
}
72
# ClusterIP Service fronting the agent pods. Exposes port 8001 and forwards
# to the container's named "http" port, so the container port can change
# without editing this Service.
resource "kubernetes_service_v1" "agent" {
  metadata {
    name      = "agent"
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
    labels    = { app = "agent" }
  }
  spec {
    selector = { app = "agent" }
    port {
      name        = "http"
      port        = 8001
      target_port = "http"
    }
  }
}
88
# HTTP entry point for the agent at var.ingress_host.
# The backend references the Service's *named* port ("http") instead of
# repeating the numeric port 8001; with the number duplicated here, changing
# the Service port would silently break the Ingress.
resource "kubernetes_ingress_v1" "agent" {
  metadata {
    name      = "agent"
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
  }
  spec {
    ingress_class_name = var.ingress_class
    rule {
      host = var.ingress_host
      http {
        path {
          path      = "/"
          path_type = "Prefix"
          backend {
            service {
              name = kubernetes_service_v1.agent.metadata[0].name
              port {
                # Resolves to the Service port named "http" (8001).
                name = "http"
              }
            }
          }
        }
      }
    }
  }
}
diff --git a/terraform/modules/agent/outputs.tf b/terraform/modules/agent/outputs.tf
new file mode 100644
index 0000000..ac9932b
--- /dev/null
+++ b/terraform/modules/agent/outputs.tf
@@ -0,0 +1,11 @@
output "service_dns" {
  value       = "${kubernetes_service_v1.agent.metadata[0].name}.${kubernetes_namespace_v1.agent.metadata[0].name}.svc.cluster.local"
  description = "In-cluster DNS name for the agent Service."
}

output "ingress_host" {
  value       = var.ingress_host
  description = "Hostname the agent Ingress routes."
}

output "namespace" {
  value       = kubernetes_namespace_v1.agent.metadata[0].name
  description = "Namespace the agent is deployed into."
}
diff --git a/terraform/modules/agent/variables.tf b/terraform/modules/agent/variables.tf
new file mode 100644
index 0000000..6f525ee
--- /dev/null
+++ b/terraform/modules/agent/variables.tf
@@ -0,0 +1,33 @@
variable "namespace" {
  type        = string
  description = "Kubernetes namespace to create and deploy the agent into."
}

variable "agent_source_path" {
  type        = string
  description = "Absolute path to agent/agent.py. Used only to bounce pods on code change."
}

variable "agent_image" {
  type        = string
  default     = "localhost/agent:0.1.0"
  description = "Pre-built agent image. Must be loaded into kind with `make agent-build`."
}

variable "llm_service_url" {
  type        = string
  description = "OpenAI-compatible base URL, e.g. http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
}

variable "model_alias" {
  type        = string
  default     = "Qwen2.5-1.5B-Instruct"
  description = "Model name the agent sends in the OpenAI 'model' field."
}

variable "ingress_host" {
  type        = string
  description = "Hostname for the agent Ingress rule."
}

variable "ingress_class" {
  type        = string
  default     = "nginx"
  description = "IngressClass used by the agent Ingress."
}
diff --git a/terraform/modules/agent/versions.tf b/terraform/modules/agent/versions.tf
new file mode 100644
index 0000000..4242705
--- /dev/null
+++ b/terraform/modules/agent/versions.tf
@@ -0,0 +1,5 @@
# Provider pin for the agent module; ~> 2.31 allows 2.31.x and later 2.x minors.
terraform {
  required_providers {
    kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
  }
}
diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf
new file mode 100644
index 0000000..cd22019
--- /dev/null
+++ b/terraform/modules/llm/main.tf
@@ -0,0 +1,99 @@
# Namespace the LLM Helm release is installed into. Created here (rather
# than by Helm) so Terraform owns its lifecycle and labels.
resource "kubernetes_namespace_v1" "this" {
  metadata {
    name = var.namespace
    labels = {
      "app.kubernetes.io/part-of" = "llm-platform"
    }
  }
}
9
# Optional HPA over the chart-managed Deployment, driven by a custom
# per-pod metric (served via prometheus-adapter — see the observability
# module's rules).
resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
  count = var.hpa.enabled ? 1 : 0

  metadata {
    name      = "${var.release_name}-llm-app"
    namespace = kubernetes_namespace_v1.this.metadata[0].name
  }
  spec {
    scale_target_ref {
      api_version = "apps/v1"
      kind        = "Deployment"
      # NOTE(review): assumes the chart names its Deployment
      # "<release>-llm-app" — confirm against the chart templates.
      name = "${var.release_name}-llm-app"
    }
    min_replicas = var.hpa.min_replicas
    max_replicas = var.hpa.max_replicas

    metric {
      type = "Pods"
      pods {
        metric {
          # Must match the name prometheus-adapter exposes through the
          # custom metrics API (its `name.as` field).
          name = var.hpa.metric_name
        }
        target {
          type          = "AverageValue"
          average_value = var.hpa.target_average_value
        }
      }
    }
  }

  # The target Deployment must exist before the HPA is created.
  depends_on = [helm_release.llm]
}
42
# Installs the local llm-app chart. `wait = true` with a 30-minute timeout
# covers slow cold starts (model download + load); `atomic = false` keeps a
# failed install around for debugging instead of rolling it back.
resource "helm_release" "llm" {
  name             = var.release_name
  chart            = var.chart_path
  namespace        = kubernetes_namespace_v1.this.metadata[0].name
  create_namespace = false
  atomic           = false
  wait             = true
  timeout          = 1800

  # Values keys below must match the llm-app chart's values schema.
  values = [
    yamlencode({
      replicaCount = var.replicas

      image = {
        repository = var.image_repository
        tag        = var.image_tag
        # Non-empty digest takes precedence over tag (per the variable's
        # contract; presumably enforced by the chart — verify there).
        digest     = var.image_digest
        pullPolicy = "IfNotPresent"
      }

      model = {
        name        = var.model_name
        alias       = var.model_alias
        maxModelLen = var.max_model_len
        dtype       = var.dtype
      }

      server = {
        port       = 8000
        ompThreads = var.omp_threads
        extraArgs  = var.extra_args
      }

      resources = var.resources

      ingress = {
        enabled   = true
        className = var.ingress_class
        host      = var.ingress_host
      }

      monitoring = {
        serviceMonitor = {
          enabled  = true
          interval = "15s"
          labels = {
            # Must match the release label Prometheus Operator selects on.
            release = var.service_monitor_release_label
          }
        }
      }

      modelCache = {
        sizeLimit = var.model_cache_size
      }
    }),
  ]
}
diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf
new file mode 100644
index 0000000..a953e73
--- /dev/null
+++ b/terraform/modules/llm/outputs.tf
@@ -0,0 +1,12 @@
output "service_dns" {
  # NOTE(review): relies on the chart naming its Service
  # "<release>-llm-app" — keep in sync with the chart's fullname template.
  value       = "${var.release_name}-llm-app.${var.namespace}.svc.cluster.local"
  description = "In-cluster DNS name for the LLM Service."
}

output "ingress_host" {
  value       = var.ingress_host
  description = "Hostname the LLM Ingress routes."
}

output "namespace" {
  value       = kubernetes_namespace_v1.this.metadata[0].name
  description = "Namespace the LLM release is deployed into."
}
diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf
new file mode 100644
index 0000000..3a7d8f7
--- /dev/null
+++ b/terraform/modules/llm/variables.tf
@@ -0,0 +1,112 @@
variable "release_name" {
  type        = string
  description = "Helm release name."
}

variable "namespace" {
  type        = string
  description = "Kubernetes namespace to deploy into."
}

variable "chart_path" {
  type        = string
  description = "Path to the local llm-app chart."
}

variable "replicas" {
  type        = number
  default     = 1
  description = "Number of LLM server replicas."

  validation {
    condition     = var.replicas >= 1
    error_message = "replicas must be at least 1."
  }
}

variable "model_name" {
  type        = string
  description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
}

variable "model_alias" {
  type        = string
  description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
}

variable "max_model_len" {
  type        = number
  default     = 2048
  description = "Maximum context length passed to vLLM (--max-model-len)."

  validation {
    condition     = var.max_model_len > 0
    error_message = "max_model_len must be a positive number."
  }
}

variable "dtype" {
  type        = string
  default     = "bfloat16"
  description = "Model weight dtype passed to vLLM."
}

variable "omp_threads" {
  type        = number
  default     = 0
  description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."
}

variable "extra_args" {
  type        = list(string)
  default     = []
  description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
}

variable "resources" {
  type = object({
    requests = object({ cpu = string, memory = string })
    limits   = object({ cpu = string, memory = string })
  })
  description = "Container resource requests and limits for the LLM server."
}

variable "ingress_host" {
  type        = string
  description = "Hostname for the LLM Ingress rule."
}

variable "ingress_class" {
  type        = string
  default     = "nginx"
  description = "IngressClass used by the LLM Ingress."
}

variable "image_repository" {
  type        = string
  default     = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
  description = "Container image repository for the vLLM server."
}

variable "image_tag" {
  type        = string
  default     = "latest"
  description = "Used only when image_digest is empty."
}

variable "image_digest" {
  type        = string
  default     = ""
  description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."

  validation {
    # Either empty (use the tag) or a well-formed sha256 digest.
    condition     = var.image_digest == "" || can(regex("^sha256:[0-9a-f]{64}$", var.image_digest))
    error_message = "image_digest must be empty or of the form sha256:<64 hex chars>."
  }
}

variable "service_monitor_release_label" {
  type        = string
  default     = "kube-prometheus-stack"
  description = "Must match the release label the Prometheus Operator selects on."
}

variable "model_cache_size" {
  type        = string
  default     = "10Gi"
  description = "Size limit of the model cache volume."
}

variable "hpa" {
  type = object({
    enabled              = bool
    min_replicas         = number
    max_replicas         = number
    metric_name          = string
    target_average_value = string
  })
  default = {
    enabled              = false
    min_replicas         = 1
    max_replicas         = 3
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }
  description = "Optional HPA driven by a custom per-pod metric (via prometheus-adapter)."

  validation {
    condition     = var.hpa.min_replicas >= 1 && var.hpa.min_replicas <= var.hpa.max_replicas
    error_message = "hpa.min_replicas must be >= 1 and <= hpa.max_replicas."
  }
}
diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf
new file mode 100644
index 0000000..2f88f2e
--- /dev/null
+++ b/terraform/modules/observability/main.tf
@@ -0,0 +1,156 @@
# Namespace for the monitoring stack (kube-prometheus-stack + adapter).
resource "kubernetes_namespace_v1" "monitoring" {
  metadata {
    name = var.namespace
  }
}
6
# Namespace for the ingress controller. Name is intentionally fixed to the
# upstream convention "ingress-nginx" rather than parameterized.
resource "kubernetes_namespace_v1" "ingress" {
  metadata {
    name = "ingress-nginx"
  }
}
12
# ingress-nginx configured for a kind-style local cluster: hostPort binding
# on a node labelled ingress-ready=true, NodePort service, and control-plane
# tolerations so it can land on the (tainted) single node.
resource "helm_release" "ingress_nginx" {
  name       = "ingress-nginx"
  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = var.ingress_nginx_version
  namespace  = kubernetes_namespace_v1.ingress.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      controller = {
        # Bind 80/443 directly on the node so host traffic reaches the
        # controller without a LoadBalancer.
        hostPort = { enabled = true, ports = { http = 80, https = 443 } }
        service  = { type = "NodePort" }
        nodeSelector = {
          "ingress-ready" = "true"
        }
        tolerations = [
          { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" },
          { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" },
        ]
        # No cloud LB status to publish in a local cluster.
        publishService    = { enabled = false }
        admissionWebhooks = { enabled = false } # speeds up kind cluster installs
        # Cap worker_processes so nginx doesn't try to spawn 14 threads under
        # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it
        # sometimes hits pthread EAGAIN and workers die without respawn.
        config = {
          "worker-processes" = "4"
        }
      }
    }),
  ]
}
46
# kube-prometheus-stack: Prometheus + Grafana (Alertmanager disabled).
# fullnameOverride shortens operator-managed resource names to "kps-…";
# subchart (Grafana) names still derive from the release name.
resource "helm_release" "kps" {
  name       = "kube-prometheus-stack"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = var.kps_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 600

  values = [
    yamlencode({
      fullnameOverride = "kps"
      prometheus = {
        prometheusSpec = {
          # Let Prometheus pick up ServiceMonitors from any namespace matching
          # the release=kube-prometheus-stack label (the chart's default).
          serviceMonitorSelectorNilUsesHelmValues = false
          podMonitorSelectorNilUsesHelmValues     = false
          ruleSelectorNilUsesHelmValues           = false
          # Short retention: local dev stack, keep disk use low.
          retention = "2d"
          resources = {
            requests = { cpu = "100m", memory = "400Mi" }
            limits   = { memory = "1Gi" }
          }
        }
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          # *.localtest.me resolves to 127.0.0.1, handy for local clusters.
          hosts = ["prom.localtest.me"]
        }
      }
      alertmanager = { enabled = false }
      grafana = {
        adminPassword = var.grafana_admin_password
        # Sidecar loads dashboards from ConfigMaps labelled
        # grafana_dashboard=1 in any namespace.
        sidecar = {
          dashboards = {
            enabled         = true
            label           = "grafana_dashboard"
            labelValue      = "1"
            searchNamespace = "ALL"
          }
        }
        service = { type = "ClusterIP" }
        ingress = {
          enabled          = true
          ingressClassName = "nginx"
          hosts            = ["grafana.localtest.me"]
        }
      }
    }),
  ]
}
99
# prometheus-adapter bridges Prometheus series into the Kubernetes custom
# metrics API so the llm module's HPA can consume vLLM metrics. Default
# rules are disabled; only the two vLLM series below are exposed.
resource "helm_release" "prometheus_adapter" {
  name       = "prometheus-adapter"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus-adapter"
  version    = var.prometheus_adapter_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      prometheus = {
        # "kps-prometheus" matches the fullnameOverride set in the kps release.
        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
        port = 9090
      }
      rules = {
        default = false
        custom = [
          {
            # In-flight request count per pod; basis for autoscaling.
            # vLLM exposes this as a gauge per model-engine.
            seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }
            name = {
              matches = "^vllm:num_requests_running$"
              # Exposed unchanged; must match var.hpa.metric_name in the llm module.
              as = "vllm:num_requests_running"
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },
          {
            # Waiting (queued) requests per pod — an alternative scale signal.
            seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}"
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }
            name = {
              matches = "^vllm:num_requests_waiting$"
              as      = "vllm:num_requests_waiting"
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          },
        ]
      }
    }),
  ]

  # Adapter queries Prometheus at startup; install the stack first.
  depends_on = [helm_release.kps]
}
156
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf
new file mode 100644
index 0000000..06a507d
--- /dev/null
+++ b/terraform/modules/observability/outputs.tf
@@ -0,0 +1,11 @@
output "namespace" {
  value       = kubernetes_namespace_v1.monitoring.metadata[0].name
  description = "Namespace the monitoring stack is deployed into."
}

output "grafana_service" {
  # NOTE(review): Grafana is a subchart, so its Service name derives from the
  # release name ("kube-prometheus-stack"), not the fullnameOverride "kps"
  # used for operator-managed resources — verify against the installed chart.
  value       = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
  description = "In-cluster DNS name of the Grafana Service."
}

output "prometheus_service" {
  # "kps-" prefix comes from fullnameOverride in the kps release.
  value       = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
  description = "In-cluster DNS name of the Prometheus Service."
}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf
new file mode 100644
index 0000000..6aeaca3
--- /dev/null
+++ b/terraform/modules/observability/variables.tf
@@ -0,0 +1,27 @@
variable "namespace" {
  type        = string
  default     = "monitoring"
  description = "Namespace for the monitoring stack."
}

variable "kps_version" {
  type        = string
  default     = "65.5.1"
  description = "kube-prometheus-stack chart version."
}

variable "ingress_nginx_version" {
  type        = string
  default     = "4.11.3"
  description = "ingress-nginx chart version."
}

variable "grafana_admin_password" {
  type      = string
  default   = "admin"
  sensitive = true
  # SECURITY: the default is only acceptable for throwaway local clusters;
  # override it anywhere reachable by others.
  description = "Grafana admin password. Default is for local development only."
}

variable "prometheus_adapter_version" {
  type        = string
  default     = "4.11.0"
  description = "prometheus-adapter chart version."
}