From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 26 Apr 2026 21:02:47 +0800 Subject: hehe --- terraform/modules/agent/main.tf | 114 ++++++++++++++++++++ terraform/modules/agent/outputs.tf | 11 ++ terraform/modules/agent/variables.tf | 33 ++++++ terraform/modules/agent/versions.tf | 5 + terraform/modules/llm/main.tf | 99 +++++++++++++++++ terraform/modules/llm/outputs.tf | 12 +++ terraform/modules/llm/variables.tf | 112 +++++++++++++++++++ terraform/modules/observability/main.tf | 156 +++++++++++++++++++++++++++ terraform/modules/observability/outputs.tf | 11 ++ terraform/modules/observability/variables.tf | 27 +++++ 10 files changed, 580 insertions(+) create mode 100644 terraform/modules/agent/main.tf create mode 100644 terraform/modules/agent/outputs.tf create mode 100644 terraform/modules/agent/variables.tf create mode 100644 terraform/modules/agent/versions.tf create mode 100644 terraform/modules/llm/main.tf create mode 100644 terraform/modules/llm/outputs.tf create mode 100644 terraform/modules/llm/variables.tf create mode 100644 terraform/modules/observability/main.tf create mode 100644 terraform/modules/observability/outputs.tf create mode 100644 terraform/modules/observability/variables.tf (limited to 'terraform/modules') diff --git a/terraform/modules/agent/main.tf b/terraform/modules/agent/main.tf new file mode 100644 index 0000000..f53acdc --- /dev/null +++ b/terraform/modules/agent/main.tf @@ -0,0 +1,114 @@ +resource "kubernetes_namespace_v1" "agent" { + metadata { + name = var.namespace + labels = { + "app.kubernetes.io/part-of" = "llm-platform" + } + } +} + +resource "kubernetes_deployment_v1" "agent" { + metadata { + name = "agent" + namespace = kubernetes_namespace_v1.agent.metadata[0].name + labels = { app = "agent" } + } + spec { + replicas = 1 + selector { + match_labels = { app = "agent" } + } + template { + metadata { + labels = { app = "agent" } + annotations = { + # Bounce the pod when agent.py 
changes on disk, even if image tag is unchanged. + "checksum/code" = substr(sha256(file(var.agent_source_path)), 0, 16) + } + } + spec { + container { + name = "agent" + image = var.agent_image + image_pull_policy = "IfNotPresent" + env { + name = "OPENAI_BASE_URL" + value = var.llm_service_url + } + env { + name = "MODEL" + value = var.model_alias + } + port { + name = "http" + container_port = 8001 + } + readiness_probe { + http_get { + path = "/health" + port = "http" + } + initial_delay_seconds = 3 + period_seconds = 5 + failure_threshold = 10 + } + liveness_probe { + http_get { + path = "/health" + port = "http" + } + initial_delay_seconds = 30 + period_seconds = 30 + } + resources { + requests = { cpu = "100m", memory = "128Mi" } + limits = { cpu = "1", memory = "512Mi" } + } + } + } + } + } +} + +resource "kubernetes_service_v1" "agent" { + metadata { + name = "agent" + namespace = kubernetes_namespace_v1.agent.metadata[0].name + labels = { app = "agent" } + } + spec { + selector = { app = "agent" } + port { + name = "http" + port = 8001 + target_port = "http" + } + } +} + +resource "kubernetes_ingress_v1" "agent" { + metadata { + name = "agent" + namespace = kubernetes_namespace_v1.agent.metadata[0].name + } + spec { + ingress_class_name = var.ingress_class + rule { + host = var.ingress_host + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = kubernetes_service_v1.agent.metadata[0].name + port { + number = 8001 + } + } + } + } + } + } + } +} diff --git a/terraform/modules/agent/outputs.tf b/terraform/modules/agent/outputs.tf new file mode 100644 index 0000000..ac9932b --- /dev/null +++ b/terraform/modules/agent/outputs.tf @@ -0,0 +1,11 @@ +output "service_dns" { + value = "${kubernetes_service_v1.agent.metadata[0].name}.${kubernetes_namespace_v1.agent.metadata[0].name}.svc.cluster.local" +} + +output "ingress_host" { + value = var.ingress_host +} + +output "namespace" { + value = 
kubernetes_namespace_v1.agent.metadata[0].name +} diff --git a/terraform/modules/agent/variables.tf b/terraform/modules/agent/variables.tf new file mode 100644 index 0000000..6f525ee --- /dev/null +++ b/terraform/modules/agent/variables.tf @@ -0,0 +1,33 @@ +variable "namespace" { + type = string +} + +variable "agent_source_path" { + type = string + description = "Absolute path to agent/agent.py. Used only to bounce pods on code change." +} + +variable "agent_image" { + type = string + default = "localhost/agent:0.1.0" + description = "Pre-built agent image. Must be loaded into kind with `make agent-build`." +} + +variable "llm_service_url" { + type = string + description = "OpenAI-compatible base URL, e.g. http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1" +} + +variable "model_alias" { + type = string + default = "Qwen2.5-1.5B-Instruct" +} + +variable "ingress_host" { + type = string +} + +variable "ingress_class" { + type = string + default = "nginx" +} diff --git a/terraform/modules/agent/versions.tf b/terraform/modules/agent/versions.tf new file mode 100644 index 0000000..4242705 --- /dev/null +++ b/terraform/modules/agent/versions.tf @@ -0,0 +1,5 @@ +terraform { + required_providers { + kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" } + } +} diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf new file mode 100644 index 0000000..cd22019 --- /dev/null +++ b/terraform/modules/llm/main.tf @@ -0,0 +1,99 @@ +resource "kubernetes_namespace_v1" "this" { + metadata { + name = var.namespace + labels = { + "app.kubernetes.io/part-of" = "llm-platform" + } + } +} + +resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" { + count = var.hpa.enabled ? 
1 : 0 + + metadata { + name = "${var.release_name}-llm-app" + namespace = kubernetes_namespace_v1.this.metadata[0].name + } + spec { + scale_target_ref { + api_version = "apps/v1" + kind = "Deployment" + name = "${var.release_name}-llm-app" + } + min_replicas = var.hpa.min_replicas + max_replicas = var.hpa.max_replicas + + metric { + type = "Pods" + pods { + metric { + name = var.hpa.metric_name + } + target { + type = "AverageValue" + average_value = var.hpa.target_average_value + } + } + } + } + + depends_on = [helm_release.llm] +} + +resource "helm_release" "llm" { + name = var.release_name + chart = var.chart_path + namespace = kubernetes_namespace_v1.this.metadata[0].name + create_namespace = false + atomic = false + wait = true + timeout = 1800 + + values = [ + yamlencode({ + replicaCount = var.replicas + + image = { + repository = var.image_repository + tag = var.image_tag + digest = var.image_digest + pullPolicy = "IfNotPresent" + } + + model = { + name = var.model_name + alias = var.model_alias + maxModelLen = var.max_model_len + dtype = var.dtype + } + + server = { + port = 8000 + ompThreads = var.omp_threads + extraArgs = var.extra_args + } + + resources = var.resources + + ingress = { + enabled = true + className = var.ingress_class + host = var.ingress_host + } + + monitoring = { + serviceMonitor = { + enabled = true + interval = "15s" + labels = { + release = var.service_monitor_release_label + } + } + } + + modelCache = { + sizeLimit = var.model_cache_size + } + }), + ] +} diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf new file mode 100644 index 0000000..a953e73 --- /dev/null +++ b/terraform/modules/llm/outputs.tf @@ -0,0 +1,12 @@ +output "service_dns" { + value = "${var.release_name}-llm-app.${var.namespace}.svc.cluster.local" + description = "In-cluster DNS name for the LLM Service." 
+} + +output "ingress_host" { + value = var.ingress_host +} + +output "namespace" { + value = kubernetes_namespace_v1.this.metadata[0].name +} diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf new file mode 100644 index 0000000..3a7d8f7 --- /dev/null +++ b/terraform/modules/llm/variables.tf @@ -0,0 +1,112 @@ +variable "release_name" { + type = string + description = "Helm release name." +} + +variable "namespace" { + type = string + description = "Kubernetes namespace to deploy into." +} + +variable "chart_path" { + type = string + description = "Path to the local llm-app chart." +} + +variable "replicas" { + type = number + default = 1 +} + +variable "model_name" { + type = string + description = "HuggingFace repo id, passed as vLLM model_tag (positional)." +} + +variable "model_alias" { + type = string + description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)." +} + +variable "max_model_len" { + type = number + default = 2048 +} + +variable "dtype" { + type = string + default = "bfloat16" +} + +variable "omp_threads" { + type = number + default = 0 + description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect." +} + +variable "extra_args" { + type = list(string) + default = [] + description = "Extra CLI args passed to `vllm serve`, appended after the stock set." +} + +variable "resources" { + type = object({ + requests = object({ cpu = string, memory = string }) + limits = object({ cpu = string, memory = string }) + }) +} + +variable "ingress_host" { + type = string +} + +variable "ingress_class" { + type = string + default = "nginx" +} + +variable "image_repository" { + type = string + default = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo" +} + +variable "image_tag" { + type = string + default = "latest" + description = "Used only when image_digest is empty." +} + +variable "image_digest" { + type = string + default = "" + description = "Optional sha256:abc... 
content-addressable digest. Takes precedence over image_tag." +} + +variable "service_monitor_release_label" { + type = string + default = "kube-prometheus-stack" + description = "Must match the release label the Prometheus Operator selects on." +} + +variable "model_cache_size" { + type = string + default = "10Gi" +} + +variable "hpa" { + type = object({ + enabled = bool + min_replicas = number + max_replicas = number + metric_name = string + target_average_value = string + }) + default = { + enabled = false + min_replicas = 1 + max_replicas = 3 + metric_name = "vllm:num_requests_running" + target_average_value = "500m" + } +} diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf new file mode 100644 index 0000000..2f88f2e --- /dev/null +++ b/terraform/modules/observability/main.tf @@ -0,0 +1,156 @@ +resource "kubernetes_namespace_v1" "monitoring" { + metadata { + name = var.namespace + } +} + +resource "kubernetes_namespace_v1" "ingress" { + metadata { + name = "ingress-nginx" + } +} + +resource "helm_release" "ingress_nginx" { + name = "ingress-nginx" + repository = "https://kubernetes.github.io/ingress-nginx" + chart = "ingress-nginx" + version = var.ingress_nginx_version + namespace = kubernetes_namespace_v1.ingress.metadata[0].name + wait = true + timeout = 300 + + values = [ + yamlencode({ + controller = { + hostPort = { enabled = true, ports = { http = 80, https = 443 } } + service = { type = "NodePort" } + nodeSelector = { + "ingress-ready" = "true" + } + tolerations = [ + { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" }, + { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" }, + ] + publishService = { enabled = false } + admissionWebhooks = { enabled = false } # speeds up kind cluster installs + # Cap worker_processes so nginx doesn't try to spawn 14 threads under + # CPU pressure from vLLM cold-starts. 
With auto (= one per CPU) it + # sometimes hits pthread EAGAIN and workers die without respawn. + config = { + "worker-processes" = "4" + } + } + }), + ] +} + +resource "helm_release" "kps" { + name = "kube-prometheus-stack" + repository = "https://prometheus-community.github.io/helm-charts" + chart = "kube-prometheus-stack" + version = var.kps_version + namespace = kubernetes_namespace_v1.monitoring.metadata[0].name + wait = true + timeout = 600 + + values = [ + yamlencode({ + fullnameOverride = "kps" + prometheus = { + prometheusSpec = { + # Let Prometheus pick up ServiceMonitors from any namespace matching + # the release=kube-prometheus-stack label (the chart's default). + serviceMonitorSelectorNilUsesHelmValues = false + podMonitorSelectorNilUsesHelmValues = false + ruleSelectorNilUsesHelmValues = false + retention = "2d" + resources = { + requests = { cpu = "100m", memory = "400Mi" } + limits = { memory = "1Gi" } + } + } + ingress = { + enabled = true + ingressClassName = "nginx" + hosts = ["prom.localtest.me"] + } + } + alertmanager = { enabled = false } + grafana = { + adminPassword = var.grafana_admin_password + sidecar = { + dashboards = { + enabled = true + label = "grafana_dashboard" + labelValue = "1" + searchNamespace = "ALL" + } + } + service = { type = "ClusterIP" } + ingress = { + enabled = true + ingressClassName = "nginx" + hosts = ["grafana.localtest.me"] + } + } + }), + ] +} + +resource "helm_release" "prometheus_adapter" { + name = "prometheus-adapter" + repository = "https://prometheus-community.github.io/helm-charts" + chart = "prometheus-adapter" + version = var.prometheus_adapter_version + namespace = kubernetes_namespace_v1.monitoring.metadata[0].name + wait = true + timeout = 300 + + values = [ + yamlencode({ + prometheus = { + url = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc" + port = 9090 + } + rules = { + default = false + custom = [ + { + # In-flight request count per pod; basis for autoscaling. 
+          # vLLM exposes this as a gauge per model-engine.
+          seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
+          resources = {
+            overrides = {
+              namespace = { resource = "namespace" }
+              pod       = { resource = "pod" }
+            }
+          }
+          name = {
+            matches = "^vllm:num_requests_running$"
+            as      = "vllm:num_requests_running"
+          }
+          metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
+        },
+        {
+          # Waiting (queued) requests per pod — an alternative scale signal.
+          seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}"
+          resources = {
+            overrides = {
+              namespace = { resource = "namespace" }
+              pod       = { resource = "pod" }
+            }
+          }
+          name = {
+            matches = "^vllm:num_requests_waiting$"
+            as      = "vllm:num_requests_waiting"
+          }
+          metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
+        },
+      ]
+    }
+    }),
+  ]
+
+  depends_on = [helm_release.kps]
+}
+
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf
new file mode 100644
index 0000000..06a507d
--- /dev/null
+++ b/terraform/modules/observability/outputs.tf
@@ -0,0 +1,11 @@
+output "namespace" {
+  value = kubernetes_namespace_v1.monitoring.metadata[0].name
+}
+
+output "grafana_service" {
+  value = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local" # subchart fullname is "<release>-grafana"; the parent chart's fullnameOverride ("kps") does not propagate to the grafana subchart — TODO confirm against deployed Service name
+}
+
+output "prometheus_service" {
+  value = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local" # "kps" comes from fullnameOverride in helm_release.kps
+}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf
new file mode 100644
index 0000000..6aeaca3
--- /dev/null
+++ b/terraform/modules/observability/variables.tf
@@ -0,0 +1,27 @@
+variable "namespace" {
+  type    = string
+  default = "monitoring"
+}
+
+variable "kps_version" {
+  type        = string
+  default     = "65.5.1"
+  description = "kube-prometheus-stack chart version."
+} + +variable "ingress_nginx_version" { + type = string + default = "4.11.3" + description = "ingress-nginx chart version." +} + +variable "grafana_admin_password" { + type = string + default = "admin" + sensitive = true +} + +variable "prometheus_adapter_version" { + type = string + default = "4.11.0" +} -- cgit