From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: Add Terraform module for the LLM Helm release, namespace and optional HPA

---
 terraform/modules/llm/main.tf      |  99 ++++++++++++++++++++++++++++++++
 terraform/modules/llm/outputs.tf   |  12 ++++
 terraform/modules/llm/variables.tf | 112 +++++++++++++++++++++++++++++++++++++
 3 files changed, 223 insertions(+)
 create mode 100644 terraform/modules/llm/main.tf
 create mode 100644 terraform/modules/llm/outputs.tf
 create mode 100644 terraform/modules/llm/variables.tf

(limited to 'terraform/modules/llm')

diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf
new file mode 100644
index 0000000..cd22019
--- /dev/null
+++ b/terraform/modules/llm/main.tf
@@ -0,0 +1,110 @@
+# Namespace shared by every object this module creates.
+resource "kubernetes_namespace_v1" "this" {
+  metadata {
+    name = var.namespace
+    labels = {
+      "app.kubernetes.io/part-of" = "llm-platform"
+    }
+  }
+}
+
+# Optional HorizontalPodAutoscaler; created only when var.hpa.enabled is true.
+# It scales the Deployment named "<release_name>-llm-app" (presumably rendered
+# by the chart below - verify against the chart templates).
+# NOTE(review): a "Pods" metric needs a custom-metrics API provider in the
+# cluster - confirm one is installed.
+resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
+  count = var.hpa.enabled ? 1 : 0
+
+  metadata {
+    name      = "${var.release_name}-llm-app"
+    namespace = kubernetes_namespace_v1.this.metadata[0].name
+  }
+  spec {
+    scale_target_ref {
+      api_version = "apps/v1"
+      kind        = "Deployment"
+      name        = "${var.release_name}-llm-app"
+    }
+    min_replicas = var.hpa.min_replicas
+    max_replicas = var.hpa.max_replicas
+
+    metric {
+      type = "Pods"
+      pods {
+        metric {
+          name = var.hpa.metric_name
+        }
+        target {
+          type          = "AverageValue"
+          average_value = var.hpa.target_average_value
+        }
+      }
+    }
+  }
+
+  # Create the autoscaler only after the Helm release has been applied.
+  depends_on = [helm_release.llm]
+}
+
+# Installs the chart at var.chart_path into the namespace created above.
+resource "helm_release" "llm" {
+  name             = var.release_name
+  chart            = var.chart_path
+  namespace        = kubernetes_namespace_v1.this.metadata[0].name
+  create_namespace = false # the namespace is managed by kubernetes_namespace_v1.this
+  atomic           = false
+  wait             = true
+  timeout          = 1800 # seconds
+
+  values = [
+    yamlencode({
+      replicaCount = var.replicas
+
+      image = {
+        repository = var.image_repository
+        tag        = var.image_tag
+        digest     = var.image_digest
+        # A bare "latest" tag is mutable, so "IfNotPresent" would let nodes keep
+        # serving a stale cached image. Always pull in that case; digest- or
+        # version-pinned images keep the cheaper cached behaviour.
+        pullPolicy = (var.image_digest == "" && var.image_tag == "latest") ? "Always" : "IfNotPresent"
+      }
+
+      model = {
+        name        = var.model_name
+        alias       = var.model_alias
+        maxModelLen = var.max_model_len
+        dtype       = var.dtype
+      }
+
+      server = {
+        port       = 8000
+        ompThreads = var.omp_threads
+        extraArgs  = var.extra_args
+      }
+
+      resources = var.resources
+
+      ingress = {
+        enabled   = true
+        className = var.ingress_class
+        host      = var.ingress_host
+      }
+
+      monitoring = {
+        serviceMonitor = {
+          enabled  = true
+          interval = "15s"
+          labels = {
+            release = var.service_monitor_release_label
+          }
+        }
+      }
+
+      modelCache = {
+        sizeLimit = var.model_cache_size
+      }
+    }),
+  ]
+}
diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf
new file mode 100644
index 0000000..a953e73
--- /dev/null
+++ b/terraform/modules/llm/outputs.tf
@@ -0,0 +1,16 @@
+output "service_dns" {
+  # Built from the namespace resource (not var.namespace) so the value tracks
+  # the namespace this module actually manages, matching the "namespace" output.
+  value       = "${var.release_name}-llm-app.${kubernetes_namespace_v1.this.metadata[0].name}.svc.cluster.local"
+  description = "In-cluster DNS name for the LLM Service."
+}
+
+output "ingress_host" {
+  value       = var.ingress_host
+  description = "Ingress host passed to the chart (echo of var.ingress_host)."
+}
+
+output "namespace" {
+  value       = kubernetes_namespace_v1.this.metadata[0].name
+  description = "Name of the namespace created by this module."
+}
diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf
new file mode 100644
index 0000000..3a7d8f7
--- /dev/null
+++ b/terraform/modules/llm/variables.tf
@@ -0,0 +1,147 @@
+variable "release_name" {
+  type        = string
+  description = "Helm release name."
+}
+
+variable "namespace" {
+  type        = string
+  description = "Kubernetes namespace to deploy into."
+}
+
+variable "chart_path" {
+  type        = string
+  description = "Path to the local llm-app chart."
+}
+
+variable "replicas" {
+  type        = number
+  default     = 1
+  description = "Pod count passed to the chart as replicaCount."
+
+  validation {
+    condition     = var.replicas >= 0 && floor(var.replicas) == var.replicas
+    error_message = "The replicas value must be a non-negative whole number."
+  }
+}
+
+variable "model_name" {
+  type        = string
+  description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
+}
+
+variable "model_alias" {
+  type        = string
+  description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
+}
+
+variable "max_model_len" {
+  type        = number
+  default     = 2048
+  description = "Passed to the chart as model.maxModelLen."
+
+  validation {
+    condition     = var.max_model_len >= 1 && floor(var.max_model_len) == var.max_model_len
+    error_message = "The max_model_len value must be a positive whole number."
+  }
+}
+
+variable "dtype" {
+  type        = string
+  default     = "bfloat16"
+  description = "Passed to the chart as model.dtype."
+}
+
+variable "omp_threads" {
+  type        = number
+  default     = 0
+  description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."
+
+  validation {
+    condition     = var.omp_threads >= 0 && floor(var.omp_threads) == var.omp_threads
+    error_message = "The omp_threads value must be a non-negative whole number (0 = autodetect)."
+  }
+}
+
+variable "extra_args" {
+  type        = list(string)
+  default     = []
+  description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
+}
+
+variable "resources" {
+  type = object({
+    requests = object({ cpu = string, memory = string })
+    limits   = object({ cpu = string, memory = string })
+  })
+  description = "CPU/memory requests and limits, passed to the chart as resources."
+}
+
+variable "ingress_host" {
+  type        = string
+  description = "Passed to the chart as ingress.host."
+}
+
+variable "ingress_class" {
+  type        = string
+  default     = "nginx"
+  description = "Passed to the chart as ingress.className."
+}
+
+variable "image_repository" {
+  type        = string
+  default     = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
+  description = "Passed to the chart as image.repository."
+}
+
+variable "image_tag" {
+  type        = string
+  default     = "latest"
+  description = "Used only when image_digest is empty."
+}
+
+variable "image_digest" {
+  type        = string
+  default     = ""
+  description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."
+
+  validation {
+    condition     = var.image_digest == "" || can(regex("^sha256:[0-9a-f]{64}$", var.image_digest))
+    error_message = "The image_digest value must be empty or of the form sha256:<64 lowercase hex characters>."
+  }
+}
+
+variable "service_monitor_release_label" {
+  type        = string
+  default     = "kube-prometheus-stack"
+  description = "Must match the release label the Prometheus Operator selects on."
+}
+
+variable "model_cache_size" {
+  type        = string
+  default     = "10Gi"
+  description = "Passed to the chart as modelCache.sizeLimit."
+}
+
+variable "hpa" {
+  type = object({
+    enabled              = bool
+    min_replicas         = number
+    max_replicas         = number
+    metric_name          = string
+    target_average_value = string
+  })
+  default = {
+    enabled              = false
+    min_replicas         = 1
+    max_replicas         = 3
+    metric_name          = "vllm:num_requests_running"
+    target_average_value = "500m"
+  }
+  description = "Settings for the optional HorizontalPodAutoscaler (Pods metric, AverageValue target). The autoscaler is created only when enabled is true."
+
+  validation {
+    # Checked only when the autoscaler is enabled, so a disabled placeholder is never rejected.
+    condition     = !var.hpa.enabled || var.hpa.min_replicas <= var.hpa.max_replicas
+    error_message = "When hpa.enabled is true, hpa.min_replicas must be less than or equal to hpa.max_replicas."
+  }
+}
-- 
cgit