From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: hehe

---
 terraform/modules/llm/main.tf      | 114 ++++++++++++++++++++++++++++++++++
 terraform/modules/llm/outputs.tf   |  16 +++++
 terraform/modules/llm/variables.tf | 136 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 266 insertions(+)
 create mode 100644 terraform/modules/llm/main.tf
 create mode 100644 terraform/modules/llm/outputs.tf
 create mode 100644 terraform/modules/llm/variables.tf

(limited to 'terraform/modules/llm')

diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf
new file mode 100644
index 0000000..cd22019
--- /dev/null
+++ b/terraform/modules/llm/main.tf
@@ -0,0 +1,114 @@
+# Deploys the llm-app Helm chart into a dedicated namespace, with an optional
+# HorizontalPodAutoscaler driven by a per-pod custom metric.
+
+locals {
+  # Name shared by the HPA and its scale target. Assumes the chart names its
+  # Deployment "<release>-llm-app" -- TODO confirm against the chart templates.
+  app_name = "${var.release_name}-llm-app"
+}
+
+# Namespace that holds every object created by this module.
+resource "kubernetes_namespace_v1" "this" {
+  metadata {
+    name = var.namespace
+    labels = {
+      "app.kubernetes.io/part-of" = "llm-platform"
+    }
+  }
+}
+
+# Optional autoscaler; created only when var.hpa.enabled is true.
+# NOTE(review): the release below also sets replicaCount, so every apply may
+# reset the replica count the HPA chose -- confirm how the chart handles this.
+resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
+  count = var.hpa.enabled ? 1 : 0
+
+  metadata {
+    name      = local.app_name
+    namespace = kubernetes_namespace_v1.this.metadata[0].name
+  }
+  spec {
+    scale_target_ref {
+      api_version = "apps/v1"
+      kind        = "Deployment"
+      name        = local.app_name
+    }
+    min_replicas = var.hpa.min_replicas
+    max_replicas = var.hpa.max_replicas
+
+    metric {
+      type = "Pods"
+      pods {
+        metric {
+          name = var.hpa.metric_name
+        }
+        target {
+          type          = "AverageValue"
+          average_value = var.hpa.target_average_value
+        }
+      }
+    }
+  }
+
+  # Presumably the scale target Deployment is created by the Helm release.
+  depends_on = [helm_release.llm]
+}
+
+# Installs the chart; its values are rendered from the module inputs.
+resource "helm_release" "llm" {
+  name             = var.release_name
+  chart            = var.chart_path
+  namespace        = kubernetes_namespace_v1.this.metadata[0].name
+  create_namespace = false # the namespace is managed by kubernetes_namespace_v1.this
+  atomic           = false
+  wait             = true
+  timeout          = 1800
+
+  values = [
+    yamlencode({
+      replicaCount = var.replicas
+
+      image = {
+        repository = var.image_repository
+        tag        = var.image_tag
+        digest     = var.image_digest
+        pullPolicy = "IfNotPresent" # NOTE(review): a mutable tag such as the default "latest" may never be re-pulled
+      }
+
+      model = {
+        name        = var.model_name
+        alias       = var.model_alias
+        maxModelLen = var.max_model_len
+        dtype       = var.dtype
+      }
+
+      server = {
+        port       = 8000
+        ompThreads = var.omp_threads
+        extraArgs  = var.extra_args
+      }
+
+      resources = var.resources
+
+      ingress = {
+        enabled   = true
+        className = var.ingress_class
+        host      = var.ingress_host
+      }
+
+      monitoring = {
+        serviceMonitor = {
+          enabled  = true
+          interval = "15s"
+          labels = {
+            release = var.service_monitor_release_label
+          }
+        }
+      }
+
+      modelCache = {
+        sizeLimit = var.model_cache_size
+      }
+    }),
+  ]
+}
diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf
new file mode 100644
index 0000000..a953e73
--- /dev/null
+++ b/terraform/modules/llm/outputs.tf
@@ -0,0 +1,16 @@
+output "service_dns" {
+  # Built from the namespace resource (not var.namespace), matching the
+  # "namespace" output, so this value carries a dependency on the namespace.
+  value       = "${local.app_name}.${kubernetes_namespace_v1.this.metadata[0].name}.svc.cluster.local"
+  description = "In-cluster DNS name for the LLM Service."
+}
+
+output "ingress_host" {
+  value       = var.ingress_host
+  description = "Hostname passed to the chart as ingress.host (echoes var.ingress_host)."
+}
+
+output "namespace" {
+  value       = kubernetes_namespace_v1.this.metadata[0].name
+  description = "Name of the namespace created by this module."
+}
diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf
new file mode 100644
index 0000000..3a7d8f7
--- /dev/null
+++ b/terraform/modules/llm/variables.tf
@@ -0,0 +1,136 @@
+variable "release_name" {
+  type        = string
+  description = "Helm release name."
+}
+
+variable "namespace" {
+  type        = string
+  description = "Kubernetes namespace to deploy into."
+}
+
+variable "chart_path" {
+  type        = string
+  description = "Path to the local llm-app chart."
+}
+
+variable "replicas" {
+  type        = number
+  default     = 1
+  description = "Replica count, passed to the chart as replicaCount."
+
+  validation {
+    condition     = var.replicas >= 0 && floor(var.replicas) == var.replicas
+    error_message = "The replicas value must be a non-negative whole number."
+  }
+}
+
+variable "model_name" {
+  type        = string
+  description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
+}
+
+variable "model_alias" {
+  type        = string
+  description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
+}
+
+variable "max_model_len" {
+  type        = number
+  default     = 2048
+  description = "Passed to the chart as model.maxModelLen."
+}
+
+variable "dtype" {
+  type        = string
+  default     = "bfloat16"
+  description = "Passed to the chart as model.dtype."
+}
+
+variable "omp_threads" {
+  type        = number
+  default     = 0
+  description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."
+}
+
+variable "extra_args" {
+  type        = list(string)
+  default     = []
+  description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
+}
+
+variable "resources" {
+  type = object({
+    requests = object({ cpu = string, memory = string })
+    limits   = object({ cpu = string, memory = string })
+  })
+  description = "CPU and memory requests and limits, passed to the chart as resources."
+}
+
+variable "ingress_host" {
+  type        = string
+  description = "Hostname passed to the chart as ingress.host."
+}
+
+variable "ingress_class" {
+  type        = string
+  default     = "nginx"
+  description = "Ingress class name passed to the chart as ingress.className."
+}
+
+variable "image_repository" {
+  type        = string
+  default     = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
+  description = "Image repository passed to the chart as image.repository."
+}
+
+variable "image_tag" {
+  type        = string
+  default     = "latest"
+  description = "Used only when image_digest is empty."
+}
+
+variable "image_digest" {
+  type        = string
+  default     = ""
+  description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."
+
+  validation {
+    condition     = var.image_digest == "" || can(regex("^sha256:[0-9a-f]{64}$", var.image_digest))
+    error_message = "The image_digest value must be empty or of the form sha256:<64 lowercase hex characters>."
+  }
+}
+
+variable "service_monitor_release_label" {
+  type        = string
+  default     = "kube-prometheus-stack"
+  description = "Must match the release label the Prometheus Operator selects on."
+}
+
+variable "model_cache_size" {
+  type        = string
+  default     = "10Gi"
+  description = "Passed to the chart as modelCache.sizeLimit."
+}
+
+variable "hpa" {
+  type = object({
+    enabled              = bool
+    min_replicas         = number
+    max_replicas         = number
+    metric_name          = string
+    target_average_value = string
+  })
+  default = {
+    enabled              = false
+    min_replicas         = 1
+    max_replicas         = 3
+    metric_name          = "vllm:num_requests_running"
+    target_average_value = "500m"
+  }
+  description = "HorizontalPodAutoscaler settings; the HPA is created only when enabled is true."
+
+  validation {
+    condition     = !var.hpa.enabled || (var.hpa.min_replicas >= 1 && var.hpa.min_replicas <= var.hpa.max_replicas)
+    error_message = "When hpa.enabled is true, min_replicas must be at least 1 and no greater than max_replicas."
+  }
+}
-- 
cgit