From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: hehe

---
 terraform/envs/prod/main.tf | 70 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 terraform/envs/prod/main.tf

diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf
new file mode 100644
index 0000000..04db31d
--- /dev/null
+++ b/terraform/envs/prod/main.tf
@@ -0,0 +1,70 @@
+provider "kubernetes" {
+  config_path    = pathexpand(var.kubeconfig)
+  config_context = var.kube_context
+}
+
+provider "helm" {
+  kubernetes {
+    config_path    = pathexpand(var.kubeconfig)
+    config_context = var.kube_context
+  }
+}
+
+locals {
+  env = "prod"
+}
+
+module "llm" {
+  source = "../../modules/llm"
+
+  release_name = "llm"
+  namespace    = "llm-${local.env}"
+  chart_path   = var.chart_path
+
+  replicas = 1
+
+  model_name    = "Qwen/Qwen2.5-1.5B-Instruct"
+  model_alias   = "Qwen2.5-1.5B-Instruct"
+  max_model_len = 4096
+  dtype         = "bfloat16"
+  omp_threads   = 6
+
+  resources = {
+    requests = { cpu = "2", memory = "4Gi" }
+    limits   = { cpu = "6", memory = "8Gi" }
+  }
+
+  ingress_host = "llm.prod.localtest.me"
+  image_tag    = "latest"
+  # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64.
+  # Per-arch digest; re-resolve on a different arch or after an upstream tag move.
+  # Dev intentionally runs on `:latest` so new fixes flow in without a PR.
+  image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"
+
+  # Enable OpenAI tool calling so the agent's function-call path works.
+  # Qwen 2.5 uses hermes-style tool parsing in vLLM.
+  extra_args = [
+    "--enable-auto-tool-choice",
+    "--tool-call-parser", "hermes",
+  ]
+
+  hpa = {
+    enabled      = true
+    min_replicas = 1
+    max_replicas = 3
+    # vLLM exposes `vllm:num_requests_running`, a per-pod gauge of in-flight
+    # requests. Scale out when the per-pod average exceeds 0.5 (500m).
+    metric_name          = "vllm:num_requests_running"
+    target_average_value = "500m"
+  }
+}
+
+output "ingress_host" { value = module.llm.ingress_host }
+output "service_dns" { value = module.llm.service_dns }
+output "curl_example" {
+  value = <<-EOT
+    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
+      -H 'Content-Type: application/json' \
+      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
+  EOT
}
--
cgit
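
The module call references var.kubeconfig, var.kube_context, and var.chart_path, but this patch adds no variables.tf for the prod env. A minimal sketch of the assumed inputs, with names taken from the references above and types and defaults guessed, might look like:

variable "kubeconfig" {
  description = "Path to a kubeconfig file; pathexpand() resolves a leading ~."
  type        = string
  default     = "~/.kube/config"  # assumed default, not from the patch
}

variable "kube_context" {
  description = "kubeconfig context pointing at the prod cluster."
  type        = string
}

variable "chart_path" {
  description = "Local path to the Helm chart consumed by modules/llm."
  type        = string
}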
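
The hpa block only works if `vllm:num_requests_running` is reachable through the custom metrics API (typically via something like prometheus-adapter, which this patch does not set up). As a sketch of what modules/llm presumably renders from that map, assuming the chart creates a Deployment named llm and the metric name passes through the adapter unchanged, the equivalent standalone resource would be:

resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
  metadata {
    name      = "llm"
    namespace = "llm-prod"
  }

  spec {
    min_replicas = 1
    max_replicas = 3

    scale_target_ref {
      api_version = "apps/v1"
      kind        = "Deployment"
      name        = "llm"  # assumption: the chart names the Deployment after the release
    }

    metric {
      type = "Pods"
      pods {
        metric {
          name = "vllm:num_requests_running"
        }
        target {
          type          = "AverageValue"
          average_value = "500m"  # scale out above 0.5 in-flight requests per pod
        }
      }
    }
  }
}

Note that with a Pods metric and an AverageValue target, 500m is interpreted as 0.5 requests averaged across all pods, not a per-pod cap.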
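
Since extra_args turns on tool calling, the curl_example output can be extended into a tool-call smoke test. The get_weather function below is invented purely for illustration; with the hermes parser enabled, the response should carry a tool_calls entry instead of plain content:

curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "Qwen2.5-1.5B-Instruct",
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Look up current weather for a city",
        "parameters": {
          "type": "object",
          "properties": {"city": {"type": "string"}},
          "required": ["city"]
        }
      }
    }]
  }'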