From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: hehe

---
 terraform/envs/prod/main.tf | 70 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 terraform/envs/prod/main.tf

diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf
new file mode 100644
index 0000000..04db31d
--- /dev/null
+++ b/terraform/envs/prod/main.tf
@@ -0,0 +1,70 @@
+provider "kubernetes" {
+  config_path    = pathexpand(var.kubeconfig)
+  config_context = var.kube_context
+}
+
+provider "helm" {
+  kubernetes {
+    config_path    = pathexpand(var.kubeconfig)
+    config_context = var.kube_context
+  }
+}
+
+locals {
+  env = "prod"
+}
+
+module "llm" {
+  source = "../../modules/llm"
+
+  release_name = "llm"
+  namespace    = "llm-${local.env}"
+  chart_path   = var.chart_path
+
+  replicas = 1
+
+  model_name    = "Qwen/Qwen2.5-1.5B-Instruct"
+  model_alias   = "Qwen2.5-1.5B-Instruct"
+  max_model_len = 4096
+  dtype         = "bfloat16"
+  omp_threads   = 6
+
+  resources = {
+    requests = { cpu = "2", memory = "4Gi" }
+    limits   = { cpu = "6", memory = "8Gi" }
+  }
+
+  ingress_host = "llm.prod.localtest.me"
+  image_tag    = "latest"
+  # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64.
+  # Per-arch digest; re-resolve on a different arch or after an upstream tag move.
+  # Dev intentionally runs on `:latest` so new fixes flow in without a PR.
+  image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"
+
+  # Enable OpenAI tool calling so the agent's function-call path works.
+  # Qwen 2.5 uses hermes-style tool parsing in vLLM.
+  extra_args = [
+    "--enable-auto-tool-choice",
+    "--tool-call-parser", "hermes",
+  ]
+
+  hpa = {
+    enabled      = true
+    min_replicas = 1
+    max_replicas = 3
+    # vLLM exposes `vllm:num_requests_running`, a per-pod gauge of in-flight
+    # requests. Scale out when the per-pod average exceeds 0.5 (500m).
+    metric_name          = "vllm:num_requests_running"
+    target_average_value = "500m"
+  }
+}
+
+output "ingress_host" { value = module.llm.ingress_host }
+output "service_dns" { value = module.llm.service_dns }
+output "curl_example" {
+  value = <<-EOT
+    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
+      -H 'Content-Type: application/json' \
+      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
+  EOT
}
--
cgit
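
The module call references var.kubeconfig, var.kube_context, and var.chart_path, but this patch adds no variables.tf for the prod env. A minimal sketch of the assumed inputs, with names taken from the references above and types and defaults guessed, might look like:

variable "kubeconfig" {
  description = "Path to a kubeconfig file; pathexpand() resolves a leading ~."
  type        = string
  default     = "~/.kube/config"  # assumed default, not from the patch
}

variable "kube_context" {
  description = "kubeconfig context pointing at the prod cluster."
  type        = string
}

variable "chart_path" {
  description = "Local path to the Helm chart consumed by modules/llm."
  type        = string
}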
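
The hpa block only works if `vllm:num_requests_running` is reachable through the custom metrics API (typically via something like prometheus-adapter, which this patch does not set up). As a sketch of what modules/llm presumably renders from that map, assuming the chart creates a Deployment named llm and the metric name passes through the adapter unchanged, the equivalent standalone resource would be:

resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
  metadata {
    name      = "llm"
    namespace = "llm-prod"
  }

  spec {
    min_replicas = 1
    max_replicas = 3

    scale_target_ref {
      api_version = "apps/v1"
      kind        = "Deployment"
      name        = "llm"  # assumption: the chart names the Deployment after the release
    }

    metric {
      type = "Pods"
      pods {
        metric {
          name = "vllm:num_requests_running"
        }
        target {
          type          = "AverageValue"
          average_value = "500m"  # scale out above 0.5 in-flight requests per pod
        }
      }
    }
  }
}

Note that with a Pods metric and an AverageValue target, 500m is interpreted as 0.5 requests averaged across all pods, not a per-pod cap.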
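
Since extra_args turns on tool calling, the curl_example output can be extended into a tool-call smoke test. The get_weather function below is invented purely for illustration; with the hermes parser enabled, the response should carry a tool_calls entry instead of plain content:

curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "Qwen2.5-1.5B-Instruct",
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Look up current weather for a city",
        "parameters": {
          "type": "object",
          "properties": {"city": {"type": "string"}},
          "required": ["city"]
        }
      }
    }]
  }'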