summaryrefslogtreecommitdiff
path: root/terraform/envs/prod/main.tf
diff options
context:
space:
mode:
authorYour Name <you@example.com>2026-04-26 21:02:47 +0800
committerYour Name <you@example.com>2026-04-26 21:02:47 +0800
commitd3e770254de0bb301815ca87257c8b1a357d06c4 (patch)
tree358c814be2a06b9e2009905f14938243286b8d82 /terraform/envs/prod/main.tf
Diffstat (limited to 'terraform/envs/prod/main.tf')
-rw-r--r--terraform/envs/prod/main.tf70
1 files changed, 70 insertions, 0 deletions
diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf
new file mode 100644
index 0000000..04db31d
--- /dev/null
+++ b/terraform/envs/prod/main.tf
@@ -0,0 +1,70 @@
# Kubernetes provider — connects to whichever cluster/context the caller
# selects via variables (kubeconfig path is tilde-expanded).
provider "kubernetes" {
  config_context = var.kube_context
  config_path    = pathexpand(var.kubeconfig)
}
5
# Helm provider — reuses the exact same kubeconfig/context selection as the
# Kubernetes provider above so both providers always target one cluster.
provider "helm" {
  kubernetes {
    config_context = var.kube_context
    config_path    = pathexpand(var.kubeconfig)
  }
}
12
# Environment discriminator; interpolated into the release namespace below.
locals {
  env = "prod"
}
16
# Production deployment of the shared LLM chart module.
module "llm" {
  source = "../../modules/llm"

  # Helm release identity.
  release_name = "llm"
  namespace    = "llm-${local.env}"
  chart_path   = var.chart_path
  replicas     = 1

  # Model configuration served by vLLM.
  model_name    = "Qwen/Qwen2.5-1.5B-Instruct"
  model_alias   = "Qwen2.5-1.5B-Instruct"
  max_model_len = 4096
  dtype         = "bfloat16"
  omp_threads   = 6

  # Pod sizing: CPU-only serving with headroom up to 6 cores / 8Gi.
  resources = {
    requests = { cpu = "2", memory = "4Gi" }
    limits   = { cpu = "6", memory = "8Gi" }
  }

  ingress_host = "llm.prod.localtest.me"

  # Image pinning: the tag is informational; the content-addressable digest
  # below is the actual pin (resolved with scripts/resolve-digests.sh on
  # amd64 — digests are per-arch, so re-resolve on another arch or after an
  # upstream tag move). This differs from dev, which deliberately tracks
  # `:latest` without a digest so fixes flow in without a PR.
  image_tag    = "latest"
  image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"

  # Enable OpenAI-style tool calling; Qwen 2.5 uses hermes-format tool
  # parsing in vLLM, so the agent's function-call path works end to end.
  extra_args = [
    "--enable-auto-tool-choice",
    "--tool-call-parser", "hermes",
  ]

  # Autoscaling on vLLM's `vllm:num_requests_running` per-pod gauge of
  # in-flight requests: an average of 500m (0.5) per pod means we scale up
  # once more than half the pods are actively serving a request.
  hpa = {
    enabled              = true
    min_replicas         = 1
    max_replicas         = 3
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }
}
61
# Hostname the ingress routes to this release.
output "ingress_host" {
  value = module.llm.ingress_host
}

# In-cluster DNS name of the service.
output "service_dns" {
  value = module.llm.service_dns
}

# Ready-to-paste smoke-test request against the OpenAI-compatible endpoint.
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}