summaryrefslogtreecommitdiff
path: root/terraform/envs/prod/main.tf
diff options
context:
space:
mode:
authorYour Name <you@example.com>2026-04-26 21:02:47 +0800
committerYour Name <you@example.com>2026-04-26 21:02:47 +0800
commitd3e770254de0bb301815ca87257c8b1a357d06c4 (patch)
tree358c814be2a06b9e2009905f14938243286b8d82 /terraform/envs/prod/main.tf
Diffstat (limited to 'terraform/envs/prod/main.tf')
-rw-r--r--terraform/envs/prod/main.tf70
1 files changed, 70 insertions, 0 deletions
diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf
new file mode 100644
index 0000000..04db31d
--- /dev/null
+++ b/terraform/envs/prod/main.tf
@@ -0,0 +1,70 @@
# Kubernetes provider — connects to whichever cluster/context the caller
# selects via variables (kubeconfig path is tilde-expanded).
provider "kubernetes" {
  config_context = var.kube_context
  config_path    = pathexpand(var.kubeconfig)
}
5
# Helm provider — reuses the exact same kubeconfig/context selection as the
# Kubernetes provider above so both providers always target one cluster.
provider "helm" {
  kubernetes {
    config_context = var.kube_context
    config_path    = pathexpand(var.kubeconfig)
  }
}
12
# Environment discriminator; interpolated into the release namespace below.
locals {
  env = "prod"
}
16
# Production deployment of the shared LLM chart module.
module "llm" {
  source = "../../modules/llm"

  # Helm release identity.
  release_name = "llm"
  namespace    = "llm-${local.env}"
  chart_path   = var.chart_path
  replicas     = 1

  # Model configuration served by vLLM.
  model_name    = "Qwen/Qwen2.5-1.5B-Instruct"
  model_alias   = "Qwen2.5-1.5B-Instruct"
  max_model_len = 4096
  dtype         = "bfloat16"
  omp_threads   = 6

  # Pod sizing: CPU-only serving with headroom up to 6 cores / 8Gi.
  resources = {
    requests = { cpu = "2", memory = "4Gi" }
    limits   = { cpu = "6", memory = "8Gi" }
  }

  ingress_host = "llm.prod.localtest.me"

  # Image pinning: the tag is informational; the content-addressable digest
  # below is the actual pin (resolved with scripts/resolve-digests.sh on
  # amd64 — digests are per-arch, so re-resolve on another arch or after an
  # upstream tag move). This differs from dev, which deliberately tracks
  # `:latest` without a digest so fixes flow in without a PR.
  image_tag    = "latest"
  image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"

  # Enable OpenAI-style tool calling; Qwen 2.5 uses hermes-format tool
  # parsing in vLLM, so the agent's function-call path works end to end.
  extra_args = [
    "--enable-auto-tool-choice",
    "--tool-call-parser", "hermes",
  ]

  # Autoscaling on vLLM's `vllm:num_requests_running` per-pod gauge of
  # in-flight requests: an average of 500m (0.5) per pod means we scale up
  # once more than half the pods are actively serving a request.
  hpa = {
    enabled              = true
    min_replicas         = 1
    max_replicas         = 3
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }
}
61
# Hostname the ingress routes to this release.
output "ingress_host" {
  value = module.llm.ingress_host
}

# In-cluster DNS name of the service.
output "service_dns" {
  value = module.llm.service_dns
}

# Ready-to-paste smoke-test request against the OpenAI-compatible endpoint.
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}