diff options
Diffstat (limited to 'terraform/envs/prod/main.tf')
| -rw-r--r-- | terraform/envs/prod/main.tf | 70 |
1 file changed, 70 insertions, 0 deletions
diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf new file mode 100644 index 0000000..04db31d --- /dev/null +++ b/terraform/envs/prod/main.tf | |||
| @@ -0,0 +1,70 @@ | |||
# Kubernetes provider: targets the cluster/context selected by the local
# kubeconfig. pathexpand() resolves a leading "~", so var.kubeconfig may be
# given as "~/.kube/config".
provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}
| 5 | |||
# Helm provider: reuses the exact same kubeconfig/context as the kubernetes
# provider above so both providers always talk to the same cluster.
provider "helm" {
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}
| 12 | |||
locals {
  # Environment name, used to suffix resource names (e.g. namespace "llm-prod").
  env = "prod"
}
| 16 | |||
| 17 | module "llm" { | ||
| 18 | source = "../../modules/llm" | ||
| 19 | |||
| 20 | release_name = "llm" | ||
| 21 | namespace = "llm-${local.env}" | ||
| 22 | chart_path = var.chart_path | ||
| 23 | |||
| 24 | replicas = 1 | ||
| 25 | |||
| 26 | model_name = "Qwen/Qwen2.5-1.5B-Instruct" | ||
| 27 | model_alias = "Qwen2.5-1.5B-Instruct" | ||
| 28 | max_model_len = 4096 | ||
| 29 | dtype = "bfloat16" | ||
| 30 | omp_threads = 6 | ||
| 31 | |||
| 32 | resources = { | ||
| 33 | requests = { cpu = "2", memory = "4Gi" } | ||
| 34 | limits = { cpu = "6", memory = "8Gi" } | ||
| 35 | } | ||
| 36 | |||
| 37 | ingress_host = "llm.prod.localtest.me" | ||
| 38 | image_tag = "latest" | ||
| 39 | # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64. | ||
| 40 | # Per-arch digest — re-resolve on a different arch or after an upstream tag move. | ||
| 41 | # Dev intentionally runs on `:latest` so new fixes flow in without a PR. | ||
| 42 | image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e" | ||
| 43 | |||
| 44 | # Enable OpenAI tool-calling so the agent's function-call path works. | ||
| 45 | # Qwen 2.5 uses hermes-style tool parsing in vLLM. | ||
| 46 | extra_args = [ | ||
| 47 | "--enable-auto-tool-choice", | ||
| 48 | "--tool-call-parser", "hermes", | ||
| 49 | ] | ||
| 50 | |||
| 51 | hpa = { | ||
| 52 | enabled = true | ||
| 53 | min_replicas = 1 | ||
| 54 | max_replicas = 3 | ||
| 55 | # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight | ||
| 56 | # requests. Scale up when >50% of pods are actively serving. | ||
| 57 | metric_name = "vllm:num_requests_running" | ||
| 58 | target_average_value = "500m" | ||
| 59 | } | ||
| 60 | } | ||
| 61 | |||
| 62 | output "ingress_host" { value = module.llm.ingress_host } | ||
| 63 | output "service_dns" { value = module.llm.service_dns } | ||
| 64 | output "curl_example" { | ||
| 65 | value = <<-EOT | ||
| 66 | curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \ | ||
| 67 | -H 'Content-Type: application/json' \ | ||
| 68 | -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}' | ||
| 69 | EOT | ||
| 70 | } | ||
