provider "kubernetes" { config_path = pathexpand(var.kubeconfig) config_context = var.kube_context } provider "helm" { kubernetes { config_path = pathexpand(var.kubeconfig) config_context = var.kube_context } } locals { env = "prod" } module "llm" { source = "../../modules/llm" release_name = "llm" namespace = "llm-${local.env}" chart_path = var.chart_path replicas = 1 model_name = "Qwen/Qwen2.5-1.5B-Instruct" model_alias = "Qwen2.5-1.5B-Instruct" max_model_len = 4096 dtype = "bfloat16" omp_threads = 6 resources = { requests = { cpu = "2", memory = "4Gi" } limits = { cpu = "6", memory = "8Gi" } } ingress_host = "llm.prod.localtest.me" image_tag = "latest" # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64. # Per-arch digest — re-resolve on a different arch or after an upstream tag move. # Dev intentionally runs on `:latest` so new fixes flow in without a PR. image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e" # Enable OpenAI tool-calling so the agent's function-call path works. # Qwen 2.5 uses hermes-style tool parsing in vLLM. extra_args = [ "--enable-auto-tool-choice", "--tool-call-parser", "hermes", ] hpa = { enabled = true min_replicas = 1 max_replicas = 3 # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight # requests. Scale up when >50% of pods are actively serving. metric_name = "vllm:num_requests_running" target_average_value = "500m" } } output "ingress_host" { value = module.llm.ingress_host } output "service_dns" { value = module.llm.service_dns } output "curl_example" { value = <<-EOT curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \ -H 'Content-Type: application/json' \ -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}' EOT }