blob: 04db31d02ff262102b1f9696dfe04876a4bac00a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
# Kubernetes provider: targets the cluster/context selected by the local kubeconfig.
provider "kubernetes" {
  # pathexpand resolves a leading "~", so var.kubeconfig may be "~/.kube/config".
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}
# Helm provider: configured with the same kubeconfig/context as the kubernetes
# provider above, so both operate against the same cluster.
provider "helm" {
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}
locals {
  # Environment name; interpolated into the namespace below ("llm-prod").
  env = "prod"
}
# Production instance of the shared LLM serving module (vLLM chart release).
module "llm" {
  source       = "../../modules/llm"
  release_name = "llm"
  namespace    = "llm-${local.env}" # -> "llm-prod"
  chart_path   = var.chart_path

  replicas      = 1
  model_name    = "Qwen/Qwen2.5-1.5B-Instruct" # model id the server loads
  model_alias   = "Qwen2.5-1.5B-Instruct"      # "model" name clients send (see curl_example output)
  max_model_len = 4096
  dtype         = "bfloat16"
  omp_threads   = 6

  resources = {
    requests = { cpu = "2", memory = "4Gi" }
    limits   = { cpu = "6", memory = "8Gi" }
  }

  ingress_host = "llm.prod.localtest.me"

  image_tag = "latest"
  # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64.
  # Per-arch digest — re-resolve on a different arch or after an upstream tag move.
  # NOTE(review): dev floats on `:latest`; prod pins this digest so deploys stay
  # reproducible even when the upstream tag moves.
  image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"

  # Enable OpenAI tool-calling so the agent's function-call path works.
  # Qwen 2.5 uses hermes-style tool parsing in vLLM.
  extra_args = [
    "--enable-auto-tool-choice",
    "--tool-call-parser", "hermes",
  ]

  hpa = {
    enabled      = true
    min_replicas = 1
    max_replicas = 3
    # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight
    # requests. "500m" = 0.5 requests averaged per pod, i.e. scale up when more
    # than half the pods are actively serving.
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }
}
# Hostname the release is exposed on through the ingress.
output "ingress_host" { value = module.llm.ingress_host }

# In-cluster DNS name of the service, for callers inside the cluster.
output "service_dns" { value = module.llm.service_dns }

# Copy-paste smoke test against the OpenAI-compatible chat endpoint.
# Uses the model_alias configured above; port 8080 is the ingress entry point
# — NOTE(review): confirm this matches the ingress controller's published port.
output "curl_example" {
  value = <<-EOT
  curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}
|