-rw-r--r--  .gitignore                                        29
-rw-r--r--  Makefile                                          78
-rw-r--r--  README.txt                                        90
-rw-r--r--  agent/Dockerfile                                   6
-rw-r--r--  agent/agent.py                                   162
-rw-r--r--  charts/llm-app/Chart.yaml                          6
-rw-r--r--  charts/llm-app/templates/_helpers.tpl              8
-rw-r--r--  charts/llm-app/templates/deployment.yaml          76
-rw-r--r--  charts/llm-app/templates/ingress.yaml             19
-rw-r--r--  charts/llm-app/templates/service.yaml             13
-rw-r--r--  charts/llm-app/templates/servicemonitor.yaml      19
-rw-r--r--  charts/llm-app/templates/smoketest-job.yaml       32
-rw-r--r--  charts/llm-app/values.yaml                        51
-rw-r--r--  cluster/kind-config.yaml                          21
-rw-r--r--  goals                                             18
-rwxr-xr-x  scripts/resolve-digests.sh                        31
-rw-r--r--  terraform/envs/agent/.terraform.lock.hcl          19
-rw-r--r--  terraform/envs/agent/backend.tf                    5
-rw-r--r--  terraform/envs/agent/main.tf                      27
-rw-r--r--  terraform/envs/agent/variables.tf                 14
-rw-r--r--  terraform/envs/agent/versions.tf                   6
-rw-r--r--  terraform/envs/bootstrap/.terraform.lock.hcl      37
-rw-r--r--  terraform/envs/bootstrap/backend.tf                5
-rw-r--r--  terraform/envs/bootstrap/main.tf                  25
-rw-r--r--  terraform/envs/bootstrap/variables.tf             15
-rw-r--r--  terraform/envs/bootstrap/versions.tf               7
-rw-r--r--  terraform/envs/dev/.terraform.lock.hcl            54
-rw-r--r--  terraform/envs/dev/backend.tf                      5
-rw-r--r--  terraform/envs/dev/main.tf                        49
-rw-r--r--  terraform/envs/dev/variables.tf                   14
-rw-r--r--  terraform/envs/dev/versions.tf                     8
-rw-r--r--  terraform/envs/prod/.terraform.lock.hcl           54
-rw-r--r--  terraform/envs/prod/backend.tf                     5
-rw-r--r--  terraform/envs/prod/main.tf                       70
-rw-r--r--  terraform/envs/prod/variables.tf                  14
-rw-r--r--  terraform/envs/prod/versions.tf                    8
-rw-r--r--  terraform/modules/agent/main.tf                  114
-rw-r--r--  terraform/modules/agent/outputs.tf                11
-rw-r--r--  terraform/modules/agent/variables.tf              33
-rw-r--r--  terraform/modules/agent/versions.tf                5
-rw-r--r--  terraform/modules/llm/main.tf                     99
-rw-r--r--  terraform/modules/llm/outputs.tf                  12
-rw-r--r--  terraform/modules/llm/variables.tf               112
-rw-r--r--  terraform/modules/observability/main.tf          156
-rw-r--r--  terraform/modules/observability/outputs.tf        11
-rw-r--r--  terraform/modules/observability/variables.tf      27
-rwxr-xr-x  tests/smoke.sh                                    38
47 files changed, 1718 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..747a966
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,29 @@
1# Terraform
2.terraform/
3terraform.tfstate
4terraform.tfstate.*
5*.tfvars
6*.tfvars.json
7crash.log
8crash.*.log
9override.tf
10override.tf.json
11*_override.tf
12*_override.tf.json
13
14# Python
15__pycache__/
16*.py[cod]
17*.egg-info/
18.pytest_cache/
19.venv/
20venv/
21
22# OS
23.DS_Store
24Thumbs.db
25
26# Editors
27.idea/
28.vscode/
29*.swp
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..15c0031
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,78 @@
1REPO_ROOT := $(abspath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
2CHART_PATH := $(REPO_ROOT)/charts/llm-app
3AGENT_SRC := $(REPO_ROOT)/agent/agent.py
4AGENT_IMG := localhost/agent:0.1.0
5CLUSTER := llm-local
6CONTEXT := kind-$(CLUSTER)
7
8export KIND_EXPERIMENTAL_PROVIDER=podman
9
10.PHONY: help
11help:
12 @echo "Targets:"
13 @echo " up-dev deploy dev LLM (Qwen2.5-0.5B, 2 replicas)"
14 @echo " up-prod deploy prod LLM (Qwen2.5-1.5B, 1 replica + HPA 1->3)"
15 @echo " up-agent up-prod + tool-using agent"
16 @echo " ask Q='...' POST a question to the agent"
17 @echo " down destroy everything + delete kind cluster"
18 @echo ""
19 @echo "URLs (after up-dev/up-prod):"
20 @echo " Grafana http://grafana.localtest.me:8080 (admin/admin)"
21 @echo " curl -f http://grafana.localtest.me:8080/api/health"
22 @echo " Prometheus http://prom.localtest.me:8080"
23 @echo " curl -f http://prom.localtest.me:8080/-/healthy"
24 @echo ""
25
26.PHONY: up-dev
27up-dev:
28 @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml
29 cd $(REPO_ROOT)/terraform/envs/bootstrap && \
30 tofu init -upgrade && \
31 tofu apply -auto-approve \
32 -var kube_context=$(CONTEXT)
33 cd $(REPO_ROOT)/terraform/envs/dev && \
34 tofu init -upgrade && \
35 tofu apply -auto-approve \
36 -var kube_context=$(CONTEXT) \
37 -var chart_path=$(CHART_PATH)
38
39.PHONY: up-prod
40up-prod:
41 @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml
42 cd $(REPO_ROOT)/terraform/envs/bootstrap && \
43 tofu init -upgrade && \
44 tofu apply -auto-approve \
45 -var kube_context=$(CONTEXT)
46 cd $(REPO_ROOT)/terraform/envs/prod && \
47 tofu init -upgrade && \
48 tofu apply -auto-approve \
49 -var kube_context=$(CONTEXT) \
50 -var chart_path=$(CHART_PATH)
51
52.PHONY: up-agent
53up-agent: up-prod
54 podman build -t $(AGENT_IMG) $(REPO_ROOT)/agent/
55 @tmp=$$(mktemp -t agent-XXXXXX.tar); \
56 podman save $(AGENT_IMG) -o $$tmp && \
57 kind load image-archive $$tmp --name $(CLUSTER) && \
58 rm -f $$tmp
59 cd $(REPO_ROOT)/terraform/envs/agent && \
60 tofu init -upgrade && \
61 tofu apply -auto-approve \
62 -var kube_context=$(CONTEXT) \
63 -var agent_source_path=$(AGENT_SRC)
64
65.PHONY: ask
66ask:
67 @if [ -z "$(Q)" ]; then echo "usage: make ask Q='what is 17*23?'"; exit 1; fi
68 curl -s http://agent.localtest.me:8080/ask \
69 -H 'Content-Type: application/json' \
70 -d "$(shell printf '{"question":"%s"}' "$(Q)")" | python3 -m json.tool
71
72.PHONY: down
73down:
74 -cd $(REPO_ROOT)/terraform/envs/agent && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var agent_source_path=$(AGENT_SRC) || true
75 -cd $(REPO_ROOT)/terraform/envs/prod && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true
76 -cd $(REPO_ROOT)/terraform/envs/dev && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true
77 -cd $(REPO_ROOT)/terraform/envs/bootstrap && tofu destroy -auto-approve -var kube_context=$(CONTEXT) || true
78 KIND_EXPERIMENTAL_PROVIDER=podman kind delete cluster --name $(CLUSTER)
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..4ce7a8e
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,90 @@
=============================================================================
 Local K8s LLM demo — kind + OpenTofu + vLLM
=============================================================================

Prerequisites (Fedora):
sudo dnf install -y podman git make jq curl tar

# kind v0.31.0 (node image kindest/node:v1.35.0, pinned by digest in
# cluster/kind-config.yaml)

# kubectl v1.36.0
curl -fsSLo /tmp/kubectl \
  https://dl.k8s.io/release/v1.36.0/bin/linux/amd64/kubectl
sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl

# Helm 4.1.4 (Helm chart versions are pinned in
# terraform/modules/observability/variables.tf)

# OpenTofu 1.11.6 (provider constraints in terraform/envs/{dev,prod,bootstrap}/versions.tf)

# kind runs each k8s "node" as a long-lived podman container. Podman's default
# pids_limit = 2048 makes ingress-nginx hit pthread EAGAIN once the control
# plane warms up. Raise the limit once (0 = unlimited), then restart podman:
sudo mkdir -p /etc/containers/containers.conf.d
printf '[containers]\npids_limit = 0\n' \
  | sudo tee /etc/containers/containers.conf.d/99-kind-pids.conf
sudo systemctl restart podman.socket podman 2>/dev/null || true
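
# Optional sanity check once the cluster is up (kind names the node container
# <cluster>-control-plane, and podman inspect should expose the limit under
# HostConfig.PidsLimit):
$ podman inspect -f '{{.HostConfig.PidsLimit}}' llm-local-control-plane   # expect 0 (unlimited)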

make help   # lists all targets plus copy-pasteable URLs and health checks

# Expose an OpenAI-compatible endpoint through a K8s Service / Ingress, with a
# curl example proving it works.

dev: http://llm.dev.localtest.me:8080
$ curl http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq

prod: http://llm.prod.localtest.me:8080
$ curl http://llm.prod.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq
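
# Both return the standard OpenAI chat-completion shape; roughly (abridged,
# values will differ):
{"object":"chat.completion",
 "choices":[{"index":0,
             "message":{"role":"assistant","content":"Hi! How can I help?"},
             "finish_reason":"length"}],
 "usage":{"prompt_tokens":..., "completion_tokens":..., "total_tokens":...}}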

# Observability: scrape /metrics from the inference pods with Prometheus; the
# queries below cover request latency and CPU utilization.

# Fire 10 chat requests against dev to populate the metrics:
$ for i in {1..10}; do curl -s http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' >/dev/null & done; wait

# Raw /metrics (vLLM exposes these natively):
$ curl -s http://llm.dev.localtest.me:8080/metrics | grep '^vllm:' | head

# Request latency p95 (seconds), via Prometheus:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(vllm:e2e_request_latency_seconds_bucket[5m])))' | jq .data.result

# CPU cores in use per vLLM pod (CPU-only inference, no GPU on this stack):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="llm-dev",pod=~"llm-llm-app.*",container!="",container!="POD"}[5m]))' | jq .data.result

# In-flight requests per pod (the same metric the prod HPA scales on):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (namespace, pod) (vllm:num_requests_running)' | jq .data.result
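
# Token throughput is another useful signal. Assuming your vLLM build exposes
# the vllm:generation_tokens_total counter (check the /metrics output above),
# output tokens/sec per pod is:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(vllm:generation_tokens_total[5m]))' | jq .data.result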

# Stretch 1: agent
# agent/agent.py runs in its own container and uses the prod backend to
# calculate the product of two numbers via tool calls.
$ curl http://agent.localtest.me:8080/ask -H 'Content-Type: application/json' -d '{"question":"what is 17 * 23?"}'   # requires `make up-agent` first
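
# The response shape comes straight from agent.py: a final answer plus a
# transcript of the tool-call steps, e.g. (abridged, wording will vary):
{"answer":"17 * 23 = 391","steps":2,
 "transcript":[{"step":1,"content":"","tool_calls":[{"name":"calc","arguments":"{\"expression\": \"17 * 23\"}"}]},
               {"tool_result":{"name":"calc","result":"391"}},
               {"step":2,"content":"17 * 23 = 391","tool_calls":[]}]}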

# Stretch 2: HPA
# Horizontal pod autoscaling on total in-flight requests, up to 3 pods.
# Terminal 1: generate sustained load.
$ (trap 'kill 0' INT; for i in {1..5}; do \
    curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":32}' \
      >/dev/null &
    done; wait)
# Terminal 2: watch the HPA react (`watch -n1 kubectl -n llm-prod get hpa`
# refreshes faster than the event-driven `-w`).
$ kubectl -n llm-prod get hpa -w
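
# Under load, TARGETS climbs past the 500m in-flight-per-pod average and
# REPLICAS steps toward 3. Illustrative output (columns abridged):
NAME          REFERENCE                TARGETS      MINPODS   MAXPODS   REPLICAS
llm-llm-app   Deployment/llm-llm-app   1500m/500m   1         3         3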

# Stretch 3: image pinning
# prod pins the vLLM image as repo@sha256:<digest>, resolved via
# scripts/resolve-digests.sh; see terraform/envs/prod/main.tf. dev deliberately
# tracks :latest. The chart prefers the digest over the tag when both are set.
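
# To refresh the pin (the digest is per-arch, so re-resolve after an upstream
# tag move or on a different arch):
$ scripts/resolve-digests.sh public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
repo: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
digest: sha256:...
pin: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo@sha256:...
# then paste the digest into the image_digest argument in terraform/envs/prod/main.tf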

# Stretch 4: smoke test
# charts/llm-app/templates/smoketest-job.yaml re-runs as a Helm hook after
# every install or upgrade. It only asserts that /v1/models lists the model
# and that a chat completion returns a "content" field; it makes no assertion
# about the generated text itself, since that part is stochastic.
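
# The same checks can be run from outside the cluster via tests/smoke.sh:
$ ENDPOINT=http://llm.prod.localtest.me:8080 MODEL=Qwen2.5-1.5B-Instruct ./tests/smoke.sh
# and the in-cluster hook's result is visible with:
$ kubectl -n llm-prod get jobs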

All stretch goals are covered except the Atlantis / GitOps config, which was
skipped: it only becomes meaningful with a local Atlantis install or a
GitHub-hosted repo.
diff --git a/agent/Dockerfile b/agent/Dockerfile
new file mode 100644
index 0000000..509c3b6
--- /dev/null
+++ b/agent/Dockerfile
@@ -0,0 +1,6 @@
1FROM python:3.12-slim
2RUN pip install --no-cache-dir 'openai>=1.59.2,<2' 'httpx<0.28'
3WORKDIR /app
4COPY agent.py /app/agent.py
5EXPOSE 8001
6CMD ["python", "/app/agent.py"]
diff --git a/agent/agent.py b/agent/agent.py
new file mode 100644
index 0000000..12ad9d6
--- /dev/null
+++ b/agent/agent.py
@@ -0,0 +1,162 @@
1"""Tool-using agent over an OpenAI-compatible backend.
2
3Uses the standard OpenAI tools API (function calling). vLLM maps this to the
4model's native tool-call template (Qwen here), so small models follow the
5protocol much more reliably than a hand-rolled text convention.
6
7POST /ask {"question": "..."} -> {"answer": "...", "transcript": [...]}
8GET /health -> "ok"
9"""
10import json
11import os
12import re
13from http.server import BaseHTTPRequestHandler, HTTPServer
14
15from openai import OpenAI
16
17client = OpenAI(
18 base_url=os.environ["OPENAI_BASE_URL"],
19 api_key=os.environ.get("OPENAI_API_KEY", "sk-local"),
20)
21MODEL = os.environ.get("MODEL", "Qwen2.5-1.5B-Instruct")
22MAX_STEPS = int(os.environ.get("MAX_STEPS", "6"))
23
24SYSTEM = (
25 "You are a careful math assistant. When the user asks any arithmetic question, "
26 "call the 'calc' tool with the exact expression. Do not compute arithmetic in your head. "
27 "After you receive the tool result, give a concise final answer."
28)
29
30TOOLS = [
31 {
32 "type": "function",
33 "function": {
34 "name": "calc",
35 "description": "Evaluate a safe arithmetic expression and return the numeric result.",
36 "parameters": {
37 "type": "object",
38 "properties": {
39 "expression": {
40 "type": "string",
41 "description": "Arithmetic expression using only digits, spaces, and + - * / . ( )",
42 }
43 },
44 "required": ["expression"],
45 },
46 },
47 }
48]
49
50SAFE_EXPR = re.compile(r"^[\d\s+\-*/().]+$")
51
52
53def calc(expression: str) -> str:
54 if not SAFE_EXPR.fullmatch(expression):
55 return "ERROR: disallowed characters"
56 try:
57 return str(eval(expression, {"__builtins__": {}}, {})) # noqa: S307
58 except Exception as e:
59 return f"ERROR: {e}"
60
61
62def run_agent(question: str) -> dict:
63 messages = [
64 {"role": "system", "content": SYSTEM},
65 {"role": "user", "content": question},
66 ]
67 transcript: list = []
68
69 for step in range(MAX_STEPS):
70 resp = client.chat.completions.create(
71 model=MODEL,
72 messages=messages,
73 tools=TOOLS,
74 tool_choice="auto",
75 temperature=0.0,
76 max_tokens=256,
77 )
78 msg = resp.choices[0].message
79
80 # Always append the assistant message (with any tool_calls) to history.
81 assistant_entry = {"role": "assistant", "content": msg.content or ""}
82 if msg.tool_calls:
83 assistant_entry["tool_calls"] = [
84 {
85 "id": tc.id,
86 "type": "function",
87 "function": {"name": tc.function.name, "arguments": tc.function.arguments},
88 }
89 for tc in msg.tool_calls
90 ]
91 messages.append(assistant_entry)
92
93 transcript.append(
94 {
95 "step": step + 1,
96 "content": msg.content,
97 "tool_calls": [
98 {"name": tc.function.name, "arguments": tc.function.arguments}
99 for tc in (msg.tool_calls or [])
100 ],
101 }
102 )
103
104 if msg.tool_calls:
105 for tc in msg.tool_calls:
106 if tc.function.name != "calc":
107 result = f"ERROR: unknown tool {tc.function.name}"
108 else:
109 try:
110 args = json.loads(tc.function.arguments)
111 except json.JSONDecodeError:
112 result = "ERROR: bad JSON arguments"
113 else:
114 result = calc(args.get("expression", ""))
115 transcript.append({"tool_result": {"name": tc.function.name, "result": result}})
116 messages.append(
117 {"role": "tool", "tool_call_id": tc.id, "content": result}
118 )
119 continue
120
121 # No tool call -> model produced a final answer.
122 return {"answer": (msg.content or "").strip(), "steps": step + 1, "transcript": transcript}
123
124 return {"answer": None, "steps": MAX_STEPS, "note": "MAX_STEPS reached", "transcript": transcript}
125
126
127class Handler(BaseHTTPRequestHandler):
128 def do_POST(self): # noqa: N802
129 if self.path != "/ask":
130 self.send_response(404); self.end_headers(); return
131 n = int(self.headers.get("Content-Length", "0"))
132 try:
133 body = json.loads(self.rfile.read(n) or b"{}")
134 except json.JSONDecodeError:
135 self.send_response(400); self.end_headers(); self.wfile.write(b'{"error":"invalid json"}'); return
136 q = body.get("question", "")
137 try:
138 result = run_agent(q)
139 code = 200
140 except Exception as e:
141 result = {"error": str(e), "type": type(e).__name__}
142 code = 500
143 payload = json.dumps(result).encode()
144 self.send_response(code)
145 self.send_header("Content-Type", "application/json")
146 self.send_header("Content-Length", str(len(payload)))
147 self.end_headers()
148 self.wfile.write(payload)
149
150 def do_GET(self): # noqa: N802
151 if self.path == "/health":
152 self.send_response(200); self.end_headers(); self.wfile.write(b"ok"); return
153 self.send_response(404); self.end_headers()
154
155 def log_message(self, fmt, *args):
156 import sys
157 print(f"{self.address_string()} {fmt % args}", file=sys.stderr)
158
159
160if __name__ == "__main__":
161 print(f"agent starting on :8001, model={MODEL}, backend={os.environ['OPENAI_BASE_URL']}")
162 HTTPServer(("0.0.0.0", 8001), Handler).serve_forever()
diff --git a/charts/llm-app/Chart.yaml b/charts/llm-app/Chart.yaml
new file mode 100644
index 0000000..e0747df
--- /dev/null
+++ b/charts/llm-app/Chart.yaml
@@ -0,0 +1,6 @@
1apiVersion: v2
2name: llm-app
3description: OpenAI-compatible LLM server (vLLM CPU) with Prometheus metrics
4type: application
5version: 0.1.0
6appVersion: "latest"
diff --git a/charts/llm-app/templates/_helpers.tpl b/charts/llm-app/templates/_helpers.tpl
new file mode 100644
index 0000000..8b104de
--- /dev/null
+++ b/charts/llm-app/templates/_helpers.tpl
@@ -0,0 +1,8 @@
1{{- define "llm-app.fullname" -}}
2{{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}}
3{{- end -}}
4
5{{- define "llm-app.selectorLabels" -}}
6app.kubernetes.io/name: {{ .Chart.Name }}
7app.kubernetes.io/instance: {{ .Release.Name }}
8{{- end -}}
diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml
new file mode 100644
index 0000000..12677b5
--- /dev/null
+++ b/charts/llm-app/templates/deployment.yaml
@@ -0,0 +1,76 @@
1apiVersion: apps/v1
2kind: Deployment
3metadata:
4 name: {{ include "llm-app.fullname" . }}
5spec:
6 replicas: {{ .Values.replicaCount }}
7 selector:
8 matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
9 template:
10 metadata:
11 labels: {{- include "llm-app.selectorLabels" . | nindent 8 }}
12 spec:
13 containers:
14 - name: vllm-server
15 # Image entrypoint is already `vllm serve`; args start with the model tag.
16 image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
17 imagePullPolicy: {{ .Values.image.pullPolicy }}
18 args:
19 - {{ .Values.model.name | quote }}
20 - "--host"
21 - "0.0.0.0"
22 - "--port"
23 - {{ .Values.server.port | quote }}
24 - "--served-model-name"
25 - {{ .Values.model.alias | quote }}
26 - "--max-model-len"
27 - {{ .Values.model.maxModelLen | quote }}
28 - "--dtype"
29 - {{ .Values.model.dtype | quote }}
30 {{- with .Values.server.extraArgs }}
31 {{- toYaml . | nindent 12 }}
32 {{- end }}
33 env:
34 - name: HF_HOME
35 value: /cache/huggingface
36 - name: VLLM_CPU_KVCACHE_SPACE
37 value: "2"
38 {{- if gt (int .Values.server.ompThreads) 0 }}
39 - name: OMP_NUM_THREADS
40 value: {{ .Values.server.ompThreads | quote }}
41 {{- end }}
42 ports:
43 - name: http
44 containerPort: {{ .Values.server.port }}
45 protocol: TCP
46 readinessProbe:
47 httpGet:
48 path: /health
49 port: http
50 # vLLM CPU cold-start is ~2 min + HF download on first boot.
51 initialDelaySeconds: 60
52 periodSeconds: 10
53 timeoutSeconds: 5
54 failureThreshold: 180
55 livenessProbe:
56 httpGet:
57 path: /health
58 port: http
59 initialDelaySeconds: 600
60 periodSeconds: 30
61 timeoutSeconds: 5
62 failureThreshold: 6
63 resources: {{- toYaml .Values.resources | nindent 12 }}
64 volumeMounts:
65 - name: cache
66 mountPath: /cache
67 - name: shm
68 mountPath: /dev/shm
69 volumes:
70 - name: cache
71 emptyDir:
72 sizeLimit: {{ .Values.modelCache.sizeLimit }}
73 - name: shm
74 emptyDir:
75 medium: Memory
76 sizeLimit: 1Gi
diff --git a/charts/llm-app/templates/ingress.yaml b/charts/llm-app/templates/ingress.yaml
new file mode 100644
index 0000000..f3a6ded
--- /dev/null
+++ b/charts/llm-app/templates/ingress.yaml
@@ -0,0 +1,19 @@
1{{- if .Values.ingress.enabled -}}
2apiVersion: networking.k8s.io/v1
3kind: Ingress
4metadata:
5 name: {{ include "llm-app.fullname" . }}
6spec:
7 ingressClassName: {{ .Values.ingress.className }}
8 rules:
9 - host: {{ .Values.ingress.host | quote }}
10 http:
11 paths:
12 - path: /
13 pathType: Prefix
14 backend:
15 service:
16 name: {{ include "llm-app.fullname" . }}
17 port:
18 number: {{ .Values.service.port }}
19{{- end }}
diff --git a/charts/llm-app/templates/service.yaml b/charts/llm-app/templates/service.yaml
new file mode 100644
index 0000000..6350996
--- /dev/null
+++ b/charts/llm-app/templates/service.yaml
@@ -0,0 +1,13 @@
1apiVersion: v1
2kind: Service
3metadata:
4 name: {{ include "llm-app.fullname" . }}
5 labels: {{- include "llm-app.selectorLabels" . | nindent 4 }}
6spec:
7 type: {{ .Values.service.type }}
8 ports:
9 - name: http
10 port: {{ .Values.service.port }}
11 targetPort: http
12 protocol: TCP
13 selector: {{- include "llm-app.selectorLabels" . | nindent 4 }}
diff --git a/charts/llm-app/templates/servicemonitor.yaml b/charts/llm-app/templates/servicemonitor.yaml
new file mode 100644
index 0000000..264e766
--- /dev/null
+++ b/charts/llm-app/templates/servicemonitor.yaml
@@ -0,0 +1,19 @@
1{{- if .Values.monitoring.serviceMonitor.enabled -}}
2apiVersion: monitoring.coreos.com/v1
3kind: ServiceMonitor
4metadata:
5 name: {{ include "llm-app.fullname" . }}
6 {{- with .Values.monitoring.serviceMonitor.labels }}
7 labels: {{- toYaml . | nindent 4 }}
8 {{- end }}
9spec:
10 selector:
11 matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
12 endpoints:
13 - port: http
14 path: /metrics
15 interval: {{ .Values.monitoring.serviceMonitor.interval }}
16 namespaceSelector:
17 matchNames:
18 - {{ .Release.Namespace }}
19{{- end }}
diff --git a/charts/llm-app/templates/smoketest-job.yaml b/charts/llm-app/templates/smoketest-job.yaml
new file mode 100644
index 0000000..ac97f33
--- /dev/null
+++ b/charts/llm-app/templates/smoketest-job.yaml
@@ -0,0 +1,32 @@
1apiVersion: batch/v1
2kind: Job
3metadata:
4 name: {{ include "llm-app.fullname" . }}-smoketest
5 annotations:
6 "helm.sh/hook": post-install,post-upgrade
7 "helm.sh/hook-weight": "10"
8 "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
9spec:
10 backoffLimit: 2
11 activeDeadlineSeconds: 240
12 ttlSecondsAfterFinished: 600
13 template:
14 spec:
15 restartPolicy: Never
16 containers:
17 - name: curl
18 image: curlimages/curl:8.10.1
19 command: ["/bin/sh", "-euc"]
20 args:
21 - |
22 ENDPOINT="http://{{ include "llm-app.fullname" . }}:{{ .Values.service.port }}"
23 MODEL={{ .Values.model.alias | quote }}
24 echo "smoketest: GET $ENDPOINT/v1/models"
25 out=$(curl -fsS --max-time 60 "$ENDPOINT/v1/models")
26 echo "$out" | grep -q "\"$MODEL\"" || { echo "FAIL: $MODEL not listed in /v1/models"; echo "$out"; exit 1; }
27 echo "smoketest: POST $ENDPOINT/v1/chat/completions"
28 resp=$(curl -fsS --max-time 90 "$ENDPOINT/v1/chat/completions" \
29 -H "Content-Type: application/json" \
30 -d "{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Reply with just: pong\"}],\"max_tokens\":8,\"temperature\":0}")
31 echo "$resp" | grep -q '"content"' || { echo "FAIL: no content in response"; echo "$resp"; exit 1; }
32 echo "OK"
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
1replicaCount: 1
2
3image:
4 # vLLM CPU-only image (no CUDA, works on AVX2+).
5 repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
6 tag: latest
7 # Optional. If set, used in place of `tag` to pin the image by content.
8 # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
9 digest: ""
10 pullPolicy: IfNotPresent
11
12# vLLM pulls model weights from HuggingFace at first boot into the cache volume.
13# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides).
14model:
15 name: "Qwen/Qwen2.5-0.5B-Instruct"
16 alias: "Qwen2.5-0.5B-Instruct"
17 maxModelLen: 2048
18 dtype: "bfloat16"
19
20server:
21 port: 8000
22 # OMP threads for the CPU backend; 0 = autodetect.
23 ompThreads: 0
24 extraArgs: []
25
26resources:
27 requests:
28 cpu: "500m"
29 memory: "1Gi"
30 limits:
31 cpu: "2"
32 memory: "3Gi"
33
34service:
35 type: ClusterIP
36 port: 8000
37
38ingress:
39 enabled: true
40 className: nginx
41 host: llm.localtest.me
42
43monitoring:
44 serviceMonitor:
45 enabled: true
46 interval: 15s
47 labels:
48 release: kube-prometheus-stack
49
50modelCache:
51 sizeLimit: 10Gi
diff --git a/cluster/kind-config.yaml b/cluster/kind-config.yaml
new file mode 100644
index 0000000..c0306ce
--- /dev/null
+++ b/cluster/kind-config.yaml
@@ -0,0 +1,21 @@
1kind: Cluster
2apiVersion: kind.x-k8s.io/v1alpha4
3name: llm-local
4nodes:
5 - role: control-plane
6 image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f
7 kubeadmConfigPatches:
8 - |
9 kind: InitConfiguration
10 nodeRegistration:
11 kubeletExtraArgs:
12 node-labels: "ingress-ready=true"
13 extraPortMappings:
14 - containerPort: 80
15 hostPort: 8080
16 protocol: TCP
17 - containerPort: 443
18 hostPort: 8443
19 protocol: TCP
20 - role: worker
21 image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f
diff --git a/goals b/goals
new file mode 100644
index 0000000..9bbec82
--- /dev/null
+++ b/goals
@@ -0,0 +1,18 @@
1### Task
21. Stand up a local K8s cluster with `kind`, `k3d`, or `minikube`. Document exact versions.
32. Write a Helm chart (or use the upstream vLLM/SGLang chart and extend it) that deploys a small open-weights model — e.g. `Qwen2.5-0.5B-Instruct`, `Llama-3.2-1B-Instruct`, or any model that fits on CPU/small GPU. CPU-only inference is acceptable.
43. Wrap it in Terraform (or OpenTofu) using the `helm` and `kubernetes` providers.
54. Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README.
65. Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization.
76. Two environments — `dev` and `prod` — differ by at least: replica count, resource requests/limits, and model choice. Use Terraform workspaces, tfvars, or environment directories; justify your choice.
8
9Stretch Goals
10- Deploy a separate application container containing an agentic system utilizing the deployed vLLM/SGLang as the backend model server. The agent system's use-case is free to you to choose.
11- HPA based on a custom metric (e.g. queue depth or tokens/sec)
12- Image digest pinning and an `atlantis.yaml` or equivalent GitOps config
13- A smoke-test job that runs post-deploy and fails the apply if the endpoint is unhealthy
14
15You will be assessed on the following criteria:
16- the correctness of its output (stochastic functions notwithstanding);
17- how reliable, testable, modular and clean your code is;
18- other interesting add-ons you can think of.
diff --git a/scripts/resolve-digests.sh b/scripts/resolve-digests.sh
new file mode 100755
index 0000000..526d463
--- /dev/null
+++ b/scripts/resolve-digests.sh
@@ -0,0 +1,31 @@
1#!/usr/bin/env bash
2# Resolve an image tag to a content-addressable digest for pinning.
3#
4# Usage:
5# scripts/resolve-digests.sh public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
6# scripts/resolve-digests.sh # default image
7#
8# Prints three lines:
9# repo: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
10# digest: sha256:abc123...
11# pin: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo@sha256:abc123...
12#
13# Paste the digest into the env's terraform (var.image_digest) to pin.
14set -euo pipefail
15
16IMG="${1:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest}"
17
18engine=""
19if command -v podman >/dev/null 2>&1; then engine=podman
20elif command -v docker >/dev/null 2>&1; then engine=docker
21else
22 echo "need podman or docker on PATH" >&2; exit 1
23fi
24
25"$engine" pull --quiet "$IMG" >/dev/null
26digest="$("$engine" image inspect "$IMG" --format '{{.Digest}}')"
27repo="${IMG%:*}"
28
29printf 'repo: %s\n' "$repo"
30printf 'digest: %s\n' "$digest"
31printf 'pin: %s@%s\n' "$repo" "$digest"
diff --git a/terraform/envs/agent/.terraform.lock.hcl b/terraform/envs/agent/.terraform.lock.hcl
new file mode 100644
index 0000000..605df33
--- /dev/null
+++ b/terraform/envs/agent/.terraform.lock.hcl
@@ -0,0 +1,19 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/kubernetes" {
5 version = "2.38.0"
6 constraints = "~> 2.31"
7 hashes = [
8 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
9 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
10 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
11 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
12 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
13 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
14 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
15 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
16 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
17 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
18 ]
19}
diff --git a/terraform/envs/agent/backend.tf b/terraform/envs/agent/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/agent/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/agent/main.tf b/terraform/envs/agent/main.tf
new file mode 100644
index 0000000..122eaca
--- /dev/null
+++ b/terraform/envs/agent/main.tf
@@ -0,0 +1,27 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6module "agent" {
7 source = "../../modules/agent"
8
9 namespace = "agent"
10 agent_source_path = var.agent_source_path
11
12 # Point at the prod LLM. `svc.cluster.local` resolves from any namespace.
13 llm_service_url = "http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
14 model_alias = "Qwen2.5-1.5B-Instruct"
15
16 ingress_host = "agent.localtest.me"
17}
18
19output "ingress_host" { value = module.agent.ingress_host }
20output "service_dns" { value = module.agent.service_dns }
21output "curl_example" {
22 value = <<-EOT
23 curl -s http://${module.agent.ingress_host}:8080/ask \
24 -H 'Content-Type: application/json' \
25 -d '{"question":"what is 123 * 47?"}'
26 EOT
27}
diff --git a/terraform/envs/agent/variables.tf b/terraform/envs/agent/variables.tf
new file mode 100644
index 0000000..bf005b9
--- /dev/null
+++ b/terraform/envs/agent/variables.tf
@@ -0,0 +1,14 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "agent_source_path" {
12 type = string
13 description = "Absolute path to agent/agent.py"
14}
diff --git a/terraform/envs/agent/versions.tf b/terraform/envs/agent/versions.tf
new file mode 100644
index 0000000..69cf77e
--- /dev/null
+++ b/terraform/envs/agent/versions.tf
@@ -0,0 +1,6 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
5 }
6}
diff --git a/terraform/envs/bootstrap/.terraform.lock.hcl b/terraform/envs/bootstrap/.terraform.lock.hcl
new file mode 100644
index 0000000..baa0088
--- /dev/null
+++ b/terraform/envs/bootstrap/.terraform.lock.hcl
@@ -0,0 +1,37 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/helm" {
5 version = "2.17.0"
6 constraints = "~> 2.17"
7 hashes = [
8 "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=",
9 "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b",
10 "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a",
11 "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0",
12 "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe",
13 "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4",
14 "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25",
15 "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d",
16 "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978",
17 "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb",
18 "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0",
19 ]
20}
21
22provider "registry.opentofu.org/hashicorp/kubernetes" {
23 version = "2.38.0"
24 constraints = "~> 2.31"
25 hashes = [
26 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
27 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
28 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
29 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
30 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
31 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
32 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
33 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
34 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
35 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
36 ]
37}
diff --git a/terraform/envs/bootstrap/backend.tf b/terraform/envs/bootstrap/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/bootstrap/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/bootstrap/main.tf b/terraform/envs/bootstrap/main.tf
new file mode 100644
index 0000000..07bf04d
--- /dev/null
+++ b/terraform/envs/bootstrap/main.tf
@@ -0,0 +1,25 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6provider "helm" {
7 kubernetes {
8 config_path = pathexpand(var.kubeconfig)
9 config_context = var.kube_context
10 }
11}
12
13module "observability" {
14 source = "../../modules/observability"
15 namespace = "monitoring"
16 grafana_admin_password = var.grafana_admin_password
17}
18
19output "grafana" {
20 value = module.observability.grafana_service
21}
22
23output "prometheus" {
24 value = module.observability.prometheus_service
25}
diff --git a/terraform/envs/bootstrap/variables.tf b/terraform/envs/bootstrap/variables.tf
new file mode 100644
index 0000000..220bed3
--- /dev/null
+++ b/terraform/envs/bootstrap/variables.tf
@@ -0,0 +1,15 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "grafana_admin_password" {
12 type = string
13 default = "admin"
14 sensitive = true
15}
diff --git a/terraform/envs/bootstrap/versions.tf b/terraform/envs/bootstrap/versions.tf
new file mode 100644
index 0000000..0d7f77b
--- /dev/null
+++ b/terraform/envs/bootstrap/versions.tf
@@ -0,0 +1,7 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 helm = { source = "hashicorp/helm", version = "~> 2.17" }
5 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
6 }
7}
diff --git a/terraform/envs/dev/.terraform.lock.hcl b/terraform/envs/dev/.terraform.lock.hcl
new file mode 100644
index 0000000..09902a1
--- /dev/null
+++ b/terraform/envs/dev/.terraform.lock.hcl
@@ -0,0 +1,54 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/helm" {
5 version = "2.17.0"
6 constraints = "~> 2.17"
7 hashes = [
8 "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=",
9 "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b",
10 "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a",
11 "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0",
12 "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe",
13 "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4",
14 "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25",
15 "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d",
16 "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978",
17 "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb",
18 "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0",
19 ]
20}
21
22provider "registry.opentofu.org/hashicorp/kubernetes" {
23 version = "2.38.0"
24 constraints = "~> 2.31"
25 hashes = [
26 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
27 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
28 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
29 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
30 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
31 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
32 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
33 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
34 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
35 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
36 ]
37}
38
39provider "registry.opentofu.org/hashicorp/random" {
40 version = "3.8.1"
41 constraints = "~> 3.6"
42 hashes = [
43 "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=",
44 "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8",
45 "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476",
46 "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7",
47 "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3",
48 "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0",
49 "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1",
50 "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f",
51 "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9",
52 "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff",
53 ]
54}
diff --git a/terraform/envs/dev/backend.tf b/terraform/envs/dev/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/dev/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/dev/main.tf b/terraform/envs/dev/main.tf
new file mode 100644
index 0000000..8e1b882
--- /dev/null
+++ b/terraform/envs/dev/main.tf
@@ -0,0 +1,49 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6provider "helm" {
7 kubernetes {
8 config_path = pathexpand(var.kubeconfig)
9 config_context = var.kube_context
10 }
11}
12
13locals {
14 env = "dev"
15}
16
17module "llm" {
18 source = "../../modules/llm"
19
20 release_name = "llm"
21 namespace = "llm-${local.env}"
22 chart_path = var.chart_path
23
24 replicas = 2
25
26 model_name = "Qwen/Qwen2.5-0.5B-Instruct"
27 model_alias = "Qwen2.5-0.5B-Instruct"
28 max_model_len = 2048
29 dtype = "bfloat16"
30 omp_threads = 4
31
32 resources = {
33 requests = { cpu = "1", memory = "2Gi" }
34 limits = { cpu = "4", memory = "6Gi" }
35 }
36
37 ingress_host = "llm.dev.localtest.me"
38 image_tag = "latest"
39}
40
41output "ingress_host" { value = module.llm.ingress_host }
42output "service_dns" { value = module.llm.service_dns }
43output "curl_example" {
44 value = <<-EOT
45 curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
46 -H 'Content-Type: application/json' \
47 -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
48 EOT
49}
diff --git a/terraform/envs/dev/variables.tf b/terraform/envs/dev/variables.tf
new file mode 100644
index 0000000..9f1b697
--- /dev/null
+++ b/terraform/envs/dev/variables.tf
@@ -0,0 +1,14 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "chart_path" {
12 type = string
13 description = "Absolute path to charts/llm-app"
14}
diff --git a/terraform/envs/dev/versions.tf b/terraform/envs/dev/versions.tf
new file mode 100644
index 0000000..6a87674
--- /dev/null
+++ b/terraform/envs/dev/versions.tf
@@ -0,0 +1,8 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 helm = { source = "hashicorp/helm", version = "~> 2.17" }
5 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
6 random = { source = "hashicorp/random", version = "~> 3.6" }
7 }
8}
diff --git a/terraform/envs/prod/.terraform.lock.hcl b/terraform/envs/prod/.terraform.lock.hcl
new file mode 100644
index 0000000..09902a1
--- /dev/null
+++ b/terraform/envs/prod/.terraform.lock.hcl
@@ -0,0 +1,54 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/helm" {
5 version = "2.17.0"
6 constraints = "~> 2.17"
7 hashes = [
8 "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=",
9 "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b",
10 "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a",
11 "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0",
12 "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe",
13 "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4",
14 "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25",
15 "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d",
16 "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978",
17 "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb",
18 "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0",
19 ]
20}
21
22provider "registry.opentofu.org/hashicorp/kubernetes" {
23 version = "2.38.0"
24 constraints = "~> 2.31"
25 hashes = [
26 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
27 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
28 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
29 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
30 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
31 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
32 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
33 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
34 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
35 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
36 ]
37}
38
39provider "registry.opentofu.org/hashicorp/random" {
40 version = "3.8.1"
41 constraints = "~> 3.6"
42 hashes = [
43 "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=",
44 "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8",
45 "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476",
46 "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7",
47 "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3",
48 "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0",
49 "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1",
50 "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f",
51 "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9",
52 "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff",
53 ]
54}
diff --git a/terraform/envs/prod/backend.tf b/terraform/envs/prod/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/prod/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf
new file mode 100644
index 0000000..04db31d
--- /dev/null
+++ b/terraform/envs/prod/main.tf
@@ -0,0 +1,70 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6provider "helm" {
7 kubernetes {
8 config_path = pathexpand(var.kubeconfig)
9 config_context = var.kube_context
10 }
11}
12
13locals {
14 env = "prod"
15}
16
17module "llm" {
18 source = "../../modules/llm"
19
20 release_name = "llm"
21 namespace = "llm-${local.env}"
22 chart_path = var.chart_path
23
24 replicas = 1
25
26 model_name = "Qwen/Qwen2.5-1.5B-Instruct"
27 model_alias = "Qwen2.5-1.5B-Instruct"
28 max_model_len = 4096
29 dtype = "bfloat16"
30 omp_threads = 6
31
32 resources = {
33 requests = { cpu = "2", memory = "4Gi" }
34 limits = { cpu = "6", memory = "8Gi" }
35 }
36
37 ingress_host = "llm.prod.localtest.me"
38 image_tag = "latest"
39 # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64.
40 # Per-arch digest — re-resolve on a different arch or after an upstream tag move.
41 # Dev intentionally runs on `:latest` so new fixes flow in without a PR.
42 image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"
43
44 # Enable OpenAI tool-calling so the agent's function-call path works.
45 # Qwen 2.5 uses hermes-style tool parsing in vLLM.
46 extra_args = [
47 "--enable-auto-tool-choice",
48 "--tool-call-parser", "hermes",
49 ]
50
51 hpa = {
52 enabled = true
53 min_replicas = 1
54 max_replicas = 3
55 # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight
56 # requests. Scale up when >50% of pods are actively serving.
57 metric_name = "vllm:num_requests_running"
58 target_average_value = "500m"
59 }
60}
61
62output "ingress_host" { value = module.llm.ingress_host }
63output "service_dns" { value = module.llm.service_dns }
64output "curl_example" {
65 value = <<-EOT
66 curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
67 -H 'Content-Type: application/json' \
68 -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
69 EOT
70}
diff --git a/terraform/envs/prod/variables.tf b/terraform/envs/prod/variables.tf
new file mode 100644
index 0000000..9f1b697
--- /dev/null
+++ b/terraform/envs/prod/variables.tf
@@ -0,0 +1,14 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "chart_path" {
12 type = string
13 description = "Absolute path to charts/llm-app"
14}
diff --git a/terraform/envs/prod/versions.tf b/terraform/envs/prod/versions.tf
new file mode 100644
index 0000000..6a87674
--- /dev/null
+++ b/terraform/envs/prod/versions.tf
@@ -0,0 +1,8 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 helm = { source = "hashicorp/helm", version = "~> 2.17" }
5 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
6 random = { source = "hashicorp/random", version = "~> 3.6" }
7 }
8}
diff --git a/terraform/modules/agent/main.tf b/terraform/modules/agent/main.tf
new file mode 100644
index 0000000..f53acdc
--- /dev/null
+++ b/terraform/modules/agent/main.tf
@@ -0,0 +1,114 @@
1resource "kubernetes_namespace_v1" "agent" {
2 metadata {
3 name = var.namespace
4 labels = {
5 "app.kubernetes.io/part-of" = "llm-platform"
6 }
7 }
8}
9
10resource "kubernetes_deployment_v1" "agent" {
11 metadata {
12 name = "agent"
13 namespace = kubernetes_namespace_v1.agent.metadata[0].name
14 labels = { app = "agent" }
15 }
16 spec {
17 replicas = 1
18 selector {
19 match_labels = { app = "agent" }
20 }
21 template {
22 metadata {
23 labels = { app = "agent" }
24 annotations = {
25 # Bounce the pod when agent.py changes on disk, even if image tag is unchanged.
26 "checksum/code" = substr(sha256(file(var.agent_source_path)), 0, 16)
27 }
28 }
29 spec {
30 container {
31 name = "agent"
32 image = var.agent_image
33 image_pull_policy = "IfNotPresent"
34 env {
35 name = "OPENAI_BASE_URL"
36 value = var.llm_service_url
37 }
38 env {
39 name = "MODEL"
40 value = var.model_alias
41 }
42 port {
43 name = "http"
44 container_port = 8001
45 }
46 readiness_probe {
47 http_get {
48 path = "/health"
49 port = "http"
50 }
51 initial_delay_seconds = 3
52 period_seconds = 5
53 failure_threshold = 10
54 }
55 liveness_probe {
56 http_get {
57 path = "/health"
58 port = "http"
59 }
60 initial_delay_seconds = 30
61 period_seconds = 30
62 }
63 resources {
64 requests = { cpu = "100m", memory = "128Mi" }
65 limits = { cpu = "1", memory = "512Mi" }
66 }
67 }
68 }
69 }
70 }
71}
72
73resource "kubernetes_service_v1" "agent" {
74 metadata {
75 name = "agent"
76 namespace = kubernetes_namespace_v1.agent.metadata[0].name
77 labels = { app = "agent" }
78 }
79 spec {
80 selector = { app = "agent" }
81 port {
82 name = "http"
83 port = 8001
84 target_port = "http"
85 }
86 }
87}
88
89resource "kubernetes_ingress_v1" "agent" {
90 metadata {
91 name = "agent"
92 namespace = kubernetes_namespace_v1.agent.metadata[0].name
93 }
94 spec {
95 ingress_class_name = var.ingress_class
96 rule {
97 host = var.ingress_host
98 http {
99 path {
100 path = "/"
101 path_type = "Prefix"
102 backend {
103 service {
104 name = kubernetes_service_v1.agent.metadata[0].name
105 port {
106 number = 8001
107 }
108 }
109 }
110 }
111 }
112 }
113 }
114}
diff --git a/terraform/modules/agent/outputs.tf b/terraform/modules/agent/outputs.tf
new file mode 100644
index 0000000..ac9932b
--- /dev/null
+++ b/terraform/modules/agent/outputs.tf
@@ -0,0 +1,11 @@
1output "service_dns" {
2 value = "${kubernetes_service_v1.agent.metadata[0].name}.${kubernetes_namespace_v1.agent.metadata[0].name}.svc.cluster.local"
3}
4
5output "ingress_host" {
6 value = var.ingress_host
7}
8
9output "namespace" {
10 value = kubernetes_namespace_v1.agent.metadata[0].name
11}
diff --git a/terraform/modules/agent/variables.tf b/terraform/modules/agent/variables.tf
new file mode 100644
index 0000000..6f525ee
--- /dev/null
+++ b/terraform/modules/agent/variables.tf
@@ -0,0 +1,33 @@
1variable "namespace" {
2 type = string
3}
4
5variable "agent_source_path" {
6 type = string
7 description = "Absolute path to agent/agent.py. Used only to bounce pods on code change."
8}
9
10variable "agent_image" {
11 type = string
12 default = "localhost/agent:0.1.0"
13 description = "Pre-built agent image. Must be loaded into kind with `make up-agent`."
14}
15
16variable "llm_service_url" {
17 type = string
18 description = "OpenAI-compatible base URL, e.g. http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
19}
20
21variable "model_alias" {
22 type = string
23 default = "Qwen2.5-1.5B-Instruct"
24}
25
26variable "ingress_host" {
27 type = string
28}
29
30variable "ingress_class" {
31 type = string
32 default = "nginx"
33}
diff --git a/terraform/modules/agent/versions.tf b/terraform/modules/agent/versions.tf
new file mode 100644
index 0000000..4242705
--- /dev/null
+++ b/terraform/modules/agent/versions.tf
@@ -0,0 +1,5 @@
1terraform {
2 required_providers {
3 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
4 }
5}
diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf
new file mode 100644
index 0000000..cd22019
--- /dev/null
+++ b/terraform/modules/llm/main.tf
@@ -0,0 +1,99 @@
1resource "kubernetes_namespace_v1" "this" {
2 metadata {
3 name = var.namespace
4 labels = {
5 "app.kubernetes.io/part-of" = "llm-platform"
6 }
7 }
8}
9
10resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
11 count = var.hpa.enabled ? 1 : 0
12
13 metadata {
14 name = "${var.release_name}-llm-app"
15 namespace = kubernetes_namespace_v1.this.metadata[0].name
16 }
17 spec {
18 scale_target_ref {
19 api_version = "apps/v1"
20 kind = "Deployment"
21 name = "${var.release_name}-llm-app"
22 }
23 min_replicas = var.hpa.min_replicas
24 max_replicas = var.hpa.max_replicas
25
26 metric {
27 type = "Pods"
28 pods {
29 metric {
30 name = var.hpa.metric_name
31 }
32 target {
33 type = "AverageValue"
34 average_value = var.hpa.target_average_value
35 }
36 }
37 }
38 }
39
40 depends_on = [helm_release.llm]
41}
42
43resource "helm_release" "llm" {
44 name = var.release_name
45 chart = var.chart_path
46 namespace = kubernetes_namespace_v1.this.metadata[0].name
47 create_namespace = false
48 atomic = false
49 wait = true
50 timeout = 1800
51
52 values = [
53 yamlencode({
54 replicaCount = var.replicas
55
56 image = {
57 repository = var.image_repository
58 tag = var.image_tag
59 digest = var.image_digest
60 pullPolicy = "IfNotPresent"
61 }
62
63 model = {
64 name = var.model_name
65 alias = var.model_alias
66 maxModelLen = var.max_model_len
67 dtype = var.dtype
68 }
69
70 server = {
71 port = 8000
72 ompThreads = var.omp_threads
73 extraArgs = var.extra_args
74 }
75
76 resources = var.resources
77
78 ingress = {
79 enabled = true
80 className = var.ingress_class
81 host = var.ingress_host
82 }
83
84 monitoring = {
85 serviceMonitor = {
86 enabled = true
87 interval = "15s"
88 labels = {
89 release = var.service_monitor_release_label
90 }
91 }
92 }
93
94 modelCache = {
95 sizeLimit = var.model_cache_size
96 }
97 }),
98 ]
99}
diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf
new file mode 100644
index 0000000..a953e73
--- /dev/null
+++ b/terraform/modules/llm/outputs.tf
@@ -0,0 +1,12 @@
1output "service_dns" {
2 value = "${var.release_name}-llm-app.${var.namespace}.svc.cluster.local"
3 description = "In-cluster DNS name for the LLM Service."
4}
5
6output "ingress_host" {
7 value = var.ingress_host
8}
9
10output "namespace" {
11 value = kubernetes_namespace_v1.this.metadata[0].name
12}
diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf
new file mode 100644
index 0000000..3a7d8f7
--- /dev/null
+++ b/terraform/modules/llm/variables.tf
@@ -0,0 +1,112 @@
1variable "release_name" {
2 type = string
3 description = "Helm release name."
4}
5
6variable "namespace" {
7 type = string
8 description = "Kubernetes namespace to deploy into."
9}
10
11variable "chart_path" {
12 type = string
13 description = "Path to the local llm-app chart."
14}
15
16variable "replicas" {
17 type = number
18 default = 1
19}
20
21variable "model_name" {
22 type = string
23 description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
24}
25
26variable "model_alias" {
27 type = string
28 description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
29}
30
31variable "max_model_len" {
32 type = number
33 default = 2048
34}
35
36variable "dtype" {
37 type = string
38 default = "bfloat16"
39}
40
41variable "omp_threads" {
42 type = number
43 default = 0
44 description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."
45}
46
47variable "extra_args" {
48 type = list(string)
49 default = []
50 description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
51}
52
53variable "resources" {
54 type = object({
55 requests = object({ cpu = string, memory = string })
56 limits = object({ cpu = string, memory = string })
57 })
58}
59
60variable "ingress_host" {
61 type = string
62}
63
64variable "ingress_class" {
65 type = string
66 default = "nginx"
67}
68
69variable "image_repository" {
70 type = string
71 default = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
72}
73
74variable "image_tag" {
75 type = string
76 default = "latest"
77 description = "Used only when image_digest is empty."
78}
79
80variable "image_digest" {
81 type = string
82 default = ""
83 description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."
84}
85
86variable "service_monitor_release_label" {
87 type = string
88 default = "kube-prometheus-stack"
89 description = "Must match the release label the Prometheus Operator selects on."
90}
91
92variable "model_cache_size" {
93 type = string
94 default = "10Gi"
95}
96
97variable "hpa" {
98 type = object({
99 enabled = bool
100 min_replicas = number
101 max_replicas = number
102 metric_name = string
103 target_average_value = string
104 })
105 default = {
106 enabled = false
107 min_replicas = 1
108 max_replicas = 3
109 metric_name = "vllm:num_requests_running"
110 target_average_value = "500m"
111 }
112}
diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf
new file mode 100644
index 0000000..2f88f2e
--- /dev/null
+++ b/terraform/modules/observability/main.tf
@@ -0,0 +1,156 @@
1resource "kubernetes_namespace_v1" "monitoring" {
2 metadata {
3 name = var.namespace
4 }
5}
6
7resource "kubernetes_namespace_v1" "ingress" {
8 metadata {
9 name = "ingress-nginx"
10 }
11}
12
13resource "helm_release" "ingress_nginx" {
14 name = "ingress-nginx"
15 repository = "https://kubernetes.github.io/ingress-nginx"
16 chart = "ingress-nginx"
17 version = var.ingress_nginx_version
18 namespace = kubernetes_namespace_v1.ingress.metadata[0].name
19 wait = true
20 timeout = 300
21
22 values = [
23 yamlencode({
24 controller = {
25 hostPort = { enabled = true, ports = { http = 80, https = 443 } }
26 service = { type = "NodePort" }
27 nodeSelector = {
28 "ingress-ready" = "true"
29 }
30 tolerations = [
31 { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" },
32 { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" },
33 ]
34 publishService = { enabled = false }
35 admissionWebhooks = { enabled = false } # speeds up kind cluster installs
36 # Cap worker_processes so nginx doesn't try to spawn 14 threads under
37 # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it
38 # sometimes hits pthread EAGAIN and workers die without respawn.
39 config = {
40 "worker-processes" = "4"
41 }
42 }
43 }),
44 ]
45}
46
47resource "helm_release" "kps" {
48 name = "kube-prometheus-stack"
49 repository = "https://prometheus-community.github.io/helm-charts"
50 chart = "kube-prometheus-stack"
51 version = var.kps_version
52 namespace = kubernetes_namespace_v1.monitoring.metadata[0].name
53 wait = true
54 timeout = 600
55
56 values = [
57 yamlencode({
58 fullnameOverride = "kps"
59 prometheus = {
60 prometheusSpec = {
61 # Let Prometheus pick up ServiceMonitors from any namespace matching
62 # the release=kube-prometheus-stack label (the chart's default).
63 serviceMonitorSelectorNilUsesHelmValues = false
64 podMonitorSelectorNilUsesHelmValues = false
65 ruleSelectorNilUsesHelmValues = false
66 retention = "2d"
67 resources = {
68 requests = { cpu = "100m", memory = "400Mi" }
69 limits = { memory = "1Gi" }
70 }
71 }
72 ingress = {
73 enabled = true
74 ingressClassName = "nginx"
75 hosts = ["prom.localtest.me"]
76 }
77 }
78 alertmanager = { enabled = false }
79 grafana = {
80 adminPassword = var.grafana_admin_password
81 sidecar = {
82 dashboards = {
83 enabled = true
84 label = "grafana_dashboard"
85 labelValue = "1"
86 searchNamespace = "ALL"
87 }
88 }
89 service = { type = "ClusterIP" }
90 ingress = {
91 enabled = true
92 ingressClassName = "nginx"
93 hosts = ["grafana.localtest.me"]
94 }
95 }
96 }),
97 ]
98}
99
100resource "helm_release" "prometheus_adapter" {
101 name = "prometheus-adapter"
102 repository = "https://prometheus-community.github.io/helm-charts"
103 chart = "prometheus-adapter"
104 version = var.prometheus_adapter_version
105 namespace = kubernetes_namespace_v1.monitoring.metadata[0].name
106 wait = true
107 timeout = 300
108
109 values = [
110 yamlencode({
111 prometheus = {
112 url = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
113 port = 9090
114 }
115 rules = {
116 default = false
117 custom = [
118 {
119 # In-flight request count per pod; basis for autoscaling.
120 # vLLM exposes this as a gauge per model-engine.
121 seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
122 resources = {
123 overrides = {
124 namespace = { resource = "namespace" }
125 pod = { resource = "pod" }
126 }
127 }
128 name = {
129 matches = "^vllm:num_requests_running$"
130 as = "vllm:num_requests_running"
131 }
132 metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
133 },
134 {
135 # Waiting (queued) requests per pod — an alternative scale signal.
136 seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}"
137 resources = {
138 overrides = {
139 namespace = { resource = "namespace" }
140 pod = { resource = "pod" }
141 }
142 }
143 name = {
144 matches = "^vllm:num_requests_waiting$"
145 as = "vllm:num_requests_waiting"
146 }
147 metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
148 },
149 ]
150 }
151 }),
152 ]
153
154 depends_on = [helm_release.kps]
155}
156
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf
new file mode 100644
index 0000000..06a507d
--- /dev/null
+++ b/terraform/modules/observability/outputs.tf
@@ -0,0 +1,11 @@
1output "namespace" {
2 value = kubernetes_namespace_v1.monitoring.metadata[0].name
3}
4
5output "grafana_service" {
6 value = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
7}
8
9output "prometheus_service" {
10 value = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
11}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf
new file mode 100644
index 0000000..6aeaca3
--- /dev/null
+++ b/terraform/modules/observability/variables.tf
@@ -0,0 +1,27 @@
1variable "namespace" {
2 type = string
3 default = "monitoring"
4}
5
6variable "kps_version" {
7 type = string
8 default = "65.5.1"
9 description = "kube-prometheus-stack chart version."
10}
11
12variable "ingress_nginx_version" {
13 type = string
14 default = "4.11.3"
15 description = "ingress-nginx chart version."
16}
17
18variable "grafana_admin_password" {
19 type = string
20 default = "admin"
21 sensitive = true
22}
23
24variable "prometheus_adapter_version" {
25 type = string
26 default = "4.11.0"
27}
diff --git a/tests/smoke.sh b/tests/smoke.sh
new file mode 100755
index 0000000..a5ef23d
--- /dev/null
+++ b/tests/smoke.sh
@@ -0,0 +1,38 @@
1#!/usr/bin/env bash
2# Smoke test for the OpenAI-compatible LLM endpoint.
3# Usage:
4# ENDPOINT=http://llm.dev.localtest.me:8080 MODEL=Qwen2.5-0.5B-Instruct ./tests/smoke.sh
5set -euo pipefail
6
7ENDPOINT="${ENDPOINT:-http://llm.dev.localtest.me:8080}"
8MODEL="${MODEL:-Qwen2.5-0.5B-Instruct}"
9TIMEOUT="${TIMEOUT:-120}"
10
11say() { printf '\033[1;34m==>\033[0m %s\n' "$*"; }
12fail() { printf '\033[1;31mFAIL\033[0m %s\n' "$*" >&2; exit 1; }
13
14say "Endpoint: $ENDPOINT"
15say "Model: $MODEL"
16
17say "GET /v1/models"
18models_json="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/models")" || fail "/v1/models unreachable"
19echo "$models_json" | grep -q "$MODEL" || fail "/v1/models does not list $MODEL"
20
21say "POST /v1/chat/completions"
22resp="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/chat/completions" \
23 -H 'Content-Type: application/json' \
24 -d "$(cat <<EOF
25{
26 "model": "$MODEL",
27 "messages": [{"role": "user", "content": "Reply with the single word: pong"}],
28 "max_tokens": 8,
29 "temperature": 0
30}
31EOF
32)")" || fail "chat completion request failed"
33
34content="$(echo "$resp" | python3 -c 'import sys, json; print(json.load(sys.stdin)["choices"][0]["message"]["content"])')"
35echo "model reply: $content"
36[[ -n "$content" ]] || fail "empty completion content"
37
38say "OK"