-rw-r--r--  .gitignore                                        29
-rw-r--r--  Makefile                                          78
-rw-r--r--  README.txt                                        90
-rw-r--r--  agent/Dockerfile                                   6
-rw-r--r--  agent/agent.py                                   162
-rw-r--r--  charts/llm-app/Chart.yaml                          6
-rw-r--r--  charts/llm-app/templates/_helpers.tpl              8
-rw-r--r--  charts/llm-app/templates/deployment.yaml          76
-rw-r--r--  charts/llm-app/templates/ingress.yaml             19
-rw-r--r--  charts/llm-app/templates/service.yaml             13
-rw-r--r--  charts/llm-app/templates/servicemonitor.yaml      19
-rw-r--r--  charts/llm-app/templates/smoketest-job.yaml       32
-rw-r--r--  charts/llm-app/values.yaml                        51
-rw-r--r--  cluster/kind-config.yaml                          21
-rw-r--r--  goals                                             18
-rwxr-xr-x  scripts/resolve-digests.sh                        31
-rw-r--r--  terraform/envs/agent/.terraform.lock.hcl          19
-rw-r--r--  terraform/envs/agent/backend.tf                    5
-rw-r--r--  terraform/envs/agent/main.tf                      27
-rw-r--r--  terraform/envs/agent/variables.tf                 14
-rw-r--r--  terraform/envs/agent/versions.tf                   6
-rw-r--r--  terraform/envs/bootstrap/.terraform.lock.hcl      37
-rw-r--r--  terraform/envs/bootstrap/backend.tf                5
-rw-r--r--  terraform/envs/bootstrap/main.tf                  25
-rw-r--r--  terraform/envs/bootstrap/variables.tf             15
-rw-r--r--  terraform/envs/bootstrap/versions.tf               7
-rw-r--r--  terraform/envs/dev/.terraform.lock.hcl            54
-rw-r--r--  terraform/envs/dev/backend.tf                      5
-rw-r--r--  terraform/envs/dev/main.tf                        49
-rw-r--r--  terraform/envs/dev/variables.tf                   14
-rw-r--r--  terraform/envs/dev/versions.tf                     8
-rw-r--r--  terraform/envs/prod/.terraform.lock.hcl           54
-rw-r--r--  terraform/envs/prod/backend.tf                     5
-rw-r--r--  terraform/envs/prod/main.tf                       70
-rw-r--r--  terraform/envs/prod/variables.tf                  14
-rw-r--r--  terraform/envs/prod/versions.tf                    8
-rw-r--r--  terraform/modules/agent/main.tf                  114
-rw-r--r--  terraform/modules/agent/outputs.tf                11
-rw-r--r--  terraform/modules/agent/variables.tf              33
-rw-r--r--  terraform/modules/agent/versions.tf                5
-rw-r--r--  terraform/modules/llm/main.tf                     99
-rw-r--r--  terraform/modules/llm/outputs.tf                  12
-rw-r--r--  terraform/modules/llm/variables.tf               112
-rw-r--r--  terraform/modules/observability/main.tf          156
-rw-r--r--  terraform/modules/observability/outputs.tf        11
-rw-r--r--  terraform/modules/observability/variables.tf      27
-rwxr-xr-x  tests/smoke.sh                                    38
47 files changed, 1718 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..747a966
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,29 @@
1# Terraform
2.terraform/
3terraform.tfstate
4terraform.tfstate.*
5*.tfvars
6*.tfvars.json
7crash.log
8crash.*.log
9override.tf
10override.tf.json
11*_override.tf
12*_override.tf.json
13
14# Python
15__pycache__/
16*.py[cod]
17*.egg-info/
18.pytest_cache/
19.venv/
20venv/
21
22# OS
23.DS_Store
24Thumbs.db
25
26# Editors
27.idea/
28.vscode/
29*.swp
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..15c0031
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,78 @@
1REPO_ROOT := $(abspath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
2CHART_PATH := $(REPO_ROOT)/charts/llm-app
3AGENT_SRC := $(REPO_ROOT)/agent/agent.py
4AGENT_IMG := localhost/agent:0.1.0
5CLUSTER := llm-local
6CONTEXT := kind-$(CLUSTER)
7
8export KIND_EXPERIMENTAL_PROVIDER=podman
9
10.PHONY: help
11help:
12 @echo "Targets:"
13 @echo " up-dev deploy dev LLM (Qwen2.5-0.5B, 2 replicas)"
14 @echo " up-prod deploy prod LLM (Qwen2.5-1.5B, 1 replica + HPA 1->3)"
15 @echo " up-agent up-prod + tool-using agent"
16 @echo " ask Q='...' POST a question to the agent"
17 @echo " down destroy everything + delete kind cluster"
18 @echo ""
19 @echo "URLs (after up-dev/up-prod):"
20 @echo " Grafana http://grafana.localtest.me:8080 (admin/admin)"
21 @echo " curl -f http://grafana.localtest.me:8080/api/health"
22 @echo " Prometheus http://prom.localtest.me:8080"
23 @echo " curl -f http://prom.localtest.me:8080/-/healthy"
24 @echo ""
25
26.PHONY: up-dev
27up-dev:
28 @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml
29 cd $(REPO_ROOT)/terraform/envs/bootstrap && \
30 tofu init -upgrade && \
31 tofu apply -auto-approve \
32 -var kube_context=$(CONTEXT)
33 cd $(REPO_ROOT)/terraform/envs/dev && \
34 tofu init -upgrade && \
35 tofu apply -auto-approve \
36 -var kube_context=$(CONTEXT) \
37 -var chart_path=$(CHART_PATH)
38
39.PHONY: up-prod
40up-prod:
41 @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml
42 cd $(REPO_ROOT)/terraform/envs/bootstrap && \
43 tofu init -upgrade && \
44 tofu apply -auto-approve \
45 -var kube_context=$(CONTEXT)
46 cd $(REPO_ROOT)/terraform/envs/prod && \
47 tofu init -upgrade && \
48 tofu apply -auto-approve \
49 -var kube_context=$(CONTEXT) \
50 -var chart_path=$(CHART_PATH)
51
52.PHONY: up-agent
53up-agent: up-prod
54 podman build -t $(AGENT_IMG) $(REPO_ROOT)/agent/
55 @tmp=$$(mktemp -t agent-XXXXXX.tar); \
56 podman save $(AGENT_IMG) -o $$tmp && \
57 kind load image-archive $$tmp --name $(CLUSTER) && \
58 rm -f $$tmp
59 cd $(REPO_ROOT)/terraform/envs/agent && \
60 tofu init -upgrade && \
61 tofu apply -auto-approve \
62 -var kube_context=$(CONTEXT) \
63 -var agent_source_path=$(AGENT_SRC)
64
65.PHONY: ask
66ask:
67 @if [ -z "$(Q)" ]; then echo "usage: make ask Q='what is 17*23?'"; exit 1; fi
68 curl -s http://agent.localtest.me:8080/ask \
69 -H 'Content-Type: application/json' \
70 -d "$(shell printf '{"question":"%s"}' "$(Q)")" | python3 -m json.tool
71
72.PHONY: down
73down:
74 -cd $(REPO_ROOT)/terraform/envs/agent && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var agent_source_path=$(AGENT_SRC) || true
75 -cd $(REPO_ROOT)/terraform/envs/prod && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true
76 -cd $(REPO_ROOT)/terraform/envs/dev && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true
77 -cd $(REPO_ROOT)/terraform/envs/bootstrap && tofu destroy -auto-approve -var kube_context=$(CONTEXT) || true
78 KIND_EXPERIMENTAL_PROVIDER=podman kind delete cluster --name $(CLUSTER)
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..4ce7a8e
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,90 @@
=============================================================================
 Local K8s LLM demo — kind + OpenTofu + vLLM
=============================================================================

Prerequisites (Fedora):
sudo dnf install -y podman git make jq curl tar

# kind v0.31.0 (node image kindest/node:v1.35.0, pinned by digest in
# cluster/kind-config.yaml)

# kubectl v1.36.0
curl -fsSLo /tmp/kubectl \
  https://dl.k8s.io/release/v1.36.0/bin/linux/amd64/kubectl
sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl

# Helm 4.1.4 (Helm chart versions are pinned in
# terraform/modules/observability/variables.tf)

# OpenTofu 1.11.6 (provider constraints in terraform/envs/{dev,prod,bootstrap}/versions.tf)

# kind runs each k8s "node" as a long-lived podman container. Podman's default
# pids_limit = 2048 makes ingress-nginx hit pthread EAGAIN once the control
# plane warms up. Raise the limit once (0 = unlimited), then restart podman:
sudo mkdir -p /etc/containers/containers.conf.d
printf '[containers]\npids_limit = 0\n' \
  | sudo tee /etc/containers/containers.conf.d/99-kind-pids.conf
sudo systemctl restart podman.socket podman 2>/dev/null || true
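
# Optional sanity check once the cluster is up (kind names the node container
# <cluster>-control-plane, and podman inspect should expose the limit under
# HostConfig.PidsLimit):
$ podman inspect -f '{{.HostConfig.PidsLimit}}' llm-local-control-plane   # expect 0 (unlimited)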

make help   # lists all targets plus copy-pasteable URLs and health checks

# Expose an OpenAI-compatible endpoint through a K8s Service / Ingress, with a
# curl example proving it works.

dev: http://llm.dev.localtest.me:8080
$ curl http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq

prod: http://llm.prod.localtest.me:8080
$ curl http://llm.prod.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq
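
# Both return the standard OpenAI chat-completion shape; roughly (abridged,
# values will differ):
{"object":"chat.completion",
 "choices":[{"index":0,
             "message":{"role":"assistant","content":"Hi! How can I help?"},
             "finish_reason":"length"}],
 "usage":{"prompt_tokens":..., "completion_tokens":..., "total_tokens":...}}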

# Observability: scrape /metrics from the inference pods with Prometheus; the
# queries below cover request latency and CPU utilization.

# Fire 10 chat requests against dev to populate the metrics:
$ for i in {1..10}; do curl -s http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' >/dev/null & done; wait

# Raw /metrics (vLLM exposes these natively):
$ curl -s http://llm.dev.localtest.me:8080/metrics | grep '^vllm:' | head

# Request latency p95 (seconds), via Prometheus:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(vllm:e2e_request_latency_seconds_bucket[5m])))' | jq .data.result

# CPU cores in use per vLLM pod (CPU-only inference, no GPU on this stack):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="llm-dev",pod=~"llm-llm-app.*",container!="",container!="POD"}[5m]))' | jq .data.result

# In-flight requests per pod (the same metric the prod HPA scales on):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (namespace, pod) (vllm:num_requests_running)' | jq .data.result
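
# Token throughput is another useful signal. Assuming your vLLM build exposes
# the vllm:generation_tokens_total counter (check the /metrics output above),
# output tokens/sec per pod is:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(vllm:generation_tokens_total[5m]))' | jq .data.result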

# Stretch 1: agent
# agent/agent.py runs in its own container and uses the prod backend to
# calculate the product of two numbers via tool calls.
$ curl http://agent.localtest.me:8080/ask -H 'Content-Type: application/json' -d '{"question":"what is 17 * 23?"}'   # requires `make up-agent` first
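
# The response shape comes straight from agent.py: a final answer plus a
# transcript of the tool-call steps, e.g. (abridged, wording will vary):
{"answer":"17 * 23 = 391","steps":2,
 "transcript":[{"step":1,"content":"","tool_calls":[{"name":"calc","arguments":"{\"expression\": \"17 * 23\"}"}]},
               {"tool_result":{"name":"calc","result":"391"}},
               {"step":2,"content":"17 * 23 = 391","tool_calls":[]}]}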

# Stretch 2: HPA
# Horizontal pod autoscaling on total in-flight requests, up to 3 pods.
# Terminal 1: generate sustained load.
$ (trap 'kill 0' INT; for i in {1..5}; do \
    curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":32}' \
      >/dev/null &
    done; wait)
# Terminal 2: watch the HPA react (`watch -n1 kubectl -n llm-prod get hpa`
# refreshes faster than the event-driven `-w`).
$ kubectl -n llm-prod get hpa -w
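
# Under load, TARGETS climbs past the 500m in-flight-per-pod average and
# REPLICAS steps toward 3. Illustrative output (columns abridged):
NAME          REFERENCE                TARGETS      MINPODS   MAXPODS   REPLICAS
llm-llm-app   Deployment/llm-llm-app   1500m/500m   1         3         3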

# Stretch 3: image pinning
# prod pins the vLLM image as repo@sha256:<digest>, resolved via
# scripts/resolve-digests.sh; see terraform/envs/prod/main.tf. dev deliberately
# tracks :latest. The chart prefers the digest over the tag when both are set.
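
# To refresh the pin (the digest is per-arch, so re-resolve after an upstream
# tag move or on a different arch):
$ scripts/resolve-digests.sh public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
repo: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
digest: sha256:...
pin: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo@sha256:...
# then paste the digest into the image_digest argument in terraform/envs/prod/main.tf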

# Stretch 4: smoke test
# charts/llm-app/templates/smoketest-job.yaml re-runs as a Helm hook after
# every install or upgrade. It only asserts that /v1/models lists the model
# and that a chat completion returns a "content" field; it makes no assertion
# about the generated text itself, since that part is stochastic.
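
# The same checks can be run from outside the cluster via tests/smoke.sh:
$ ENDPOINT=http://llm.prod.localtest.me:8080 MODEL=Qwen2.5-1.5B-Instruct ./tests/smoke.sh
# and the in-cluster hook's result is visible with:
$ kubectl -n llm-prod get jobs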

All stretch goals are covered except the Atlantis / GitOps config, which was
skipped: it only becomes meaningful with a local Atlantis install or a
GitHub-hosted repo.
diff --git a/agent/Dockerfile b/agent/Dockerfile
new file mode 100644
index 0000000..509c3b6
--- /dev/null
+++ b/agent/Dockerfile
@@ -0,0 +1,6 @@
1FROM python:3.12-slim
2RUN pip install --no-cache-dir 'openai>=1.59.2,<2' 'httpx<0.28'
3WORKDIR /app
4COPY agent.py /app/agent.py
5EXPOSE 8001
6CMD ["python", "/app/agent.py"]
diff --git a/agent/agent.py b/agent/agent.py
new file mode 100644
index 0000000..12ad9d6
--- /dev/null
+++ b/agent/agent.py
@@ -0,0 +1,162 @@
1"""Tool-using agent over an OpenAI-compatible backend.
2
3Uses the standard OpenAI tools API (function calling). vLLM maps this to the
4model's native tool-call template (Qwen here), so small models follow the
5protocol much more reliably than a hand-rolled text convention.
6
7POST /ask {"question": "..."} -> {"answer": "...", "transcript": [...]}
8GET /health -> "ok"
9"""
10import json
11import os
12import re
13from http.server import BaseHTTPRequestHandler, HTTPServer
14
15from openai import OpenAI
16
17client = OpenAI(
18 base_url=os.environ["OPENAI_BASE_URL"],
19 api_key=os.environ.get("OPENAI_API_KEY", "sk-local"),
20)
21MODEL = os.environ.get("MODEL", "Qwen2.5-1.5B-Instruct")
22MAX_STEPS = int(os.environ.get("MAX_STEPS", "6"))
23
24SYSTEM = (
25 "You are a careful math assistant. When the user asks any arithmetic question, "
26 "call the 'calc' tool with the exact expression. Do not compute arithmetic in your head. "
27 "After you receive the tool result, give a concise final answer."
28)
29
30TOOLS = [
31 {
32 "type": "function",
33 "function": {
34 "name": "calc",
35 "description": "Evaluate a safe arithmetic expression and return the numeric result.",
36 "parameters": {
37 "type": "object",
38 "properties": {
39 "expression": {
40 "type": "string",
41 "description": "Arithmetic expression using only digits, spaces, and + - * / . ( )",
42 }
43 },
44 "required": ["expression"],
45 },
46 },
47 }
48]
49
50SAFE_EXPR = re.compile(r"^[\d\s+\-*/().]+$")
51
52
53def calc(expression: str) -> str:
54 if not SAFE_EXPR.fullmatch(expression):
55 return "ERROR: disallowed characters"
56 try:
57 return str(eval(expression, {"__builtins__": {}}, {})) # noqa: S307
58 except Exception as e:
59 return f"ERROR: {e}"
60
61
62def run_agent(question: str) -> dict:
63 messages = [
64 {"role": "system", "content": SYSTEM},
65 {"role": "user", "content": question},
66 ]
67 transcript: list = []
68
69 for step in range(MAX_STEPS):
70 resp = client.chat.completions.create(
71 model=MODEL,
72 messages=messages,
73 tools=TOOLS,
74 tool_choice="auto",
75 temperature=0.0,
76 max_tokens=256,
77 )
78 msg = resp.choices[0].message
79
80 # Always append the assistant message (with any tool_calls) to history.
81 assistant_entry = {"role": "assistant", "content": msg.content or ""}
82 if msg.tool_calls:
83 assistant_entry["tool_calls"] = [
84 {
85 "id": tc.id,
86 "type": "function",
87 "function": {"name": tc.function.name, "arguments": tc.function.arguments},
88 }
89 for tc in msg.tool_calls
90 ]
91 messages.append(assistant_entry)
92
93 transcript.append(
94 {
95 "step": step + 1,
96 "content": msg.content,
97 "tool_calls": [
98 {"name": tc.function.name, "arguments": tc.function.arguments}
99 for tc in (msg.tool_calls or [])
100 ],
101 }
102 )
103
104 if msg.tool_calls:
105 for tc in msg.tool_calls:
106 if tc.function.name != "calc":
107 result = f"ERROR: unknown tool {tc.function.name}"
108 else:
109 try:
110 args = json.loads(tc.function.arguments)
111 except json.JSONDecodeError:
112 result = "ERROR: bad JSON arguments"
113 else:
114 result = calc(args.get("expression", ""))
115 transcript.append({"tool_result": {"name": tc.function.name, "result": result}})
116 messages.append(
117 {"role": "tool", "tool_call_id": tc.id, "content": result}
118 )
119 continue
120
121 # No tool call -> model produced a final answer.
122 return {"answer": (msg.content or "").strip(), "steps": step + 1, "transcript": transcript}
123
124 return {"answer": None, "steps": MAX_STEPS, "note": "MAX_STEPS reached", "transcript": transcript}
125
126
127class Handler(BaseHTTPRequestHandler):
128 def do_POST(self): # noqa: N802
129 if self.path != "/ask":
130 self.send_response(404); self.end_headers(); return
131 n = int(self.headers.get("Content-Length", "0"))
132 try:
133 body = json.loads(self.rfile.read(n) or b"{}")
134 except json.JSONDecodeError:
135 self.send_response(400); self.end_headers(); self.wfile.write(b'{"error":"invalid json"}'); return
136 q = body.get("question", "")
137 try:
138 result = run_agent(q)
139 code = 200
140 except Exception as e:
141 result = {"error": str(e), "type": type(e).__name__}
142 code = 500
143 payload = json.dumps(result).encode()
144 self.send_response(code)
145 self.send_header("Content-Type", "application/json")
146 self.send_header("Content-Length", str(len(payload)))
147 self.end_headers()
148 self.wfile.write(payload)
149
150 def do_GET(self): # noqa: N802
151 if self.path == "/health":
152 self.send_response(200); self.end_headers(); self.wfile.write(b"ok"); return
153 self.send_response(404); self.end_headers()
154
155 def log_message(self, fmt, *args):
156 import sys
157 print(f"{self.address_string()} {fmt % args}", file=sys.stderr)
158
159
160if __name__ == "__main__":
161 print(f"agent starting on :8001, model={MODEL}, backend={os.environ['OPENAI_BASE_URL']}")
162 HTTPServer(("0.0.0.0", 8001), Handler).serve_forever()
diff --git a/charts/llm-app/Chart.yaml b/charts/llm-app/Chart.yaml
new file mode 100644
index 0000000..e0747df
--- /dev/null
+++ b/charts/llm-app/Chart.yaml
@@ -0,0 +1,6 @@
1apiVersion: v2
2name: llm-app
3description: OpenAI-compatible LLM server (vLLM CPU) with Prometheus metrics
4type: application
5version: 0.1.0
6appVersion: "latest"
diff --git a/charts/llm-app/templates/_helpers.tpl b/charts/llm-app/templates/_helpers.tpl
new file mode 100644
index 0000000..8b104de
--- /dev/null
+++ b/charts/llm-app/templates/_helpers.tpl
@@ -0,0 +1,8 @@
1{{- define "llm-app.fullname" -}}
2{{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}}
3{{- end -}}
4
5{{- define "llm-app.selectorLabels" -}}
6app.kubernetes.io/name: {{ .Chart.Name }}
7app.kubernetes.io/instance: {{ .Release.Name }}
8{{- end -}}
diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml
new file mode 100644
index 0000000..12677b5
--- /dev/null
+++ b/charts/llm-app/templates/deployment.yaml
@@ -0,0 +1,76 @@
1apiVersion: apps/v1
2kind: Deployment
3metadata:
4 name: {{ include "llm-app.fullname" . }}
5spec:
6 replicas: {{ .Values.replicaCount }}
7 selector:
8 matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
9 template:
10 metadata:
11 labels: {{- include "llm-app.selectorLabels" . | nindent 8 }}
12 spec:
13 containers:
14 - name: vllm-server
15 # Image entrypoint is already `vllm serve`; args start with the model tag.
16 image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}"
17 imagePullPolicy: {{ .Values.image.pullPolicy }}
18 args:
19 - {{ .Values.model.name | quote }}
20 - "--host"
21 - "0.0.0.0"
22 - "--port"
23 - {{ .Values.server.port | quote }}
24 - "--served-model-name"
25 - {{ .Values.model.alias | quote }}
26 - "--max-model-len"
27 - {{ .Values.model.maxModelLen | quote }}
28 - "--dtype"
29 - {{ .Values.model.dtype | quote }}
30 {{- with .Values.server.extraArgs }}
31 {{- toYaml . | nindent 12 }}
32 {{- end }}
33 env:
34 - name: HF_HOME
35 value: /cache/huggingface
36 - name: VLLM_CPU_KVCACHE_SPACE
37 value: "2"
38 {{- if gt (int .Values.server.ompThreads) 0 }}
39 - name: OMP_NUM_THREADS
40 value: {{ .Values.server.ompThreads | quote }}
41 {{- end }}
42 ports:
43 - name: http
44 containerPort: {{ .Values.server.port }}
45 protocol: TCP
46 readinessProbe:
47 httpGet:
48 path: /health
49 port: http
50 # vLLM CPU cold-start is ~2 min + HF download on first boot.
51 initialDelaySeconds: 60
52 periodSeconds: 10
53 timeoutSeconds: 5
54 failureThreshold: 180
55 livenessProbe:
56 httpGet:
57 path: /health
58 port: http
59 initialDelaySeconds: 600
60 periodSeconds: 30
61 timeoutSeconds: 5
62 failureThreshold: 6
63 resources: {{- toYaml .Values.resources | nindent 12 }}
64 volumeMounts:
65 - name: cache
66 mountPath: /cache
67 - name: shm
68 mountPath: /dev/shm
69 volumes:
70 - name: cache
71 emptyDir:
72 sizeLimit: {{ .Values.modelCache.sizeLimit }}
73 - name: shm
74 emptyDir:
75 medium: Memory
76 sizeLimit: 1Gi
diff --git a/charts/llm-app/templates/ingress.yaml b/charts/llm-app/templates/ingress.yaml
new file mode 100644
index 0000000..f3a6ded
--- /dev/null
+++ b/charts/llm-app/templates/ingress.yaml
@@ -0,0 +1,19 @@
1{{- if .Values.ingress.enabled -}}
2apiVersion: networking.k8s.io/v1
3kind: Ingress
4metadata:
5 name: {{ include "llm-app.fullname" . }}
6spec:
7 ingressClassName: {{ .Values.ingress.className }}
8 rules:
9 - host: {{ .Values.ingress.host | quote }}
10 http:
11 paths:
12 - path: /
13 pathType: Prefix
14 backend:
15 service:
16 name: {{ include "llm-app.fullname" . }}
17 port:
18 number: {{ .Values.service.port }}
19{{- end }}
diff --git a/charts/llm-app/templates/service.yaml b/charts/llm-app/templates/service.yaml
new file mode 100644
index 0000000..6350996
--- /dev/null
+++ b/charts/llm-app/templates/service.yaml
@@ -0,0 +1,13 @@
1apiVersion: v1
2kind: Service
3metadata:
4 name: {{ include "llm-app.fullname" . }}
5 labels: {{- include "llm-app.selectorLabels" . | nindent 4 }}
6spec:
7 type: {{ .Values.service.type }}
8 ports:
9 - name: http
10 port: {{ .Values.service.port }}
11 targetPort: http
12 protocol: TCP
13 selector: {{- include "llm-app.selectorLabels" . | nindent 4 }}
diff --git a/charts/llm-app/templates/servicemonitor.yaml b/charts/llm-app/templates/servicemonitor.yaml
new file mode 100644
index 0000000..264e766
--- /dev/null
+++ b/charts/llm-app/templates/servicemonitor.yaml
@@ -0,0 +1,19 @@
1{{- if .Values.monitoring.serviceMonitor.enabled -}}
2apiVersion: monitoring.coreos.com/v1
3kind: ServiceMonitor
4metadata:
5 name: {{ include "llm-app.fullname" . }}
6 {{- with .Values.monitoring.serviceMonitor.labels }}
7 labels: {{- toYaml . | nindent 4 }}
8 {{- end }}
9spec:
10 selector:
11 matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }}
12 endpoints:
13 - port: http
14 path: /metrics
15 interval: {{ .Values.monitoring.serviceMonitor.interval }}
16 namespaceSelector:
17 matchNames:
18 - {{ .Release.Namespace }}
19{{- end }}
diff --git a/charts/llm-app/templates/smoketest-job.yaml b/charts/llm-app/templates/smoketest-job.yaml
new file mode 100644
index 0000000..ac97f33
--- /dev/null
+++ b/charts/llm-app/templates/smoketest-job.yaml
@@ -0,0 +1,32 @@
1apiVersion: batch/v1
2kind: Job
3metadata:
4 name: {{ include "llm-app.fullname" . }}-smoketest
5 annotations:
6 "helm.sh/hook": post-install,post-upgrade
7 "helm.sh/hook-weight": "10"
8 "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
9spec:
10 backoffLimit: 2
11 activeDeadlineSeconds: 240
12 ttlSecondsAfterFinished: 600
13 template:
14 spec:
15 restartPolicy: Never
16 containers:
17 - name: curl
18 image: curlimages/curl:8.10.1
19 command: ["/bin/sh", "-euc"]
20 args:
21 - |
22 ENDPOINT="http://{{ include "llm-app.fullname" . }}:{{ .Values.service.port }}"
23 MODEL={{ .Values.model.alias | quote }}
24 echo "smoketest: GET $ENDPOINT/v1/models"
25 out=$(curl -fsS --max-time 60 "$ENDPOINT/v1/models")
26 echo "$out" | grep -q "\"$MODEL\"" || { echo "FAIL: $MODEL not listed in /v1/models"; echo "$out"; exit 1; }
27 echo "smoketest: POST $ENDPOINT/v1/chat/completions"
28 resp=$(curl -fsS --max-time 90 "$ENDPOINT/v1/chat/completions" \
29 -H "Content-Type: application/json" \
30 -d "{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Reply with just: pong\"}],\"max_tokens\":8,\"temperature\":0}")
31 echo "$resp" | grep -q '"content"' || { echo "FAIL: no content in response"; echo "$resp"; exit 1; }
32 echo "OK"
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml
new file mode 100644
index 0000000..96c5c9a
--- /dev/null
+++ b/charts/llm-app/values.yaml
@@ -0,0 +1,51 @@
1replicaCount: 1
2
3image:
4 # vLLM CPU-only image (no CUDA, works on AVX2+).
5 repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
6 tag: latest
7 # Optional. If set, used in place of `tag` to pin the image by content.
8 # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh.
9 digest: ""
10 pullPolicy: IfNotPresent
11
12# vLLM pulls model weights from HuggingFace at first boot into the cache volume.
13# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides).
14model:
15 name: "Qwen/Qwen2.5-0.5B-Instruct"
16 alias: "Qwen2.5-0.5B-Instruct"
17 maxModelLen: 2048
18 dtype: "bfloat16"
19
20server:
21 port: 8000
22 # OMP threads for the CPU backend; 0 = autodetect.
23 ompThreads: 0
24 extraArgs: []
25
26resources:
27 requests:
28 cpu: "500m"
29 memory: "1Gi"
30 limits:
31 cpu: "2"
32 memory: "3Gi"
33
34service:
35 type: ClusterIP
36 port: 8000
37
38ingress:
39 enabled: true
40 className: nginx
41 host: llm.localtest.me
42
43monitoring:
44 serviceMonitor:
45 enabled: true
46 interval: 15s
47 labels:
48 release: kube-prometheus-stack
49
50modelCache:
51 sizeLimit: 10Gi
diff --git a/cluster/kind-config.yaml b/cluster/kind-config.yaml
new file mode 100644
index 0000000..c0306ce
--- /dev/null
+++ b/cluster/kind-config.yaml
@@ -0,0 +1,21 @@
1kind: Cluster
2apiVersion: kind.x-k8s.io/v1alpha4
3name: llm-local
4nodes:
5 - role: control-plane
6 image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f
7 kubeadmConfigPatches:
8 - |
9 kind: InitConfiguration
10 nodeRegistration:
11 kubeletExtraArgs:
12 node-labels: "ingress-ready=true"
13 extraPortMappings:
14 - containerPort: 80
15 hostPort: 8080
16 protocol: TCP
17 - containerPort: 443
18 hostPort: 8443
19 protocol: TCP
20 - role: worker
21 image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f
diff --git a/goals b/goals
new file mode 100644
index 0000000..9bbec82
--- /dev/null
+++ b/goals
@@ -0,0 +1,18 @@
1### Task
21. Stand up a local K8s cluster with `kind`, `k3d`, or `minikube`. Document exact versions.
32. Write a Helm chart (or use the upstream vLLM/SGLang chart and extend it) that deploys a small open-weights model — e.g. `Qwen2.5-0.5B-Instruct`, `Llama-3.2-1B-Instruct`, or any model that fits on CPU/small GPU. CPU-only inference is acceptable.
43. Wrap it in Terraform (or OpenTofu) using the `helm` and `kubernetes` providers.
54. Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README.
65. Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization.
76. Two environments — `dev` and `prod` — differ by at least: replica count, resource requests/limits, and model choice. Use Terraform workspaces, tfvars, or environment directories; justify your choice.
8
9Stretch Goals
10- Deploy a separate application container containing an agentic system utilizing the deployed vLLM/SGLang as the backend model server. The agent system's use-case is free to you to choose.
11- HPA based on a custom metric (e.g. queue depth or tokens/sec)
12- Image digest pinning and an `atlantis.yaml` or equivalent GitOps config
13- A smoke-test job that runs post-deploy and fails the apply if the endpoint is unhealthy
14
15You will be assessed on the following criteria:
16- the correctness of its output (stochastic functions notwithstanding);
17- how reliable, testable, modular and clean your code is;
18- other interesting add-ons you can think of.
diff --git a/scripts/resolve-digests.sh b/scripts/resolve-digests.sh
new file mode 100755
index 0000000..526d463
--- /dev/null
+++ b/scripts/resolve-digests.sh
@@ -0,0 +1,31 @@
1#!/usr/bin/env bash
2# Resolve an image tag to a content-addressable digest for pinning.
3#
4# Usage:
5# scripts/resolve-digests.sh public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
6# scripts/resolve-digests.sh # default image
7#
8# Prints three lines:
9# repo: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
10# digest: sha256:abc123...
11# pin: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo@sha256:abc123...
12#
13# Paste the digest into the env's terraform (var.image_digest) to pin.
14set -euo pipefail
15
16IMG="${1:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest}"
17
18engine=""
19if command -v podman >/dev/null 2>&1; then engine=podman
20elif command -v docker >/dev/null 2>&1; then engine=docker
21else
22 echo "need podman or docker on PATH" >&2; exit 1
23fi
24
25"$engine" pull --quiet "$IMG" >/dev/null
26digest="$("$engine" image inspect "$IMG" --format '{{.Digest}}')"
27repo="${IMG%:*}"
28
29printf 'repo: %s\n' "$repo"
30printf 'digest: %s\n' "$digest"
31printf 'pin: %s@%s\n' "$repo" "$digest"
diff --git a/terraform/envs/agent/.terraform.lock.hcl b/terraform/envs/agent/.terraform.lock.hcl
new file mode 100644
index 0000000..605df33
--- /dev/null
+++ b/terraform/envs/agent/.terraform.lock.hcl
@@ -0,0 +1,19 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/kubernetes" {
5 version = "2.38.0"
6 constraints = "~> 2.31"
7 hashes = [
8 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
9 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
10 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
11 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
12 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
13 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
14 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
15 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
16 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
17 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
18 ]
19}
diff --git a/terraform/envs/agent/backend.tf b/terraform/envs/agent/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/agent/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/agent/main.tf b/terraform/envs/agent/main.tf
new file mode 100644
index 0000000..122eaca
--- /dev/null
+++ b/terraform/envs/agent/main.tf
@@ -0,0 +1,27 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6module "agent" {
7 source = "../../modules/agent"
8
9 namespace = "agent"
10 agent_source_path = var.agent_source_path
11
12 # Point at the prod LLM. `svc.cluster.local` resolves from any namespace.
13 llm_service_url = "http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
14 model_alias = "Qwen2.5-1.5B-Instruct"
15
16 ingress_host = "agent.localtest.me"
17}
18
19output "ingress_host" { value = module.agent.ingress_host }
20output "service_dns" { value = module.agent.service_dns }
21output "curl_example" {
22 value = <<-EOT
23 curl -s http://${module.agent.ingress_host}:8080/ask \
24 -H 'Content-Type: application/json' \
25 -d '{"question":"what is 123 * 47?"}'
26 EOT
27}
diff --git a/terraform/envs/agent/variables.tf b/terraform/envs/agent/variables.tf
new file mode 100644
index 0000000..bf005b9
--- /dev/null
+++ b/terraform/envs/agent/variables.tf
@@ -0,0 +1,14 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "agent_source_path" {
12 type = string
13 description = "Absolute path to agent/agent.py"
14}
diff --git a/terraform/envs/agent/versions.tf b/terraform/envs/agent/versions.tf
new file mode 100644
index 0000000..69cf77e
--- /dev/null
+++ b/terraform/envs/agent/versions.tf
@@ -0,0 +1,6 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
5 }
6}
diff --git a/terraform/envs/bootstrap/.terraform.lock.hcl b/terraform/envs/bootstrap/.terraform.lock.hcl
new file mode 100644
index 0000000..baa0088
--- /dev/null
+++ b/terraform/envs/bootstrap/.terraform.lock.hcl
@@ -0,0 +1,37 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/helm" {
5 version = "2.17.0"
6 constraints = "~> 2.17"
7 hashes = [
8 "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=",
9 "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b",
10 "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a",
11 "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0",
12 "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe",
13 "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4",
14 "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25",
15 "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d",
16 "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978",
17 "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb",
18 "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0",
19 ]
20}
21
22provider "registry.opentofu.org/hashicorp/kubernetes" {
23 version = "2.38.0"
24 constraints = "~> 2.31"
25 hashes = [
26 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
27 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
28 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
29 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
30 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
31 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
32 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
33 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
34 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
35 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
36 ]
37}
diff --git a/terraform/envs/bootstrap/backend.tf b/terraform/envs/bootstrap/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/bootstrap/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/bootstrap/main.tf b/terraform/envs/bootstrap/main.tf
new file mode 100644
index 0000000..07bf04d
--- /dev/null
+++ b/terraform/envs/bootstrap/main.tf
@@ -0,0 +1,25 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6provider "helm" {
7 kubernetes {
8 config_path = pathexpand(var.kubeconfig)
9 config_context = var.kube_context
10 }
11}
12
13module "observability" {
14 source = "../../modules/observability"
15 namespace = "monitoring"
16 grafana_admin_password = var.grafana_admin_password
17}
18
19output "grafana" {
20 value = module.observability.grafana_service
21}
22
23output "prometheus" {
24 value = module.observability.prometheus_service
25}
diff --git a/terraform/envs/bootstrap/variables.tf b/terraform/envs/bootstrap/variables.tf
new file mode 100644
index 0000000..220bed3
--- /dev/null
+++ b/terraform/envs/bootstrap/variables.tf
@@ -0,0 +1,15 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "grafana_admin_password" {
12 type = string
13 default = "admin"
14 sensitive = true
15}
diff --git a/terraform/envs/bootstrap/versions.tf b/terraform/envs/bootstrap/versions.tf
new file mode 100644
index 0000000..0d7f77b
--- /dev/null
+++ b/terraform/envs/bootstrap/versions.tf
@@ -0,0 +1,7 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 helm = { source = "hashicorp/helm", version = "~> 2.17" }
5 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
6 }
7}
diff --git a/terraform/envs/dev/.terraform.lock.hcl b/terraform/envs/dev/.terraform.lock.hcl
new file mode 100644
index 0000000..09902a1
--- /dev/null
+++ b/terraform/envs/dev/.terraform.lock.hcl
@@ -0,0 +1,54 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/helm" {
5 version = "2.17.0"
6 constraints = "~> 2.17"
7 hashes = [
8 "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=",
9 "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b",
10 "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a",
11 "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0",
12 "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe",
13 "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4",
14 "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25",
15 "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d",
16 "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978",
17 "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb",
18 "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0",
19 ]
20}
21
22provider "registry.opentofu.org/hashicorp/kubernetes" {
23 version = "2.38.0"
24 constraints = "~> 2.31"
25 hashes = [
26 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
27 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
28 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
29 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
30 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
31 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
32 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
33 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
34 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
35 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
36 ]
37}
38
39provider "registry.opentofu.org/hashicorp/random" {
40 version = "3.8.1"
41 constraints = "~> 3.6"
42 hashes = [
43 "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=",
44 "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8",
45 "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476",
46 "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7",
47 "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3",
48 "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0",
49 "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1",
50 "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f",
51 "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9",
52 "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff",
53 ]
54}
diff --git a/terraform/envs/dev/backend.tf b/terraform/envs/dev/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/dev/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/dev/main.tf b/terraform/envs/dev/main.tf
new file mode 100644
index 0000000..8e1b882
--- /dev/null
+++ b/terraform/envs/dev/main.tf
@@ -0,0 +1,49 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6provider "helm" {
7 kubernetes {
8 config_path = pathexpand(var.kubeconfig)
9 config_context = var.kube_context
10 }
11}
12
13locals {
14 env = "dev"
15}
16
17module "llm" {
18 source = "../../modules/llm"
19
20 release_name = "llm"
21 namespace = "llm-${local.env}"
22 chart_path = var.chart_path
23
24 replicas = 2
25
26 model_name = "Qwen/Qwen2.5-0.5B-Instruct"
27 model_alias = "Qwen2.5-0.5B-Instruct"
28 max_model_len = 2048
29 dtype = "bfloat16"
30 omp_threads = 4
31
32 resources = {
33 requests = { cpu = "1", memory = "2Gi" }
34 limits = { cpu = "4", memory = "6Gi" }
35 }
36
37 ingress_host = "llm.dev.localtest.me"
38 image_tag = "latest"
39}
40
41output "ingress_host" { value = module.llm.ingress_host }
42output "service_dns" { value = module.llm.service_dns }
43output "curl_example" {
44 value = <<-EOT
45 curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
46 -H 'Content-Type: application/json' \
47 -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
48 EOT
49}
diff --git a/terraform/envs/dev/variables.tf b/terraform/envs/dev/variables.tf
new file mode 100644
index 0000000..9f1b697
--- /dev/null
+++ b/terraform/envs/dev/variables.tf
@@ -0,0 +1,14 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "chart_path" {
12 type = string
13 description = "Absolute path to charts/llm-app"
14}
diff --git a/terraform/envs/dev/versions.tf b/terraform/envs/dev/versions.tf
new file mode 100644
index 0000000..6a87674
--- /dev/null
+++ b/terraform/envs/dev/versions.tf
@@ -0,0 +1,8 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 helm = { source = "hashicorp/helm", version = "~> 2.17" }
5 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
6 random = { source = "hashicorp/random", version = "~> 3.6" }
7 }
8}
diff --git a/terraform/envs/prod/.terraform.lock.hcl b/terraform/envs/prod/.terraform.lock.hcl
new file mode 100644
index 0000000..09902a1
--- /dev/null
+++ b/terraform/envs/prod/.terraform.lock.hcl
@@ -0,0 +1,54 @@
1# This file is maintained automatically by "tofu init".
2# Manual edits may be lost in future updates.
3
4provider "registry.opentofu.org/hashicorp/helm" {
5 version = "2.17.0"
6 constraints = "~> 2.17"
7 hashes = [
8 "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=",
9 "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b",
10 "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a",
11 "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0",
12 "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe",
13 "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4",
14 "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25",
15 "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d",
16 "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978",
17 "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb",
18 "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0",
19 ]
20}
21
22provider "registry.opentofu.org/hashicorp/kubernetes" {
23 version = "2.38.0"
24 constraints = "~> 2.31"
25 hashes = [
26 "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
27 "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
28 "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
29 "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
30 "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
31 "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
32 "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
33 "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
34 "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
35 "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
36 ]
37}
38
39provider "registry.opentofu.org/hashicorp/random" {
40 version = "3.8.1"
41 constraints = "~> 3.6"
42 hashes = [
43 "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=",
44 "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8",
45 "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476",
46 "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7",
47 "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3",
48 "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0",
49 "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1",
50 "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f",
51 "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9",
52 "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff",
53 ]
54}
diff --git a/terraform/envs/prod/backend.tf b/terraform/envs/prod/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/prod/backend.tf
@@ -0,0 +1,5 @@
1terraform {
2 backend "local" {
3 path = "terraform.tfstate"
4 }
5}
diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf
new file mode 100644
index 0000000..04db31d
--- /dev/null
+++ b/terraform/envs/prod/main.tf
@@ -0,0 +1,70 @@
1provider "kubernetes" {
2 config_path = pathexpand(var.kubeconfig)
3 config_context = var.kube_context
4}
5
6provider "helm" {
7 kubernetes {
8 config_path = pathexpand(var.kubeconfig)
9 config_context = var.kube_context
10 }
11}
12
13locals {
14 env = "prod"
15}
16
17module "llm" {
18 source = "../../modules/llm"
19
20 release_name = "llm"
21 namespace = "llm-${local.env}"
22 chart_path = var.chart_path
23
24 replicas = 1
25
26 model_name = "Qwen/Qwen2.5-1.5B-Instruct"
27 model_alias = "Qwen2.5-1.5B-Instruct"
28 max_model_len = 4096
29 dtype = "bfloat16"
30 omp_threads = 6
31
32 resources = {
33 requests = { cpu = "2", memory = "4Gi" }
34 limits = { cpu = "6", memory = "8Gi" }
35 }
36
37 ingress_host = "llm.prod.localtest.me"
38 image_tag = "latest"
39 # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64.
40 # Per-arch digest — re-resolve on a different arch or after an upstream tag move.
41 # Dev intentionally runs on `:latest` so new fixes flow in without a PR.
42 image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"
43
44 # Enable OpenAI tool-calling so the agent's function-call path works.
45 # Qwen 2.5 uses hermes-style tool parsing in vLLM.
46 extra_args = [
47 "--enable-auto-tool-choice",
48 "--tool-call-parser", "hermes",
49 ]
50
51 hpa = {
52 enabled = true
53 min_replicas = 1
54 max_replicas = 3
55 # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight
56 # requests. Scale up when >50% of pods are actively serving.
57 metric_name = "vllm:num_requests_running"
58 target_average_value = "500m"
59 }
60}
61
62output "ingress_host" { value = module.llm.ingress_host }
63output "service_dns" { value = module.llm.service_dns }
64output "curl_example" {
65 value = <<-EOT
66 curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
67 -H 'Content-Type: application/json' \
68 -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
69 EOT
70}
diff --git a/terraform/envs/prod/variables.tf b/terraform/envs/prod/variables.tf
new file mode 100644
index 0000000..9f1b697
--- /dev/null
+++ b/terraform/envs/prod/variables.tf
@@ -0,0 +1,14 @@
1variable "kubeconfig" {
2 type = string
3 default = "~/.kube/config"
4}
5
6variable "kube_context" {
7 type = string
8 default = "kind-llm-local"
9}
10
11variable "chart_path" {
12 type = string
13 description = "Absolute path to charts/llm-app"
14}
diff --git a/terraform/envs/prod/versions.tf b/terraform/envs/prod/versions.tf
new file mode 100644
index 0000000..6a87674
--- /dev/null
+++ b/terraform/envs/prod/versions.tf
@@ -0,0 +1,8 @@
1terraform {
2 required_version = ">= 1.6.0"
3 required_providers {
4 helm = { source = "hashicorp/helm", version = "~> 2.17" }
5 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
6 random = { source = "hashicorp/random", version = "~> 3.6" }
7 }
8}
diff --git a/terraform/modules/agent/main.tf b/terraform/modules/agent/main.tf
new file mode 100644
index 0000000..f53acdc
--- /dev/null
+++ b/terraform/modules/agent/main.tf
@@ -0,0 +1,114 @@
1resource "kubernetes_namespace_v1" "agent" {
2 metadata {
3 name = var.namespace
4 labels = {
5 "app.kubernetes.io/part-of" = "llm-platform"
6 }
7 }
8}
9
10resource "kubernetes_deployment_v1" "agent" {
11 metadata {
12 name = "agent"
13 namespace = kubernetes_namespace_v1.agent.metadata[0].name
14 labels = { app = "agent" }
15 }
16 spec {
17 replicas = 1
18 selector {
19 match_labels = { app = "agent" }
20 }
21 template {
22 metadata {
23 labels = { app = "agent" }
24 annotations = {
25 # Bounce the pod when agent.py changes on disk, even if image tag is unchanged.
26 "checksum/code" = substr(sha256(file(var.agent_source_path)), 0, 16)
27 }
28 }
29 spec {
30 container {
31 name = "agent"
32 image = var.agent_image
33 image_pull_policy = "IfNotPresent"
34 env {
35 name = "OPENAI_BASE_URL"
36 value = var.llm_service_url
37 }
38 env {
39 name = "MODEL"
40 value = var.model_alias
41 }
42 port {
43 name = "http"
44 container_port = 8001
45 }
46 readiness_probe {
47 http_get {
48 path = "/health"
49 port = "http"
50 }
51 initial_delay_seconds = 3
52 period_seconds = 5
53 failure_threshold = 10
54 }
55 liveness_probe {
56 http_get {
57 path = "/health"
58 port = "http"
59 }
60 initial_delay_seconds = 30
61 period_seconds = 30
62 }
63 resources {
64 requests = { cpu = "100m", memory = "128Mi" }
65 limits = { cpu = "1", memory = "512Mi" }
66 }
67 }
68 }
69 }
70 }
71}
72
73resource "kubernetes_service_v1" "agent" {
74 metadata {
75 name = "agent"
76 namespace = kubernetes_namespace_v1.agent.metadata[0].name
77 labels = { app = "agent" }
78 }
79 spec {
80 selector = { app = "agent" }
81 port {
82 name = "http"
83 port = 8001
84 target_port = "http"
85 }
86 }
87}
88
89resource "kubernetes_ingress_v1" "agent" {
90 metadata {
91 name = "agent"
92 namespace = kubernetes_namespace_v1.agent.metadata[0].name
93 }
94 spec {
95 ingress_class_name = var.ingress_class
96 rule {
97 host = var.ingress_host
98 http {
99 path {
100 path = "/"
101 path_type = "Prefix"
102 backend {
103 service {
104 name = kubernetes_service_v1.agent.metadata[0].name
105 port {
106 number = 8001
107 }
108 }
109 }
110 }
111 }
112 }
113 }
114}
diff --git a/terraform/modules/agent/outputs.tf b/terraform/modules/agent/outputs.tf
new file mode 100644
index 0000000..ac9932b
--- /dev/null
+++ b/terraform/modules/agent/outputs.tf
@@ -0,0 +1,11 @@
1output "service_dns" {
2 value = "${kubernetes_service_v1.agent.metadata[0].name}.${kubernetes_namespace_v1.agent.metadata[0].name}.svc.cluster.local"
3}
4
5output "ingress_host" {
6 value = var.ingress_host
7}
8
9output "namespace" {
10 value = kubernetes_namespace_v1.agent.metadata[0].name
11}
diff --git a/terraform/modules/agent/variables.tf b/terraform/modules/agent/variables.tf
new file mode 100644
index 0000000..6f525ee
--- /dev/null
+++ b/terraform/modules/agent/variables.tf
@@ -0,0 +1,33 @@
1variable "namespace" {
2 type = string
3}
4
5variable "agent_source_path" {
6 type = string
7 description = "Absolute path to agent/agent.py. Used only to bounce pods on code change."
8}
9
10variable "agent_image" {
11 type = string
12 default = "localhost/agent:0.1.0"
13 description = "Pre-built agent image. Must be loaded into kind with `make up-agent`."
14}
15
16variable "llm_service_url" {
17 type = string
18 description = "OpenAI-compatible base URL, e.g. http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
19}
20
21variable "model_alias" {
22 type = string
23 default = "Qwen2.5-1.5B-Instruct"
24}
25
26variable "ingress_host" {
27 type = string
28}
29
30variable "ingress_class" {
31 type = string
32 default = "nginx"
33}
diff --git a/terraform/modules/agent/versions.tf b/terraform/modules/agent/versions.tf
new file mode 100644
index 0000000..4242705
--- /dev/null
+++ b/terraform/modules/agent/versions.tf
@@ -0,0 +1,5 @@
1terraform {
2 required_providers {
3 kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" }
4 }
5}
diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf
new file mode 100644
index 0000000..cd22019
--- /dev/null
+++ b/terraform/modules/llm/main.tf
@@ -0,0 +1,99 @@
1resource "kubernetes_namespace_v1" "this" {
2 metadata {
3 name = var.namespace
4 labels = {
5 "app.kubernetes.io/part-of" = "llm-platform"
6 }
7 }
8}
9
10resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
11 count = var.hpa.enabled ? 1 : 0
12
13 metadata {
14 name = "${var.release_name}-llm-app"
15 namespace = kubernetes_namespace_v1.this.metadata[0].name
16 }
17 spec {
18 scale_target_ref {
19 api_version = "apps/v1"
20 kind = "Deployment"
21 name = "${var.release_name}-llm-app"
22 }
23 min_replicas = var.hpa.min_replicas
24 max_replicas = var.hpa.max_replicas
25
26 metric {
27 type = "Pods"
28 pods {
29 metric {
30 name = var.hpa.metric_name
31 }
32 target {
33 type = "AverageValue"
34 average_value = var.hpa.target_average_value
35 }
36 }
37 }
38 }
39
40 depends_on = [helm_release.llm]
41}
42
43resource "helm_release" "llm" {
44 name = var.release_name
45 chart = var.chart_path
46 namespace = kubernetes_namespace_v1.this.metadata[0].name
47 create_namespace = false
48 atomic = false
49 wait = true
50 timeout = 1800
51
52 values = [
53 yamlencode({
54 replicaCount = var.replicas
55
56 image = {
57 repository = var.image_repository
58 tag = var.image_tag
59 digest = var.image_digest
60 pullPolicy = "IfNotPresent"
61 }
62
63 model = {
64 name = var.model_name
65 alias = var.model_alias
66 maxModelLen = var.max_model_len
67 dtype = var.dtype
68 }
69
70 server = {
71 port = 8000
72 ompThreads = var.omp_threads
73 extraArgs = var.extra_args
74 }
75
76 resources = var.resources
77
78 ingress = {
79 enabled = true
80 className = var.ingress_class
81 host = var.ingress_host
82 }
83
84 monitoring = {
85 serviceMonitor = {
86 enabled = true
87 interval = "15s"
88 labels = {
89 release = var.service_monitor_release_label
90 }
91 }
92 }
93
94 modelCache = {
95 sizeLimit = var.model_cache_size
96 }
97 }),
98 ]
99}
diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf
new file mode 100644
index 0000000..a953e73
--- /dev/null
+++ b/terraform/modules/llm/outputs.tf
@@ -0,0 +1,12 @@
1output "service_dns" {
2 value = "${var.release_name}-llm-app.${var.namespace}.svc.cluster.local"
3 description = "In-cluster DNS name for the LLM Service."
4}
5
6output "ingress_host" {
7 value = var.ingress_host
8}
9
10output "namespace" {
11 value = kubernetes_namespace_v1.this.metadata[0].name
12}
diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf
new file mode 100644
index 0000000..3a7d8f7
--- /dev/null
+++ b/terraform/modules/llm/variables.tf
@@ -0,0 +1,112 @@
1variable "release_name" {
2 type = string
3 description = "Helm release name."
4}
5
6variable "namespace" {
7 type = string
8 description = "Kubernetes namespace to deploy into."
9}
10
11variable "chart_path" {
12 type = string
13 description = "Path to the local llm-app chart."
14}
15
16variable "replicas" {
17 type = number
18 default = 1
19}
20
21variable "model_name" {
22 type = string
23 description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
24}
25
26variable "model_alias" {
27 type = string
28 description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
29}
30
31variable "max_model_len" {
32 type = number
33 default = 2048
34}
35
36variable "dtype" {
37 type = string
38 default = "bfloat16"
39}
40
41variable "omp_threads" {
42 type = number
43 default = 0
44 description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."
45}
46
47variable "extra_args" {
48 type = list(string)
49 default = []
50 description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
51}
52
53variable "resources" {
54 type = object({
55 requests = object({ cpu = string, memory = string })
56 limits = object({ cpu = string, memory = string })
57 })
58}
59
60variable "ingress_host" {
61 type = string
62}
63
64variable "ingress_class" {
65 type = string
66 default = "nginx"
67}
68
69variable "image_repository" {
70 type = string
71 default = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
72}
73
74variable "image_tag" {
75 type = string
76 default = "latest"
77 description = "Used only when image_digest is empty."
78}
79
80variable "image_digest" {
81 type = string
82 default = ""
83 description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."
84}
85
86variable "service_monitor_release_label" {
87 type = string
88 default = "kube-prometheus-stack"
89 description = "Must match the release label the Prometheus Operator selects on."
90}
91
92variable "model_cache_size" {
93 type = string
94 default = "10Gi"
95}
96
97variable "hpa" {
98 type = object({
99 enabled = bool
100 min_replicas = number
101 max_replicas = number
102 metric_name = string
103 target_average_value = string
104 })
105 default = {
106 enabled = false
107 min_replicas = 1
108 max_replicas = 3
109 metric_name = "vllm:num_requests_running"
110 target_average_value = "500m"
111 }
112}
diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf
new file mode 100644
index 0000000..2f88f2e
--- /dev/null
+++ b/terraform/modules/observability/main.tf
@@ -0,0 +1,156 @@
1resource "kubernetes_namespace_v1" "monitoring" {
2 metadata {
3 name = var.namespace
4 }
5}
6
7resource "kubernetes_namespace_v1" "ingress" {
8 metadata {
9 name = "ingress-nginx"
10 }
11}
12
13resource "helm_release" "ingress_nginx" {
14 name = "ingress-nginx"
15 repository = "https://kubernetes.github.io/ingress-nginx"
16 chart = "ingress-nginx"
17 version = var.ingress_nginx_version
18 namespace = kubernetes_namespace_v1.ingress.metadata[0].name
19 wait = true
20 timeout = 300
21
22 values = [
23 yamlencode({
24 controller = {
25 hostPort = { enabled = true, ports = { http = 80, https = 443 } }
26 service = { type = "NodePort" }
27 nodeSelector = {
28 "ingress-ready" = "true"
29 }
30 tolerations = [
31 { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" },
32 { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" },
33 ]
34 publishService = { enabled = false }
35 admissionWebhooks = { enabled = false } # speeds up kind cluster installs
36 # Cap worker_processes so nginx doesn't try to spawn 14 threads under
37 # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it
38 # sometimes hits pthread EAGAIN and workers die without respawn.
39 config = {
40 "worker-processes" = "4"
41 }
42 }
43 }),
44 ]
45}
46
47resource "helm_release" "kps" {
48 name = "kube-prometheus-stack"
49 repository = "https://prometheus-community.github.io/helm-charts"
50 chart = "kube-prometheus-stack"
51 version = var.kps_version
52 namespace = kubernetes_namespace_v1.monitoring.metadata[0].name
53 wait = true
54 timeout = 600
55
56 values = [
57 yamlencode({
58 fullnameOverride = "kps"
59 prometheus = {
60 prometheusSpec = {
61 # Let Prometheus pick up ServiceMonitors from any namespace matching
62 # the release=kube-prometheus-stack label (the chart's default).
63 serviceMonitorSelectorNilUsesHelmValues = false
64 podMonitorSelectorNilUsesHelmValues = false
65 ruleSelectorNilUsesHelmValues = false
66 retention = "2d"
67 resources = {
68 requests = { cpu = "100m", memory = "400Mi" }
69 limits = { memory = "1Gi" }
70 }
71 }
72 ingress = {
73 enabled = true
74 ingressClassName = "nginx"
75 hosts = ["prom.localtest.me"]
76 }
77 }
78 alertmanager = { enabled = false }
79 grafana = {
80 adminPassword = var.grafana_admin_password
81 sidecar = {
82 dashboards = {
83 enabled = true
84 label = "grafana_dashboard"
85 labelValue = "1"
86 searchNamespace = "ALL"
87 }
88 }
89 service = { type = "ClusterIP" }
90 ingress = {
91 enabled = true
92 ingressClassName = "nginx"
93 hosts = ["grafana.localtest.me"]
94 }
95 }
96 }),
97 ]
98}
99
100resource "helm_release" "prometheus_adapter" {
101 name = "prometheus-adapter"
102 repository = "https://prometheus-community.github.io/helm-charts"
103 chart = "prometheus-adapter"
104 version = var.prometheus_adapter_version
105 namespace = kubernetes_namespace_v1.monitoring.metadata[0].name
106 wait = true
107 timeout = 300
108
109 values = [
110 yamlencode({
111 prometheus = {
112 url = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
113 port = 9090
114 }
115 rules = {
116 default = false
117 custom = [
118 {
119 # In-flight request count per pod; basis for autoscaling.
120 # vLLM exposes this as a gauge per model-engine.
121 seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
122 resources = {
123 overrides = {
124 namespace = { resource = "namespace" }
125 pod = { resource = "pod" }
126 }
127 }
128 name = {
129 matches = "^vllm:num_requests_running$"
130 as = "vllm:num_requests_running"
131 }
132 metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
133 },
134 {
135 # Waiting (queued) requests per pod — an alternative scale signal.
136 seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}"
137 resources = {
138 overrides = {
139 namespace = { resource = "namespace" }
140 pod = { resource = "pod" }
141 }
142 }
143 name = {
144 matches = "^vllm:num_requests_waiting$"
145 as = "vllm:num_requests_waiting"
146 }
147 metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
148 },
149 ]
150 }
151 }),
152 ]
153
154 depends_on = [helm_release.kps]
155}
156
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf
new file mode 100644
index 0000000..06a507d
--- /dev/null
+++ b/terraform/modules/observability/outputs.tf
@@ -0,0 +1,11 @@
1output "namespace" {
2 value = kubernetes_namespace_v1.monitoring.metadata[0].name
3}
4
5output "grafana_service" {
6 value = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
7}
8
9output "prometheus_service" {
10 value = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
11}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf
new file mode 100644
index 0000000..6aeaca3
--- /dev/null
+++ b/terraform/modules/observability/variables.tf
@@ -0,0 +1,27 @@
1variable "namespace" {
2 type = string
3 default = "monitoring"
4}
5
6variable "kps_version" {
7 type = string
8 default = "65.5.1"
9 description = "kube-prometheus-stack chart version."
10}
11
12variable "ingress_nginx_version" {
13 type = string
14 default = "4.11.3"
15 description = "ingress-nginx chart version."
16}
17
18variable "grafana_admin_password" {
19 type = string
20 default = "admin"
21 sensitive = true
22}
23
24variable "prometheus_adapter_version" {
25 type = string
26 default = "4.11.0"
27}
diff --git a/tests/smoke.sh b/tests/smoke.sh
new file mode 100755
index 0000000..a5ef23d
--- /dev/null
+++ b/tests/smoke.sh
@@ -0,0 +1,38 @@
1#!/usr/bin/env bash
2# Smoke test for the OpenAI-compatible LLM endpoint.
3# Usage:
4# ENDPOINT=http://llm.dev.localtest.me:8080 MODEL=Qwen2.5-0.5B-Instruct ./tests/smoke.sh
5set -euo pipefail
6
7ENDPOINT="${ENDPOINT:-http://llm.dev.localtest.me:8080}"
8MODEL="${MODEL:-Qwen2.5-0.5B-Instruct}"
9TIMEOUT="${TIMEOUT:-120}"
10
11say() { printf '\033[1;34m==>\033[0m %s\n' "$*"; }
12fail() { printf '\033[1;31mFAIL\033[0m %s\n' "$*" >&2; exit 1; }
13
14say "Endpoint: $ENDPOINT"
15say "Model: $MODEL"
16
17say "GET /v1/models"
18models_json="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/models")" || fail "/v1/models unreachable"
19echo "$models_json" | grep -q "$MODEL" || fail "/v1/models does not list $MODEL"
20
21say "POST /v1/chat/completions"
22resp="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/chat/completions" \
23 -H 'Content-Type: application/json' \
24 -d "$(cat <<EOF
25{
26 "model": "$MODEL",
27 "messages": [{"role": "user", "content": "Reply with the single word: pong"}],
28 "max_tokens": 8,
29 "temperature": 0
30}
31EOF
32)")" || fail "chat completion request failed"
33
34content="$(echo "$resp" | python3 -c 'import sys, json; print(json.load(sys.stdin)["choices"][0]["message"]["content"])')"
35echo "model reply: $content"
36[[ -n "$content" ]] || fail "empty completion content"
37
38say "OK"