From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: Local K8s LLM demo: kind + OpenTofu + vLLM (dev/prod envs, observability, HPA, tool-using agent)

---
 .gitignore | 29 +++
 Makefile | 78 +++++++++++
 README.txt | 90 +++++++++++++
 agent/Dockerfile | 6 +
 agent/agent.py | 162 +++++++++++++++++++++
 charts/llm-app/Chart.yaml | 6 +
 charts/llm-app/templates/_helpers.tpl | 8 ++
 charts/llm-app/templates/deployment.yaml | 76 +++++++++++
 charts/llm-app/templates/ingress.yaml | 19 ++++
 charts/llm-app/templates/service.yaml | 13 +++
 charts/llm-app/templates/servicemonitor.yaml | 19 ++++
 charts/llm-app/templates/smoketest-job.yaml | 32 ++++++
 charts/llm-app/values.yaml | 51 +++++++
 cluster/kind-config.yaml | 21 ++++
 goals | 18 +++
 scripts/resolve-digests.sh | 31 +++++
 terraform/envs/agent/.terraform.lock.hcl | 19 ++++
 terraform/envs/agent/backend.tf | 5 +
 terraform/envs/agent/main.tf | 27 +++++
 terraform/envs/agent/variables.tf | 14 +++
 terraform/envs/agent/versions.tf | 6 +
 terraform/envs/bootstrap/.terraform.lock.hcl | 37 ++++++
 terraform/envs/bootstrap/backend.tf | 5 +
 terraform/envs/bootstrap/main.tf | 25 +++++
 terraform/envs/bootstrap/variables.tf | 15 +++
 terraform/envs/bootstrap/versions.tf | 7 ++
 terraform/envs/dev/.terraform.lock.hcl | 54 +++++++++
 terraform/envs/dev/backend.tf | 5 +
 terraform/envs/dev/main.tf | 49 ++++++
 terraform/envs/dev/variables.tf | 14 +++
 terraform/envs/dev/versions.tf | 8 ++
 terraform/envs/prod/.terraform.lock.hcl | 54 +++++++++
 terraform/envs/prod/backend.tf | 5 +
 terraform/envs/prod/main.tf | 70 ++++++++++
 terraform/envs/prod/variables.tf | 14 +++
 terraform/envs/prod/versions.tf | 8 ++
 terraform/modules/agent/main.tf | 114 +++++++++++++
 terraform/modules/agent/outputs.tf | 11 ++
 terraform/modules/agent/variables.tf | 33 ++++++
 terraform/modules/agent/versions.tf | 5 +
 terraform/modules/llm/main.tf | 99 ++++++++++
 terraform/modules/llm/outputs.tf | 12 ++
 terraform/modules/llm/variables.tf | 112 ++++++++++++
 terraform/modules/observability/main.tf | 156 ++++++++++++++++
 terraform/modules/observability/outputs.tf | 11 ++
 terraform/modules/observability/variables.tf | 27 +++++
 tests/smoke.sh | 38 +++++++
 47 files changed, 1718 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 README.txt
 create mode 100644 agent/Dockerfile
 create mode 100644 agent/agent.py
 create mode 100644 charts/llm-app/Chart.yaml
 create mode 100644 charts/llm-app/templates/_helpers.tpl
 create mode 100644 charts/llm-app/templates/deployment.yaml
 create mode 100644 charts/llm-app/templates/ingress.yaml
 create mode 100644 charts/llm-app/templates/service.yaml
 create mode 100644 charts/llm-app/templates/servicemonitor.yaml
 create mode 100644 charts/llm-app/templates/smoketest-job.yaml
 create mode 100644 charts/llm-app/values.yaml
 create mode 100644 cluster/kind-config.yaml
 create mode 100644 goals
 create mode 100755 scripts/resolve-digests.sh
 create mode 100644 terraform/envs/agent/.terraform.lock.hcl
 create mode 100644 terraform/envs/agent/backend.tf
 create mode 100644 terraform/envs/agent/main.tf
 create mode 100644 terraform/envs/agent/variables.tf
 create mode 100644 terraform/envs/agent/versions.tf
 create mode 100644 terraform/envs/bootstrap/.terraform.lock.hcl
 create mode 100644 terraform/envs/bootstrap/backend.tf
 create mode 100644 terraform/envs/bootstrap/main.tf
 create mode 100644 terraform/envs/bootstrap/variables.tf
 create mode 100644 terraform/envs/bootstrap/versions.tf
 create mode 100644
terraform/envs/dev/.terraform.lock.hcl create mode 100644 terraform/envs/dev/backend.tf create mode 100644 terraform/envs/dev/main.tf create mode 100644 terraform/envs/dev/variables.tf create mode 100644 terraform/envs/dev/versions.tf create mode 100644 terraform/envs/prod/.terraform.lock.hcl create mode 100644 terraform/envs/prod/backend.tf create mode 100644 terraform/envs/prod/main.tf create mode 100644 terraform/envs/prod/variables.tf create mode 100644 terraform/envs/prod/versions.tf create mode 100644 terraform/modules/agent/main.tf create mode 100644 terraform/modules/agent/outputs.tf create mode 100644 terraform/modules/agent/variables.tf create mode 100644 terraform/modules/agent/versions.tf create mode 100644 terraform/modules/llm/main.tf create mode 100644 terraform/modules/llm/outputs.tf create mode 100644 terraform/modules/llm/variables.tf create mode 100644 terraform/modules/observability/main.tf create mode 100644 terraform/modules/observability/outputs.tf create mode 100644 terraform/modules/observability/variables.tf create mode 100755 tests/smoke.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..747a966 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Terraform +.terraform/ +terraform.tfstate +terraform.tfstate.* +*.tfvars +*.tfvars.json +crash.log +crash.*.log +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +.pytest_cache/ +.venv/ +venv/ + +# OS +.DS_Store +Thumbs.db + +# Editors +.idea/ +.vscode/ +*.swp diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..15c0031 --- /dev/null +++ b/Makefile @@ -0,0 +1,78 @@ +REPO_ROOT := $(abspath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +CHART_PATH := $(REPO_ROOT)/charts/llm-app +AGENT_SRC := $(REPO_ROOT)/agent/agent.py +AGENT_IMG := localhost/agent:0.1.0 +CLUSTER := llm-local +CONTEXT := kind-$(CLUSTER) + +export KIND_EXPERIMENTAL_PROVIDER=podman + +.PHONY: help +help: + @echo "Targets:" + @echo " up-dev deploy dev LLM (Qwen2.5-0.5B, 2 replicas)" + @echo " up-prod deploy prod LLM (Qwen2.5-1.5B, 1 replica + HPA 1->3)" + @echo " up-agent up-prod + tool-using agent" + @echo " ask Q='...' 
POST a question to the agent" + @echo " down destroy everything + delete kind cluster" + @echo "" + @echo "URLs (after up-dev/up-prod):" + @echo " Grafana http://grafana.localtest.me:8080 (admin/admin)" + @echo " curl -f http://grafana.localtest.me:8080/api/health" + @echo " Prometheus http://prom.localtest.me:8080" + @echo " curl -f http://prom.localtest.me:8080/-/healthy" + @echo "" + +.PHONY: up-dev +up-dev: + @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml + cd $(REPO_ROOT)/terraform/envs/bootstrap && \ + tofu init -upgrade && \ + tofu apply -auto-approve \ + -var kube_context=$(CONTEXT) + cd $(REPO_ROOT)/terraform/envs/dev && \ + tofu init -upgrade && \ + tofu apply -auto-approve \ + -var kube_context=$(CONTEXT) \ + -var chart_path=$(CHART_PATH) + +.PHONY: up-prod +up-prod: + @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml + cd $(REPO_ROOT)/terraform/envs/bootstrap && \ + tofu init -upgrade && \ + tofu apply -auto-approve \ + -var kube_context=$(CONTEXT) + cd $(REPO_ROOT)/terraform/envs/prod && \ + tofu init -upgrade && \ + tofu apply -auto-approve \ + -var kube_context=$(CONTEXT) \ + -var chart_path=$(CHART_PATH) + +.PHONY: up-agent +up-agent: up-prod + podman build -t $(AGENT_IMG) $(REPO_ROOT)/agent/ + @tmp=$$(mktemp -t agent-XXXXXX.tar); \ + podman save $(AGENT_IMG) -o $$tmp && \ + kind load image-archive $$tmp --name $(CLUSTER) && \ + rm -f $$tmp + cd $(REPO_ROOT)/terraform/envs/agent && \ + tofu init -upgrade && \ + tofu apply -auto-approve \ + -var kube_context=$(CONTEXT) \ + -var agent_source_path=$(AGENT_SRC) + +.PHONY: ask +ask: + @if [ -z "$(Q)" ]; then echo "usage: make ask Q='what is 17*23?'"; exit 1; fi + curl -s http://agent.localtest.me:8080/ask \ + -H 'Content-Type: application/json' \ + -d "$(shell printf '{"question":"%s"}' "$(Q)")" | python3 -m json.tool + +.PHONY: down +down: + -cd $(REPO_ROOT)/terraform/envs/agent && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var agent_source_path=$(AGENT_SRC) || true + -cd $(REPO_ROOT)/terraform/envs/prod && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true + -cd $(REPO_ROOT)/terraform/envs/dev && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true + -cd $(REPO_ROOT)/terraform/envs/bootstrap && tofu destroy -auto-approve -var kube_context=$(CONTEXT) || true + KIND_EXPERIMENTAL_PROVIDER=podman kind delete cluster --name $(CLUSTER) diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..4ce7a8e --- /dev/null +++ b/README.txt @@ -0,0 +1,90 @@ +============================================================================= + Local K8s LLM demo — kind + OpenTofu + vLLM +============================================================================= + +sudo dnf install -y podman git make jq curl tar + +# kind v0.31.0 (node image: kindest/node:v1.35.0, pinned by digest in cluster/kind-config.yaml) +cluster/kind-config.yaml + +# kubectl v1.36.0 +curl -fsSLo /tmp/kubectl \ + https://dl.k8s.io/release/v1.36.0/bin/linux/amd64/kubectl +sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl + +# Helm 4.1.4 +terraform/modules/observability/variables.tf + +# OpenTofu 1.11.6 +terraform/envs/{dev,prod,bootstrap}/versions.tf + + + +# kind runs each k8s "node" as a long-lived podman container. The default pids_limit = 2048 causes ingress-nginx to hit pthread EAGAIN once the control plane warms up. 
Raise it once, then restart podman:
+sudo mkdir -p /etc/containers/containers.conf.d
+printf '[containers]\npids_limit = 0\n' \
+  | sudo tee /etc/containers/containers.conf.d/99-kind-pids.conf
+sudo systemctl restart podman.socket podman 2>/dev/null || true
+
+
+make help  # lists every target, with copy-and-paste commands and URLs
+
+
+
+# Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README.
+
+http://llm.dev.localtest.me:8080
+$curl http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq
+
+http://llm.prod.localtest.me:8080
+$curl http://llm.prod.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq
+
+
+# Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization.
+
+Fire 10 chat requests against dev to populate metrics
+$for i in {1..10}; do curl -s http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' >/dev/null & done; wait
+
+Raw /metrics (vLLM exposes natively)
+$curl -s http://llm.dev.localtest.me:8080/metrics | grep '^vllm:' | head
+
+Request latency p95 (seconds) — via Prometheus
+$curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(vllm:e2e_request_latency_seconds_bucket[5m])))' | jq .data.result
+
+CPU cores in use per vLLM pod (CPU-only inference — no GPU on this stack)
+$curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="llm-dev",pod=~"llm-llm-app.*",container!="",container!="POD"}[5m]))' | jq .data.result
+
+In-flight requests per pod (the same metric the prod HPA scales on)
+$curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (namespace, pod) (vllm:num_requests_running)' | jq .data.result
+
+
+# stretch 1
+deploys agent/agent.py in a container that uses the prod backend to calculate the product of two numbers
+$curl http://agent.localtest.me:8080/ask -H 'Content-Type: application/json' -d '{"question":"what is 17 * 23?"}'  # requires `make up-agent` first
+
+# stretch 2
+horizontal pod autoscaling on total in-flight requests, up to 3 pods
+term1
+$(trap 'kill 0' INT; for i in {1..5}; do \
+    curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
+      -H 'Content-Type: application/json' \
+      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":32}' \
+      >/dev/null &
+  done; wait)
+term2
+$kubectl -n llm-prod get hpa -w  # or: watch -n1 kubectl -n llm-prod get hpa (refreshes faster than -w)
+
+# stretch 3
+image pinning — prod uses repo@sha256: (resolved via scripts/resolve-digests.sh);
+terraform/envs/prod/main.tf
+dev tracks :latest. The chart prefers digest over tag when both are set.
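+
+To double-check what a running pod actually resolved to (tag vs digest), read the image straight off the prod Deployment. Assumes the default release name `llm` from terraform/envs/prod:
+$kubectl -n llm-prod get deploy llm-llm-app -o jsonpath='{.spec.template.spec.containers[0].image}'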
+
+#stretch 4
+smoke test
+charts/llm-app/templates/smoketest-job.yaml reruns after every install or upgrade.
+It only checks that the response contains a "content" field (a health check, not a functional eval) and passes on that.
+
+
+
+
+all stretch goals done except the atlantis/GitOps config; skipped because it needs either a local Atlantis setup or a GitHub-hosted repo.
diff --git a/agent/Dockerfile b/agent/Dockerfile
new file mode 100644
index 0000000..509c3b6
--- /dev/null
+++ b/agent/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.12-slim
+RUN pip install --no-cache-dir 'openai>=1.59.2,<2' 'httpx<0.28'
+WORKDIR /app
+COPY agent.py /app/agent.py
+EXPOSE 8001
+CMD ["python", "/app/agent.py"]
diff --git a/agent/agent.py b/agent/agent.py
new file mode 100644
index 0000000..12ad9d6
--- /dev/null
+++ b/agent/agent.py
@@ -0,0 +1,162 @@
+"""Tool-using agent over an OpenAI-compatible backend.
+
+Uses the standard OpenAI tools API (function calling). vLLM maps this to the
+model's native tool-call template (Qwen here), so small models follow the
+protocol much more reliably than a hand-rolled text convention.
+
+POST /ask {"question": "..."} -> {"answer": "...", "transcript": [...]}
+GET /health -> "ok"
+"""
+import json
+import os
+import re
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+from openai import OpenAI
+
+client = OpenAI(
+    base_url=os.environ["OPENAI_BASE_URL"],
+    api_key=os.environ.get("OPENAI_API_KEY", "sk-local"),
+)
+MODEL = os.environ.get("MODEL", "Qwen2.5-1.5B-Instruct")
+MAX_STEPS = int(os.environ.get("MAX_STEPS", "6"))
+
+SYSTEM = (
+    "You are a careful math assistant. When the user asks any arithmetic question, "
+    "call the 'calc' tool with the exact expression. Do not compute arithmetic in your head. "
+    "After you receive the tool result, give a concise final answer."
+)
+
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "calc",
+            "description": "Evaluate a safe arithmetic expression and return the numeric result.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "expression": {
+                        "type": "string",
+                        "description": "Arithmetic expression using only digits, spaces, and + - * / . ( )",
+                    }
+                },
+                "required": ["expression"],
+            },
+        },
+    }
+]
+
+SAFE_EXPR = re.compile(r"^[\d\s+\-*/().]+$")
+
+
+def calc(expression: str) -> str:
+    if not SAFE_EXPR.fullmatch(expression):
+        return "ERROR: disallowed characters"
+    try:
+        return str(eval(expression, {"__builtins__": {}}, {}))  # noqa: S307
+    except Exception as e:
+        return f"ERROR: {e}"
+
+
+def run_agent(question: str) -> dict:
+    messages = [
+        {"role": "system", "content": SYSTEM},
+        {"role": "user", "content": question},
+    ]
+    transcript: list = []
+
+    for step in range(MAX_STEPS):
+        resp = client.chat.completions.create(
+            model=MODEL,
+            messages=messages,
+            tools=TOOLS,
+            tool_choice="auto",
+            temperature=0.0,
+            max_tokens=256,
+        )
+        msg = resp.choices[0].message
+
+        # Always append the assistant message (with any tool_calls) to history.
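+        # (Why this matters: the OpenAI chat format only accepts a role="tool"
+        # message as a reply to a preceding assistant message whose tool_calls
+        # carry the matching ids; if that parent entry is dropped, the next
+        # completion request is rejected as malformed history.)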
+ assistant_entry = {"role": "assistant", "content": msg.content or ""} + if msg.tool_calls: + assistant_entry["tool_calls"] = [ + { + "id": tc.id, + "type": "function", + "function": {"name": tc.function.name, "arguments": tc.function.arguments}, + } + for tc in msg.tool_calls + ] + messages.append(assistant_entry) + + transcript.append( + { + "step": step + 1, + "content": msg.content, + "tool_calls": [ + {"name": tc.function.name, "arguments": tc.function.arguments} + for tc in (msg.tool_calls or []) + ], + } + ) + + if msg.tool_calls: + for tc in msg.tool_calls: + if tc.function.name != "calc": + result = f"ERROR: unknown tool {tc.function.name}" + else: + try: + args = json.loads(tc.function.arguments) + except json.JSONDecodeError: + result = "ERROR: bad JSON arguments" + else: + result = calc(args.get("expression", "")) + transcript.append({"tool_result": {"name": tc.function.name, "result": result}}) + messages.append( + {"role": "tool", "tool_call_id": tc.id, "content": result} + ) + continue + + # No tool call -> model produced a final answer. + return {"answer": (msg.content or "").strip(), "steps": step + 1, "transcript": transcript} + + return {"answer": None, "steps": MAX_STEPS, "note": "MAX_STEPS reached", "transcript": transcript} + + +class Handler(BaseHTTPRequestHandler): + def do_POST(self): # noqa: N802 + if self.path != "/ask": + self.send_response(404); self.end_headers(); return + n = int(self.headers.get("Content-Length", "0")) + try: + body = json.loads(self.rfile.read(n) or b"{}") + except json.JSONDecodeError: + self.send_response(400); self.end_headers(); self.wfile.write(b'{"error":"invalid json"}'); return + q = body.get("question", "") + try: + result = run_agent(q) + code = 200 + except Exception as e: + result = {"error": str(e), "type": type(e).__name__} + code = 500 + payload = json.dumps(result).encode() + self.send_response(code) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(payload))) + self.end_headers() + self.wfile.write(payload) + + def do_GET(self): # noqa: N802 + if self.path == "/health": + self.send_response(200); self.end_headers(); self.wfile.write(b"ok"); return + self.send_response(404); self.end_headers() + + def log_message(self, fmt, *args): + import sys + print(f"{self.address_string()} {fmt % args}", file=sys.stderr) + + +if __name__ == "__main__": + print(f"agent starting on :8001, model={MODEL}, backend={os.environ['OPENAI_BASE_URL']}") + HTTPServer(("0.0.0.0", 8001), Handler).serve_forever() diff --git a/charts/llm-app/Chart.yaml b/charts/llm-app/Chart.yaml new file mode 100644 index 0000000..e0747df --- /dev/null +++ b/charts/llm-app/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: llm-app +description: OpenAI-compatible LLM server (vLLM CPU) with Prometheus metrics +type: application +version: 0.1.0 +appVersion: "latest" diff --git a/charts/llm-app/templates/_helpers.tpl b/charts/llm-app/templates/_helpers.tpl new file mode 100644 index 0000000..8b104de --- /dev/null +++ b/charts/llm-app/templates/_helpers.tpl @@ -0,0 +1,8 @@ +{{- define "llm-app.fullname" -}} +{{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- define "llm-app.selectorLabels" -}} +app.kubernetes.io/name: {{ .Chart.Name }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml new file mode 100644 index 0000000..12677b5 --- /dev/null +++ 
b/charts/llm-app/templates/deployment.yaml @@ -0,0 +1,76 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-app.fullname" . }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: {{- include "llm-app.selectorLabels" . | nindent 8 }} + spec: + containers: + - name: vllm-server + # Image entrypoint is already `vllm serve`; args start with the model tag. + image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + - {{ .Values.model.name | quote }} + - "--host" + - "0.0.0.0" + - "--port" + - {{ .Values.server.port | quote }} + - "--served-model-name" + - {{ .Values.model.alias | quote }} + - "--max-model-len" + - {{ .Values.model.maxModelLen | quote }} + - "--dtype" + - {{ .Values.model.dtype | quote }} + {{- with .Values.server.extraArgs }} + {{- toYaml . | nindent 12 }} + {{- end }} + env: + - name: HF_HOME + value: /cache/huggingface + - name: VLLM_CPU_KVCACHE_SPACE + value: "2" + {{- if gt (int .Values.server.ompThreads) 0 }} + - name: OMP_NUM_THREADS + value: {{ .Values.server.ompThreads | quote }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.server.port }} + protocol: TCP + readinessProbe: + httpGet: + path: /health + port: http + # vLLM CPU cold-start is ~2 min + HF download on first boot. + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 180 + livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 600 + periodSeconds: 30 + timeoutSeconds: 5 + failureThreshold: 6 + resources: {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: cache + mountPath: /cache + - name: shm + mountPath: /dev/shm + volumes: + - name: cache + emptyDir: + sizeLimit: {{ .Values.modelCache.sizeLimit }} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 1Gi diff --git a/charts/llm-app/templates/ingress.yaml b/charts/llm-app/templates/ingress.yaml new file mode 100644 index 0000000..f3a6ded --- /dev/null +++ b/charts/llm-app/templates/ingress.yaml @@ -0,0 +1,19 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "llm-app.fullname" . }} +spec: + ingressClassName: {{ .Values.ingress.className }} + rules: + - host: {{ .Values.ingress.host | quote }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: {{ include "llm-app.fullname" . }} + port: + number: {{ .Values.service.port }} +{{- end }} diff --git a/charts/llm-app/templates/service.yaml b/charts/llm-app/templates/service.yaml new file mode 100644 index 0000000..6350996 --- /dev/null +++ b/charts/llm-app/templates/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "llm-app.fullname" . }} + labels: {{- include "llm-app.selectorLabels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - name: http + port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + selector: {{- include "llm-app.selectorLabels" . 
| nindent 4 }} diff --git a/charts/llm-app/templates/servicemonitor.yaml b/charts/llm-app/templates/servicemonitor.yaml new file mode 100644 index 0000000..264e766 --- /dev/null +++ b/charts/llm-app/templates/servicemonitor.yaml @@ -0,0 +1,19 @@ +{{- if .Values.monitoring.serviceMonitor.enabled -}} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "llm-app.fullname" . }} + {{- with .Values.monitoring.serviceMonitor.labels }} + labels: {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }} + endpoints: + - port: http + path: /metrics + interval: {{ .Values.monitoring.serviceMonitor.interval }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} +{{- end }} diff --git a/charts/llm-app/templates/smoketest-job.yaml b/charts/llm-app/templates/smoketest-job.yaml new file mode 100644 index 0000000..ac97f33 --- /dev/null +++ b/charts/llm-app/templates/smoketest-job.yaml @@ -0,0 +1,32 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "llm-app.fullname" . }}-smoketest + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "10" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 2 + activeDeadlineSeconds: 240 + ttlSecondsAfterFinished: 600 + template: + spec: + restartPolicy: Never + containers: + - name: curl + image: curlimages/curl:8.10.1 + command: ["/bin/sh", "-euc"] + args: + - | + ENDPOINT="http://{{ include "llm-app.fullname" . }}:{{ .Values.service.port }}" + MODEL={{ .Values.model.alias | quote }} + echo "smoketest: GET $ENDPOINT/v1/models" + out=$(curl -fsS --max-time 60 "$ENDPOINT/v1/models") + echo "$out" | grep -q "\"$MODEL\"" || { echo "FAIL: $MODEL not listed in /v1/models"; echo "$out"; exit 1; } + echo "smoketest: POST $ENDPOINT/v1/chat/completions" + resp=$(curl -fsS --max-time 90 "$ENDPOINT/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Reply with just: pong\"}],\"max_tokens\":8,\"temperature\":0}") + echo "$resp" | grep -q '"content"' || { echo "FAIL: no content in response"; echo "$resp"; exit 1; } + echo "OK" diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml new file mode 100644 index 0000000..96c5c9a --- /dev/null +++ b/charts/llm-app/values.yaml @@ -0,0 +1,51 @@ +replicaCount: 1 + +image: + # vLLM CPU-only image (no CUDA, works on AVX2+). + repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo + tag: latest + # Optional. If set, used in place of `tag` to pin the image by content. + # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh. + digest: "" + pullPolicy: IfNotPresent + +# vLLM pulls model weights from HuggingFace at first boot into the cache volume. +# `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides). +model: + name: "Qwen/Qwen2.5-0.5B-Instruct" + alias: "Qwen2.5-0.5B-Instruct" + maxModelLen: 2048 + dtype: "bfloat16" + +server: + port: 8000 + # OMP threads for the CPU backend; 0 = autodetect. 
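+  # (When > 0 this is rendered into the pod as OMP_NUM_THREADS; see templates/deployment.yaml.)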
+ ompThreads: 0 + extraArgs: [] + +resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2" + memory: "3Gi" + +service: + type: ClusterIP + port: 8000 + +ingress: + enabled: true + className: nginx + host: llm.localtest.me + +monitoring: + serviceMonitor: + enabled: true + interval: 15s + labels: + release: kube-prometheus-stack + +modelCache: + sizeLimit: 10Gi diff --git a/cluster/kind-config.yaml b/cluster/kind-config.yaml new file mode 100644 index 0000000..c0306ce --- /dev/null +++ b/cluster/kind-config.yaml @@ -0,0 +1,21 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +name: llm-local +nodes: + - role: control-plane + image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" + extraPortMappings: + - containerPort: 80 + hostPort: 8080 + protocol: TCP + - containerPort: 443 + hostPort: 8443 + protocol: TCP + - role: worker + image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f diff --git a/goals b/goals new file mode 100644 index 0000000..9bbec82 --- /dev/null +++ b/goals @@ -0,0 +1,18 @@ +### Task +1. Stand up a local K8s cluster with `kind`, `k3d`, or `minikube`. Document exact versions. +2. Write a Helm chart (or use the upstream vLLM/SGLang chart and extend it) that deploys a small open-weights model — e.g. `Qwen2.5-0.5B-Instruct`, `Llama-3.2-1B-Instruct`, or any model that fits on CPU/small GPU. CPU-only inference is acceptable. +3. Wrap it in Terraform (or OpenTofu) using the `helm` and `kubernetes` providers. +4. Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README. +5. Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization. +6. Two environments — `dev` and `prod` — differ by at least: replica count, resource requests/limits, and model choice. Use Terraform workspaces, tfvars, or environment directories; justify your choice. + +Stretch Goals +- Deploy a separate application container containing an agentic system utilizing the deployed vLLM/SGLang as the backend model server. The agent system's use-case is free to you to choose. +- HPA based on a custom metric (e.g. queue depth or tokens/sec) +- Image digest pinning and an `atlantis.yaml` or equivalent GitOps config +- A smoke-test job that runs post-deploy and fails the apply if the endpoint is unhealthy + +You will be assessed on the following criteria: +- the correctness of its output (stochastic functions notwithstanding); +- how reliable, testable, modular and clean your code is; +- other interesting add-ons you can think of. diff --git a/scripts/resolve-digests.sh b/scripts/resolve-digests.sh new file mode 100755 index 0000000..526d463 --- /dev/null +++ b/scripts/resolve-digests.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Resolve an image tag to a content-addressable digest for pinning. +# +# Usage: +# scripts/resolve-digests.sh public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest +# scripts/resolve-digests.sh # default image +# +# Prints three lines: +# repo: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo +# digest: sha256:abc123... +# pin: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo@sha256:abc123... +# +# Paste the digest into the env's terraform (var.image_digest) to pin. 
+set -euo pipefail
+
+IMG="${1:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest}"
+
+engine=""
+if command -v podman >/dev/null 2>&1; then engine=podman
+elif command -v docker >/dev/null 2>&1; then engine=docker
+else
+  echo "need podman or docker on PATH" >&2; exit 1
+fi
+
+"$engine" pull --quiet "$IMG" >/dev/null
+# podman exposes the digest directly; docker only lists repo@digest pairs in .RepoDigests.
+if [ "$engine" = podman ]; then
+  digest="$("$engine" image inspect "$IMG" --format '{{.Digest}}')"
+else
+  digest="$("$engine" image inspect "$IMG" --format '{{index .RepoDigests 0}}')"
+  digest="${digest##*@}"
+fi
+repo="${IMG%:*}"
+
+printf 'repo: %s\n' "$repo"
+printf 'digest: %s\n' "$digest"
+printf 'pin: %s@%s\n' "$repo" "$digest"
diff --git a/terraform/envs/agent/.terraform.lock.hcl b/terraform/envs/agent/.terraform.lock.hcl
new file mode 100644
index 0000000..605df33
--- /dev/null
+++ b/terraform/envs/agent/.terraform.lock.hcl
@@ -0,0 +1,19 @@
+# This file is maintained automatically by "tofu init".
+# Manual edits may be lost in future updates.
+
+provider "registry.opentofu.org/hashicorp/kubernetes" {
+  version     = "2.38.0"
+  constraints = "~> 2.31"
+  hashes = [
+    "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=",
+    "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc",
+    "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c",
+    "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337",
+    "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e",
+    "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1",
+    "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a",
+    "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc",
+    "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584",
+    "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f",
+  ]
+}
diff --git a/terraform/envs/agent/backend.tf b/terraform/envs/agent/backend.tf
new file mode 100644
index 0000000..3c533e6
--- /dev/null
+++ b/terraform/envs/agent/backend.tf
@@ -0,0 +1,5 @@
+terraform {
+  backend "local" {
+    path = "terraform.tfstate"
+  }
+}
diff --git a/terraform/envs/agent/main.tf b/terraform/envs/agent/main.tf
new file mode 100644
index 0000000..122eaca
--- /dev/null
+++ b/terraform/envs/agent/main.tf
@@ -0,0 +1,27 @@
+provider "kubernetes" {
+  config_path    = pathexpand(var.kubeconfig)
+  config_context = var.kube_context
+}
+
+module "agent" {
+  source = "../../modules/agent"
+
+  namespace         = "agent"
+  agent_source_path = var.agent_source_path
+
+  # Point at the prod LLM. `svc.cluster.local` resolves from any namespace.
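+  # (Name breakdown: <release>-<chart> = llm-llm-app, namespace llm-prod, service port 8000;
+  # see charts/llm-app/templates/_helpers.tpl and terraform/envs/prod.)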
+ llm_service_url = "http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1" + model_alias = "Qwen2.5-1.5B-Instruct" + + ingress_host = "agent.localtest.me" +} + +output "ingress_host" { value = module.agent.ingress_host } +output "service_dns" { value = module.agent.service_dns } +output "curl_example" { + value = <<-EOT + curl -s http://${module.agent.ingress_host}:8080/ask \ + -H 'Content-Type: application/json' \ + -d '{"question":"what is 123 * 47?"}' + EOT +} diff --git a/terraform/envs/agent/variables.tf b/terraform/envs/agent/variables.tf new file mode 100644 index 0000000..bf005b9 --- /dev/null +++ b/terraform/envs/agent/variables.tf @@ -0,0 +1,14 @@ +variable "kubeconfig" { + type = string + default = "~/.kube/config" +} + +variable "kube_context" { + type = string + default = "kind-llm-local" +} + +variable "agent_source_path" { + type = string + description = "Absolute path to agent/agent.py" +} diff --git a/terraform/envs/agent/versions.tf b/terraform/envs/agent/versions.tf new file mode 100644 index 0000000..69cf77e --- /dev/null +++ b/terraform/envs/agent/versions.tf @@ -0,0 +1,6 @@ +terraform { + required_version = ">= 1.6.0" + required_providers { + kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" } + } +} diff --git a/terraform/envs/bootstrap/.terraform.lock.hcl b/terraform/envs/bootstrap/.terraform.lock.hcl new file mode 100644 index 0000000..baa0088 --- /dev/null +++ b/terraform/envs/bootstrap/.terraform.lock.hcl @@ -0,0 +1,37 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/hashicorp/helm" { + version = "2.17.0" + constraints = "~> 2.17" + hashes = [ + "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=", + "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b", + "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a", + "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0", + "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe", + "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4", + "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25", + "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d", + "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978", + "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb", + "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.38.0" + constraints = "~> 2.31" + hashes = [ + "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", + "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", + "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", + "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", + "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", + "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", + "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", + "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", + "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", + "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", + ] +} diff --git a/terraform/envs/bootstrap/backend.tf b/terraform/envs/bootstrap/backend.tf new file mode 100644 index 0000000..3c533e6 --- /dev/null +++ b/terraform/envs/bootstrap/backend.tf @@ -0,0 +1,5 @@ +terraform { + 
backend "local" { + path = "terraform.tfstate" + } +} diff --git a/terraform/envs/bootstrap/main.tf b/terraform/envs/bootstrap/main.tf new file mode 100644 index 0000000..07bf04d --- /dev/null +++ b/terraform/envs/bootstrap/main.tf @@ -0,0 +1,25 @@ +provider "kubernetes" { + config_path = pathexpand(var.kubeconfig) + config_context = var.kube_context +} + +provider "helm" { + kubernetes { + config_path = pathexpand(var.kubeconfig) + config_context = var.kube_context + } +} + +module "observability" { + source = "../../modules/observability" + namespace = "monitoring" + grafana_admin_password = var.grafana_admin_password +} + +output "grafana" { + value = module.observability.grafana_service +} + +output "prometheus" { + value = module.observability.prometheus_service +} diff --git a/terraform/envs/bootstrap/variables.tf b/terraform/envs/bootstrap/variables.tf new file mode 100644 index 0000000..220bed3 --- /dev/null +++ b/terraform/envs/bootstrap/variables.tf @@ -0,0 +1,15 @@ +variable "kubeconfig" { + type = string + default = "~/.kube/config" +} + +variable "kube_context" { + type = string + default = "kind-llm-local" +} + +variable "grafana_admin_password" { + type = string + default = "admin" + sensitive = true +} diff --git a/terraform/envs/bootstrap/versions.tf b/terraform/envs/bootstrap/versions.tf new file mode 100644 index 0000000..0d7f77b --- /dev/null +++ b/terraform/envs/bootstrap/versions.tf @@ -0,0 +1,7 @@ +terraform { + required_version = ">= 1.6.0" + required_providers { + helm = { source = "hashicorp/helm", version = "~> 2.17" } + kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" } + } +} diff --git a/terraform/envs/dev/.terraform.lock.hcl b/terraform/envs/dev/.terraform.lock.hcl new file mode 100644 index 0000000..09902a1 --- /dev/null +++ b/terraform/envs/dev/.terraform.lock.hcl @@ -0,0 +1,54 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. 
+ +provider "registry.opentofu.org/hashicorp/helm" { + version = "2.17.0" + constraints = "~> 2.17" + hashes = [ + "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=", + "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b", + "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a", + "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0", + "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe", + "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4", + "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25", + "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d", + "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978", + "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb", + "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.38.0" + constraints = "~> 2.31" + hashes = [ + "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", + "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", + "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", + "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", + "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", + "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", + "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", + "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", + "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", + "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.8.1" + constraints = "~> 3.6" + hashes = [ + "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=", + "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8", + "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476", + "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7", + "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3", + "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0", + "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1", + "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f", + "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9", + "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff", + ] +} diff --git a/terraform/envs/dev/backend.tf b/terraform/envs/dev/backend.tf new file mode 100644 index 0000000..3c533e6 --- /dev/null +++ b/terraform/envs/dev/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "local" { + path = "terraform.tfstate" + } +} diff --git a/terraform/envs/dev/main.tf b/terraform/envs/dev/main.tf new file mode 100644 index 0000000..8e1b882 --- /dev/null +++ b/terraform/envs/dev/main.tf @@ -0,0 +1,49 @@ +provider "kubernetes" { + config_path = pathexpand(var.kubeconfig) + config_context = var.kube_context +} + +provider "helm" { + kubernetes { + config_path = pathexpand(var.kubeconfig) + config_context = var.kube_context + } +} + +locals { + env = "dev" +} + +module "llm" { + source = "../../modules/llm" + + release_name = "llm" + namespace = "llm-${local.env}" + chart_path = var.chart_path + + replicas = 2 + + model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_alias = "Qwen2.5-0.5B-Instruct" + max_model_len = 2048 + dtype = "bfloat16" + 
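+  # Thread pinning below is a local tuning guess for the kind worker, not an upstream recommendation.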
omp_threads = 4 + + resources = { + requests = { cpu = "1", memory = "2Gi" } + limits = { cpu = "4", memory = "6Gi" } + } + + ingress_host = "llm.dev.localtest.me" + image_tag = "latest" +} + +output "ingress_host" { value = module.llm.ingress_host } +output "service_dns" { value = module.llm.service_dns } +output "curl_example" { + value = <<-EOT + curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}' + EOT +} diff --git a/terraform/envs/dev/variables.tf b/terraform/envs/dev/variables.tf new file mode 100644 index 0000000..9f1b697 --- /dev/null +++ b/terraform/envs/dev/variables.tf @@ -0,0 +1,14 @@ +variable "kubeconfig" { + type = string + default = "~/.kube/config" +} + +variable "kube_context" { + type = string + default = "kind-llm-local" +} + +variable "chart_path" { + type = string + description = "Absolute path to charts/llm-app" +} diff --git a/terraform/envs/dev/versions.tf b/terraform/envs/dev/versions.tf new file mode 100644 index 0000000..6a87674 --- /dev/null +++ b/terraform/envs/dev/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_version = ">= 1.6.0" + required_providers { + helm = { source = "hashicorp/helm", version = "~> 2.17" } + kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" } + random = { source = "hashicorp/random", version = "~> 3.6" } + } +} diff --git a/terraform/envs/prod/.terraform.lock.hcl b/terraform/envs/prod/.terraform.lock.hcl new file mode 100644 index 0000000..09902a1 --- /dev/null +++ b/terraform/envs/prod/.terraform.lock.hcl @@ -0,0 +1,54 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/hashicorp/helm" { + version = "2.17.0" + constraints = "~> 2.17" + hashes = [ + "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=", + "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b", + "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a", + "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0", + "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe", + "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4", + "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25", + "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d", + "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978", + "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb", + "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.38.0" + constraints = "~> 2.31" + hashes = [ + "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", + "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", + "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", + "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", + "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", + "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", + "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", + "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", + "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", + "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", + ] +} + +provider 
"registry.opentofu.org/hashicorp/random" { + version = "3.8.1" + constraints = "~> 3.6" + hashes = [ + "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=", + "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8", + "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476", + "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7", + "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3", + "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0", + "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1", + "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f", + "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9", + "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff", + ] +} diff --git a/terraform/envs/prod/backend.tf b/terraform/envs/prod/backend.tf new file mode 100644 index 0000000..3c533e6 --- /dev/null +++ b/terraform/envs/prod/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "local" { + path = "terraform.tfstate" + } +} diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf new file mode 100644 index 0000000..04db31d --- /dev/null +++ b/terraform/envs/prod/main.tf @@ -0,0 +1,70 @@ +provider "kubernetes" { + config_path = pathexpand(var.kubeconfig) + config_context = var.kube_context +} + +provider "helm" { + kubernetes { + config_path = pathexpand(var.kubeconfig) + config_context = var.kube_context + } +} + +locals { + env = "prod" +} + +module "llm" { + source = "../../modules/llm" + + release_name = "llm" + namespace = "llm-${local.env}" + chart_path = var.chart_path + + replicas = 1 + + model_name = "Qwen/Qwen2.5-1.5B-Instruct" + model_alias = "Qwen2.5-1.5B-Instruct" + max_model_len = 4096 + dtype = "bfloat16" + omp_threads = 6 + + resources = { + requests = { cpu = "2", memory = "4Gi" } + limits = { cpu = "6", memory = "8Gi" } + } + + ingress_host = "llm.prod.localtest.me" + image_tag = "latest" + # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64. + # Per-arch digest — re-resolve on a different arch or after an upstream tag move. + # Dev intentionally runs on `:latest` so new fixes flow in without a PR. + image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e" + + # Enable OpenAI tool-calling so the agent's function-call path works. + # Qwen 2.5 uses hermes-style tool parsing in vLLM. + extra_args = [ + "--enable-auto-tool-choice", + "--tool-call-parser", "hermes", + ] + + hpa = { + enabled = true + min_replicas = 1 + max_replicas = 3 + # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight + # requests. Scale up when >50% of pods are actively serving. 
+ metric_name = "vllm:num_requests_running" + target_average_value = "500m" + } +} + +output "ingress_host" { value = module.llm.ingress_host } +output "service_dns" { value = module.llm.service_dns } +output "curl_example" { + value = <<-EOT + curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}' + EOT +} diff --git a/terraform/envs/prod/variables.tf b/terraform/envs/prod/variables.tf new file mode 100644 index 0000000..9f1b697 --- /dev/null +++ b/terraform/envs/prod/variables.tf @@ -0,0 +1,14 @@ +variable "kubeconfig" { + type = string + default = "~/.kube/config" +} + +variable "kube_context" { + type = string + default = "kind-llm-local" +} + +variable "chart_path" { + type = string + description = "Absolute path to charts/llm-app" +} diff --git a/terraform/envs/prod/versions.tf b/terraform/envs/prod/versions.tf new file mode 100644 index 0000000..6a87674 --- /dev/null +++ b/terraform/envs/prod/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_version = ">= 1.6.0" + required_providers { + helm = { source = "hashicorp/helm", version = "~> 2.17" } + kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" } + random = { source = "hashicorp/random", version = "~> 3.6" } + } +} diff --git a/terraform/modules/agent/main.tf b/terraform/modules/agent/main.tf new file mode 100644 index 0000000..f53acdc --- /dev/null +++ b/terraform/modules/agent/main.tf @@ -0,0 +1,114 @@ +resource "kubernetes_namespace_v1" "agent" { + metadata { + name = var.namespace + labels = { + "app.kubernetes.io/part-of" = "llm-platform" + } + } +} + +resource "kubernetes_deployment_v1" "agent" { + metadata { + name = "agent" + namespace = kubernetes_namespace_v1.agent.metadata[0].name + labels = { app = "agent" } + } + spec { + replicas = 1 + selector { + match_labels = { app = "agent" } + } + template { + metadata { + labels = { app = "agent" } + annotations = { + # Bounce the pod when agent.py changes on disk, even if image tag is unchanged. 
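+          # (sha256 of the file truncated to 16 hex chars: any edit changes the
+          # annotation, which changes the pod template and forces a rollout.)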
+ "checksum/code" = substr(sha256(file(var.agent_source_path)), 0, 16) + } + } + spec { + container { + name = "agent" + image = var.agent_image + image_pull_policy = "IfNotPresent" + env { + name = "OPENAI_BASE_URL" + value = var.llm_service_url + } + env { + name = "MODEL" + value = var.model_alias + } + port { + name = "http" + container_port = 8001 + } + readiness_probe { + http_get { + path = "/health" + port = "http" + } + initial_delay_seconds = 3 + period_seconds = 5 + failure_threshold = 10 + } + liveness_probe { + http_get { + path = "/health" + port = "http" + } + initial_delay_seconds = 30 + period_seconds = 30 + } + resources { + requests = { cpu = "100m", memory = "128Mi" } + limits = { cpu = "1", memory = "512Mi" } + } + } + } + } + } +} + +resource "kubernetes_service_v1" "agent" { + metadata { + name = "agent" + namespace = kubernetes_namespace_v1.agent.metadata[0].name + labels = { app = "agent" } + } + spec { + selector = { app = "agent" } + port { + name = "http" + port = 8001 + target_port = "http" + } + } +} + +resource "kubernetes_ingress_v1" "agent" { + metadata { + name = "agent" + namespace = kubernetes_namespace_v1.agent.metadata[0].name + } + spec { + ingress_class_name = var.ingress_class + rule { + host = var.ingress_host + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = kubernetes_service_v1.agent.metadata[0].name + port { + number = 8001 + } + } + } + } + } + } + } +} diff --git a/terraform/modules/agent/outputs.tf b/terraform/modules/agent/outputs.tf new file mode 100644 index 0000000..ac9932b --- /dev/null +++ b/terraform/modules/agent/outputs.tf @@ -0,0 +1,11 @@ +output "service_dns" { + value = "${kubernetes_service_v1.agent.metadata[0].name}.${kubernetes_namespace_v1.agent.metadata[0].name}.svc.cluster.local" +} + +output "ingress_host" { + value = var.ingress_host +} + +output "namespace" { + value = kubernetes_namespace_v1.agent.metadata[0].name +} diff --git a/terraform/modules/agent/variables.tf b/terraform/modules/agent/variables.tf new file mode 100644 index 0000000..6f525ee --- /dev/null +++ b/terraform/modules/agent/variables.tf @@ -0,0 +1,33 @@ +variable "namespace" { + type = string +} + +variable "agent_source_path" { + type = string + description = "Absolute path to agent/agent.py. Used only to bounce pods on code change." +} + +variable "agent_image" { + type = string + default = "localhost/agent:0.1.0" + description = "Pre-built agent image. Must be loaded into kind with `make agent-build`." +} + +variable "llm_service_url" { + type = string + description = "OpenAI-compatible base URL, e.g. 
http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1" +} + +variable "model_alias" { + type = string + default = "Qwen2.5-1.5B-Instruct" +} + +variable "ingress_host" { + type = string +} + +variable "ingress_class" { + type = string + default = "nginx" +} diff --git a/terraform/modules/agent/versions.tf b/terraform/modules/agent/versions.tf new file mode 100644 index 0000000..4242705 --- /dev/null +++ b/terraform/modules/agent/versions.tf @@ -0,0 +1,5 @@ +terraform { + required_providers { + kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.31" } + } +} diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf new file mode 100644 index 0000000..cd22019 --- /dev/null +++ b/terraform/modules/llm/main.tf @@ -0,0 +1,99 @@ +resource "kubernetes_namespace_v1" "this" { + metadata { + name = var.namespace + labels = { + "app.kubernetes.io/part-of" = "llm-platform" + } + } +} + +resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" { + count = var.hpa.enabled ? 1 : 0 + + metadata { + name = "${var.release_name}-llm-app" + namespace = kubernetes_namespace_v1.this.metadata[0].name + } + spec { + scale_target_ref { + api_version = "apps/v1" + kind = "Deployment" + name = "${var.release_name}-llm-app" + } + min_replicas = var.hpa.min_replicas + max_replicas = var.hpa.max_replicas + + metric { + type = "Pods" + pods { + metric { + name = var.hpa.metric_name + } + target { + type = "AverageValue" + average_value = var.hpa.target_average_value + } + } + } + } + + depends_on = [helm_release.llm] +} + +resource "helm_release" "llm" { + name = var.release_name + chart = var.chart_path + namespace = kubernetes_namespace_v1.this.metadata[0].name + create_namespace = false + atomic = false + wait = true + timeout = 1800 + + values = [ + yamlencode({ + replicaCount = var.replicas + + image = { + repository = var.image_repository + tag = var.image_tag + digest = var.image_digest + pullPolicy = "IfNotPresent" + } + + model = { + name = var.model_name + alias = var.model_alias + maxModelLen = var.max_model_len + dtype = var.dtype + } + + server = { + port = 8000 + ompThreads = var.omp_threads + extraArgs = var.extra_args + } + + resources = var.resources + + ingress = { + enabled = true + className = var.ingress_class + host = var.ingress_host + } + + monitoring = { + serviceMonitor = { + enabled = true + interval = "15s" + labels = { + release = var.service_monitor_release_label + } + } + } + + modelCache = { + sizeLimit = var.model_cache_size + } + }), + ] +} diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf new file mode 100644 index 0000000..a953e73 --- /dev/null +++ b/terraform/modules/llm/outputs.tf @@ -0,0 +1,12 @@ +output "service_dns" { + value = "${var.release_name}-llm-app.${var.namespace}.svc.cluster.local" + description = "In-cluster DNS name for the LLM Service." +} + +output "ingress_host" { + value = var.ingress_host +} + +output "namespace" { + value = kubernetes_namespace_v1.this.metadata[0].name +} diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf new file mode 100644 index 0000000..3a7d8f7 --- /dev/null +++ b/terraform/modules/llm/variables.tf @@ -0,0 +1,112 @@ +variable "release_name" { + type = string + description = "Helm release name." +} + +variable "namespace" { + type = string + description = "Kubernetes namespace to deploy into." +} + +variable "chart_path" { + type = string + description = "Path to the local llm-app chart." 
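+  # (The env roots supply this; the Makefile passes -var chart_path=$(CHART_PATH).)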
+} + +variable "replicas" { + type = number + default = 1 +} + +variable "model_name" { + type = string + description = "HuggingFace repo id, passed as vLLM model_tag (positional)." +} + +variable "model_alias" { + type = string + description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)." +} + +variable "max_model_len" { + type = number + default = 2048 +} + +variable "dtype" { + type = string + default = "bfloat16" +} + +variable "omp_threads" { + type = number + default = 0 + description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect." +} + +variable "extra_args" { + type = list(string) + default = [] + description = "Extra CLI args passed to `vllm serve`, appended after the stock set." +} + +variable "resources" { + type = object({ + requests = object({ cpu = string, memory = string }) + limits = object({ cpu = string, memory = string }) + }) +} + +variable "ingress_host" { + type = string +} + +variable "ingress_class" { + type = string + default = "nginx" +} + +variable "image_repository" { + type = string + default = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo" +} + +variable "image_tag" { + type = string + default = "latest" + description = "Used only when image_digest is empty." +} + +variable "image_digest" { + type = string + default = "" + description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag." +} + +variable "service_monitor_release_label" { + type = string + default = "kube-prometheus-stack" + description = "Must match the release label the Prometheus Operator selects on." +} + +variable "model_cache_size" { + type = string + default = "10Gi" +} + +variable "hpa" { + type = object({ + enabled = bool + min_replicas = number + max_replicas = number + metric_name = string + target_average_value = string + }) + default = { + enabled = false + min_replicas = 1 + max_replicas = 3 + metric_name = "vllm:num_requests_running" + target_average_value = "500m" + } +} diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf new file mode 100644 index 0000000..2f88f2e --- /dev/null +++ b/terraform/modules/observability/main.tf @@ -0,0 +1,156 @@ +resource "kubernetes_namespace_v1" "monitoring" { + metadata { + name = var.namespace + } +} + +resource "kubernetes_namespace_v1" "ingress" { + metadata { + name = "ingress-nginx" + } +} + +resource "helm_release" "ingress_nginx" { + name = "ingress-nginx" + repository = "https://kubernetes.github.io/ingress-nginx" + chart = "ingress-nginx" + version = var.ingress_nginx_version + namespace = kubernetes_namespace_v1.ingress.metadata[0].name + wait = true + timeout = 300 + + values = [ + yamlencode({ + controller = { + hostPort = { enabled = true, ports = { http = 80, https = 443 } } + service = { type = "NodePort" } + nodeSelector = { + "ingress-ready" = "true" + } + tolerations = [ + { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" }, + { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" }, + ] + publishService = { enabled = false } + admissionWebhooks = { enabled = false } # speeds up kind cluster installs + # Cap worker_processes so nginx doesn't try to spawn 14 threads under + # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it + # sometimes hits pthread EAGAIN and workers die without respawn. 
+        config = {
+          "worker-processes" = "4"
+        }
+      }
+    }),
+  ]
+}
+
+resource "helm_release" "kps" {
+  name       = "kube-prometheus-stack"
+  repository = "https://prometheus-community.github.io/helm-charts"
+  chart      = "kube-prometheus-stack"
+  version    = var.kps_version
+  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
+  wait       = true
+  timeout    = 600
+
+  values = [
+    yamlencode({
+      fullnameOverride = "kps"
+      prometheus = {
+        prometheusSpec = {
+          # Select ServiceMonitors/PodMonitors/rules from all namespaces regardless
+          # of label, instead of the chart's default, which only matches objects
+          # labeled release=kube-prometheus-stack.
+          serviceMonitorSelectorNilUsesHelmValues = false
+          podMonitorSelectorNilUsesHelmValues     = false
+          ruleSelectorNilUsesHelmValues           = false
+          retention                               = "2d"
+          resources = {
+            requests = { cpu = "100m", memory = "400Mi" }
+            limits   = { memory = "1Gi" }
+          }
+        }
+        ingress = {
+          enabled          = true
+          ingressClassName = "nginx"
+          hosts            = ["prom.localtest.me"]
+        }
+      }
+      alertmanager = { enabled = false }
+      grafana = {
+        adminPassword = var.grafana_admin_password
+        sidecar = {
+          dashboards = {
+            enabled         = true
+            label           = "grafana_dashboard"
+            labelValue      = "1"
+            searchNamespace = "ALL"
+          }
+        }
+        service = { type = "ClusterIP" }
+        ingress = {
+          enabled          = true
+          ingressClassName = "nginx"
+          hosts            = ["grafana.localtest.me"]
+        }
+      }
+    }),
+  ]
+}
+
+resource "helm_release" "prometheus_adapter" {
+  name       = "prometheus-adapter"
+  repository = "https://prometheus-community.github.io/helm-charts"
+  chart      = "prometheus-adapter"
+  version    = var.prometheus_adapter_version
+  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
+  wait       = true
+  timeout    = 300
+
+  values = [
+    yamlencode({
+      prometheus = {
+        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
+        port = 9090
+      }
+      rules = {
+        default = false
+        custom = [
+          {
+            # In-flight request count per pod; basis for autoscaling.
+            # vLLM exposes this as a gauge per model-engine.
+            seriesQuery = "vllm:num_requests_running{namespace!=\"\",pod!=\"\"}"
+            resources = {
+              overrides = {
+                namespace = { resource = "namespace" }
+                pod       = { resource = "pod" }
+              }
+            }
+            name = {
+              matches = "^vllm:num_requests_running$"
+              as      = "vllm:num_requests_running"
+            }
+            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
+          },
+          {
+            # Waiting (queued) requests per pod — an alternative scale signal.
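+            # (To autoscale on queue depth instead, point var.hpa.metric_name in
+            # terraform/envs/prod at vllm:num_requests_waiting.)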
+ seriesQuery = "vllm:num_requests_waiting{namespace!=\"\",pod!=\"\"}" + resources = { + overrides = { + namespace = { resource = "namespace" } + pod = { resource = "pod" } + } + } + name = { + matches = "^vllm:num_requests_waiting$" + as = "vllm:num_requests_waiting" + } + metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)" + }, + ] + } + }), + ] + + depends_on = [helm_release.kps] +} + diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf new file mode 100644 index 0000000..06a507d --- /dev/null +++ b/terraform/modules/observability/outputs.tf @@ -0,0 +1,11 @@ +output "namespace" { + value = kubernetes_namespace_v1.monitoring.metadata[0].name +} + +output "grafana_service" { + value = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local" +} + +output "prometheus_service" { + value = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local" +} diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf new file mode 100644 index 0000000..6aeaca3 --- /dev/null +++ b/terraform/modules/observability/variables.tf @@ -0,0 +1,27 @@ +variable "namespace" { + type = string + default = "monitoring" +} + +variable "kps_version" { + type = string + default = "65.5.1" + description = "kube-prometheus-stack chart version." +} + +variable "ingress_nginx_version" { + type = string + default = "4.11.3" + description = "ingress-nginx chart version." +} + +variable "grafana_admin_password" { + type = string + default = "admin" + sensitive = true +} + +variable "prometheus_adapter_version" { + type = string + default = "4.11.0" +} diff --git a/tests/smoke.sh b/tests/smoke.sh new file mode 100755 index 0000000..a5ef23d --- /dev/null +++ b/tests/smoke.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Smoke test for the OpenAI-compatible LLM endpoint. +# Usage: +# ENDPOINT=http://llm.dev.localtest.me:8080 MODEL=Qwen2.5-0.5B-Instruct ./tests/smoke.sh +set -euo pipefail + +ENDPOINT="${ENDPOINT:-http://llm.dev.localtest.me:8080}" +MODEL="${MODEL:-Qwen2.5-0.5B-Instruct}" +TIMEOUT="${TIMEOUT:-120}" + +say() { printf '\033[1;34m==>\033[0m %s\n' "$*"; } +fail() { printf '\033[1;31mFAIL\033[0m %s\n' "$*" >&2; exit 1; } + +say "Endpoint: $ENDPOINT" +say "Model: $MODEL" + +say "GET /v1/models" +models_json="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/models")" || fail "/v1/models unreachable" +echo "$models_json" | grep -q "$MODEL" || fail "/v1/models does not list $MODEL" + +say "POST /v1/chat/completions" +resp="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/chat/completions" \ + -H 'Content-Type: application/json' \ + -d "$(cat <