diff options
47 files changed, 1718 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..747a966 --- /dev/null +++ b/.gitignore | |||
| @@ -0,0 +1,29 @@ | |||
| 1 | # Terraform | ||
| 2 | .terraform/ | ||
| 3 | terraform.tfstate | ||
| 4 | terraform.tfstate.* | ||
| 5 | *.tfvars | ||
| 6 | *.tfvars.json | ||
| 7 | crash.log | ||
| 8 | crash.*.log | ||
| 9 | override.tf | ||
| 10 | override.tf.json | ||
| 11 | *_override.tf | ||
| 12 | *_override.tf.json | ||
| 13 | |||
| 14 | # Python | ||
| 15 | __pycache__/ | ||
| 16 | *.py[cod] | ||
| 17 | *.egg-info/ | ||
| 18 | .pytest_cache/ | ||
| 19 | .venv/ | ||
| 20 | venv/ | ||
| 21 | |||
| 22 | # OS | ||
| 23 | .DS_Store | ||
| 24 | Thumbs.db | ||
| 25 | |||
| 26 | # Editors | ||
| 27 | .idea/ | ||
| 28 | .vscode/ | ||
| 29 | *.swp | ||
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..15c0031 --- /dev/null +++ b/Makefile | |||
| @@ -0,0 +1,78 @@ | |||
| 1 | REPO_ROOT := $(abspath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) | ||
| 2 | CHART_PATH := $(REPO_ROOT)/charts/llm-app | ||
| 3 | AGENT_SRC := $(REPO_ROOT)/agent/agent.py | ||
| 4 | AGENT_IMG := localhost/agent:0.1.0 | ||
| 5 | CLUSTER := llm-local | ||
| 6 | CONTEXT := kind-$(CLUSTER) | ||
| 7 | |||
| 8 | export KIND_EXPERIMENTAL_PROVIDER=podman | ||
| 9 | |||
| 10 | .PHONY: help | ||
| 11 | help: | ||
| 12 | @echo "Targets:" | ||
| 13 | @echo " up-dev deploy dev LLM (Qwen2.5-0.5B, 2 replicas)" | ||
| 14 | @echo " up-prod deploy prod LLM (Qwen2.5-1.5B, 1 replica + HPA 1->3)" | ||
| 15 | @echo " up-agent up-prod + tool-using agent" | ||
| 16 | @echo " ask Q='...' POST a question to the agent" | ||
| 17 | @echo " down destroy everything + delete kind cluster" | ||
| 18 | @echo "" | ||
| 19 | @echo "URLs (after up-dev/up-prod):" | ||
| 20 | @echo " Grafana http://grafana.localtest.me:8080 (admin/admin)" | ||
| 21 | @echo " curl -f http://grafana.localtest.me:8080/api/health" | ||
| 22 | @echo " Prometheus http://prom.localtest.me:8080" | ||
| 23 | @echo " curl -f http://prom.localtest.me:8080/-/healthy" | ||
| 24 | @echo "" | ||
| 25 | |||
| 26 | .PHONY: up-dev | ||
| 27 | up-dev: | ||
| 28 | @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml | ||
| 29 | cd $(REPO_ROOT)/terraform/envs/bootstrap && \ | ||
| 30 | tofu init -upgrade && \ | ||
| 31 | tofu apply -auto-approve \ | ||
| 32 | -var kube_context=$(CONTEXT) | ||
| 33 | cd $(REPO_ROOT)/terraform/envs/dev && \ | ||
| 34 | tofu init -upgrade && \ | ||
| 35 | tofu apply -auto-approve \ | ||
| 36 | -var kube_context=$(CONTEXT) \ | ||
| 37 | -var chart_path=$(CHART_PATH) | ||
| 38 | |||
| 39 | .PHONY: up-prod | ||
| 40 | up-prod: | ||
| 41 | @kind get clusters | grep -qx $(CLUSTER) || kind create cluster --config $(REPO_ROOT)/cluster/kind-config.yaml | ||
| 42 | cd $(REPO_ROOT)/terraform/envs/bootstrap && \ | ||
| 43 | tofu init -upgrade && \ | ||
| 44 | tofu apply -auto-approve \ | ||
| 45 | -var kube_context=$(CONTEXT) | ||
| 46 | cd $(REPO_ROOT)/terraform/envs/prod && \ | ||
| 47 | tofu init -upgrade && \ | ||
| 48 | tofu apply -auto-approve \ | ||
| 49 | -var kube_context=$(CONTEXT) \ | ||
| 50 | -var chart_path=$(CHART_PATH) | ||
| 51 | |||
| 52 | .PHONY: up-agent | ||
| 53 | up-agent: up-prod | ||
| 54 | podman build -t $(AGENT_IMG) $(REPO_ROOT)/agent/ | ||
| 55 | @tmp=$$(mktemp -t agent-XXXXXX.tar); \ | ||
| 56 | podman save $(AGENT_IMG) -o $$tmp && \ | ||
| 57 | kind load image-archive $$tmp --name $(CLUSTER) && \ | ||
| 58 | rm -f $$tmp | ||
| 59 | cd $(REPO_ROOT)/terraform/envs/agent && \ | ||
| 60 | tofu init -upgrade && \ | ||
| 61 | tofu apply -auto-approve \ | ||
| 62 | -var kube_context=$(CONTEXT) \ | ||
| 63 | -var agent_source_path=$(AGENT_SRC) | ||
| 64 | |||
| 65 | .PHONY: ask | ||
| 66 | ask: | ||
| 67 | @if [ -z "$(Q)" ]; then echo "usage: make ask Q='what is 17*23?'"; exit 1; fi | ||
| 68 | curl -s http://agent.localtest.me:8080/ask \ | ||
| 69 | -H 'Content-Type: application/json' \ | ||
| 70 | -d "$(shell printf '{"question":"%s"}' "$(Q)")" | python3 -m json.tool | ||
| 71 | |||
| 72 | .PHONY: down | ||
| 73 | down: | ||
| 74 | -cd $(REPO_ROOT)/terraform/envs/agent && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var agent_source_path=$(AGENT_SRC) || true | ||
| 75 | -cd $(REPO_ROOT)/terraform/envs/prod && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true | ||
| 76 | -cd $(REPO_ROOT)/terraform/envs/dev && tofu destroy -auto-approve -var kube_context=$(CONTEXT) -var chart_path=$(CHART_PATH) || true | ||
| 77 | -cd $(REPO_ROOT)/terraform/envs/bootstrap && tofu destroy -auto-approve -var kube_context=$(CONTEXT) || true | ||
| 78 | KIND_EXPERIMENTAL_PROVIDER=podman kind delete cluster --name $(CLUSTER) | ||
diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..4ce7a8e --- /dev/null +++ b/README.txt | |||
| @@ -0,0 +1,90 @@ | |||
| 1 | ============================================================================= | ||
| 2 | Local K8s LLM demo — kind + OpenTofu + vLLM | ||
| 3 | ============================================================================= | ||
| 4 | |||
| 5 | sudo dnf install -y podman git make jq curl tar | ||
| 6 | |||
| 7 | # kind v0.31.0 (node image: kindest/node:v1.35.0, pinned by digest in cluster/kind-config.yaml) | ||
| 8 | cluster/kind-config.yaml | ||
| 9 | |||
| 10 | # kubectl v1.36.0 | ||
| 11 | curl -fsSLo /tmp/kubectl \ | ||
| 12 | https://dl.k8s.io/release/v1.36.0/bin/linux/amd64/kubectl | ||
| 13 | sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl | ||
| 14 | |||
| 15 | # Helm 4.1.4 | ||
| 16 | terraform/modules/observability/variables.tf | ||
| 17 | |||
| 18 | # OpenTofu 1.11.6 | ||
| 19 | terraform/envs/{dev,prod,bootstrap}/versions.tf | ||
| 20 | |||
| 21 | |||
| 22 | |||
| 23 | # kind runs each k8s "node" as a long-lived podman container. The default pids_limit = 2048 causes ingress-nginx to hit pthread EAGAIN once the control plane warms up. Raise it once, then restart podman: | ||
| 24 | sudo mkdir -p /etc/containers/containers.conf.d | ||
| 25 | printf '[containers]\npids_limit = 0\n' \ | ||
| 26 | | sudo tee /etc/containers/containers.conf.d/99-kind-pids.conf | ||
| 27 | sudo systemctl restart podman.socket podman 2>/dev/null || true | ||
| 28 | |||
| 29 | |||
| 30 | make help # lists self-explanatory commands you can copy and paste | ||
| 31 | |||
| 32 | |||
| 33 | |||
| 34 | # Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README. | ||
| 35 | |||
| 36 | http://llm.dev.localtest.me:8080 | ||
| 37 | $curl http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq | ||
| 38 | |||
| 39 | http://llm.prod.localtest.me:8080 | ||
| 40 | $curl http://llm.prod.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq | ||
| 41 | |||
| 42 | |||
| 43 | # Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization. | ||
| 44 | |||
| 45 | Fire 10 chat requests against dev to populate metrics | ||
| 46 | $for i in {1..10}; do curl -s http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' >/dev/null & done; wait | ||
| 47 | |||
| 48 | Raw /metrics (vLLM exposes natively) | ||
| 49 | $curl -s http://llm.dev.localtest.me:8080/metrics | grep '^vllm:' | head | ||
| 50 | |||
| 51 | Request latency p95 (seconds) — via Prometheus | ||
| 52 | $curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(vllm:e2e_request_latency_seconds_bucket[5m])))' | jq .data.result | ||
| 53 | |||
| 54 | CPU cores in use per vLLM pod (CPU-only inference — no GPU on this stack) | ||
| 55 | $curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="llm-dev",pod=~"llm-llm-app.*",container!="",container!="POD"}[5m]))' | jq .data.result | ||
| 56 | |||
| 57 | In-flight requests per pod (the same metric the prod HPA scales on) | ||
| 58 | $curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (namespace, pod) (vllm:num_requests_running)' | jq .data.result | ||
| 59 | |||
| 60 | |||
| 61 | # stretch 1 | ||
| 62 | deployed /agent/agent.py in a container that uses the backend model server to calculate the product of two numbers | ||
| 63 | $curl http://agent.localtest.me:8080/ask -H 'Content-Type: application/json' -d '{"question":"what is 17 * 23?"}' # need to make up-agent first | ||
| 64 | |||
| 65 | # stretch 2 | ||
| 66 | horizontal pod scaling by counting total inflight requests, up to a total of 3 pods | ||
| 67 | term1 | ||
| 68 | $(trap 'kill 0' INT; for i in {1..5}; do \ | ||
| 69 | curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \ | ||
| 70 | -H 'Content-Type: application/json' \ | ||
| 71 | -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":32}' \ | ||
| 72 | >/dev/null & | ||
| 73 | done; wait) | ||
| 74 | term2 | ||
| 75 | $kubectl -n llm-prod get hpa -w # tip: `watch -n1 kubectl -n llm-prod get hpa` refreshes faster than -w | ||
| 76 | |||
| 77 | # stretch 3 | ||
| 78 | image pinning — prod uses repo@sha256:<digest> (resolved via scripts/resolve-digests.sh); | ||
| 79 | terraform/envs/prod/main.tf | ||
| 80 | dev tracks :latest. The chart prefers digest over tag when both are set. | ||
| 81 | |||
| 82 | #stretch 4 | ||
| 83 | smoke test | ||
| 84 | charts/llm-app/templates/smoketest-job.yaml reruns after every install or upgrade | ||
| 85 | it only checks that the response contains a "content" field (no functional assertions on the generated text), then passes | ||
| 86 | |||
| 87 | |||
| 88 | |||
| 89 | |||
| 90 | all stretch goals are implemented except the Atlantis/GitOps config, which was skipped because it would require either a local Atlantis setup or a GitHub-hosted repo | ||
diff --git a/agent/Dockerfile b/agent/Dockerfile new file mode 100644 index 0000000..509c3b6 --- /dev/null +++ b/agent/Dockerfile | |||
| @@ -0,0 +1,6 @@ | |||
| 1 | FROM python:3.12-slim | ||
| 2 | RUN pip install --no-cache-dir 'openai>=1.59.2,<2' 'httpx<0.28' | ||
| 3 | WORKDIR /app | ||
| 4 | COPY agent.py /app/agent.py | ||
| 5 | EXPOSE 8001 | ||
| 6 | CMD ["python", "/app/agent.py"] | ||
diff --git a/agent/agent.py b/agent/agent.py new file mode 100644 index 0000000..12ad9d6 --- /dev/null +++ b/agent/agent.py | |||
| @@ -0,0 +1,162 @@ | |||
| 1 | """Tool-using agent over an OpenAI-compatible backend. | ||
| 2 | |||
| 3 | Uses the standard OpenAI tools API (function calling). vLLM maps this to the | ||
| 4 | model's native tool-call template (Qwen here), so small models follow the | ||
| 5 | protocol much more reliably than a hand-rolled text convention. | ||
| 6 | |||
| 7 | POST /ask {"question": "..."} -> {"answer": "...", "transcript": [...]} | ||
| 8 | GET /health -> "ok" | ||
| 9 | """ | ||
| 10 | import json | ||
| 11 | import os | ||
| 12 | import re | ||
| 13 | from http.server import BaseHTTPRequestHandler, HTTPServer | ||
| 14 | |||
| 15 | from openai import OpenAI | ||
| 16 | |||
| 17 | client = OpenAI( | ||
| 18 | base_url=os.environ["OPENAI_BASE_URL"], | ||
| 19 | api_key=os.environ.get("OPENAI_API_KEY", "sk-local"), | ||
| 20 | ) | ||
| 21 | MODEL = os.environ.get("MODEL", "Qwen2.5-1.5B-Instruct") | ||
| 22 | MAX_STEPS = int(os.environ.get("MAX_STEPS", "6")) | ||
| 23 | |||
| 24 | SYSTEM = ( | ||
| 25 | "You are a careful math assistant. When the user asks any arithmetic question, " | ||
| 26 | "call the 'calc' tool with the exact expression. Do not compute arithmetic in your head. " | ||
| 27 | "After you receive the tool result, give a concise final answer." | ||
| 28 | ) | ||
| 29 | |||
| 30 | TOOLS = [ | ||
| 31 | { | ||
| 32 | "type": "function", | ||
| 33 | "function": { | ||
| 34 | "name": "calc", | ||
| 35 | "description": "Evaluate a safe arithmetic expression and return the numeric result.", | ||
| 36 | "parameters": { | ||
| 37 | "type": "object", | ||
| 38 | "properties": { | ||
| 39 | "expression": { | ||
| 40 | "type": "string", | ||
| 41 | "description": "Arithmetic expression using only digits, spaces, and + - * / . ( )", | ||
| 42 | } | ||
| 43 | }, | ||
| 44 | "required": ["expression"], | ||
| 45 | }, | ||
| 46 | }, | ||
| 47 | } | ||
| 48 | ] | ||
| 49 | |||
| 50 | SAFE_EXPR = re.compile(r"^[\d\s+\-*/().]+$") | ||
| 51 | |||
| 52 | |||
| 53 | def calc(expression: str) -> str: | ||
| 54 | if not SAFE_EXPR.fullmatch(expression): | ||
| 55 | return "ERROR: disallowed characters" | ||
| 56 | try: | ||
| 57 | return str(eval(expression, {"__builtins__": {}}, {})) # noqa: S307 | ||
| 58 | except Exception as e: | ||
| 59 | return f"ERROR: {e}" | ||
| 60 | |||
| 61 | |||
| 62 | def run_agent(question: str) -> dict: | ||
| 63 | messages = [ | ||
| 64 | {"role": "system", "content": SYSTEM}, | ||
| 65 | {"role": "user", "content": question}, | ||
| 66 | ] | ||
| 67 | transcript: list = [] | ||
| 68 | |||
| 69 | for step in range(MAX_STEPS): | ||
| 70 | resp = client.chat.completions.create( | ||
| 71 | model=MODEL, | ||
| 72 | messages=messages, | ||
| 73 | tools=TOOLS, | ||
| 74 | tool_choice="auto", | ||
| 75 | temperature=0.0, | ||
| 76 | max_tokens=256, | ||
| 77 | ) | ||
| 78 | msg = resp.choices[0].message | ||
| 79 | |||
| 80 | # Always append the assistant message (with any tool_calls) to history. | ||
| 81 | assistant_entry = {"role": "assistant", "content": msg.content or ""} | ||
| 82 | if msg.tool_calls: | ||
| 83 | assistant_entry["tool_calls"] = [ | ||
| 84 | { | ||
| 85 | "id": tc.id, | ||
| 86 | "type": "function", | ||
| 87 | "function": {"name": tc.function.name, "arguments": tc.function.arguments}, | ||
| 88 | } | ||
| 89 | for tc in msg.tool_calls | ||
| 90 | ] | ||
| 91 | messages.append(assistant_entry) | ||
| 92 | |||
| 93 | transcript.append( | ||
| 94 | { | ||
| 95 | "step": step + 1, | ||
| 96 | "content": msg.content, | ||
| 97 | "tool_calls": [ | ||
| 98 | {"name": tc.function.name, "arguments": tc.function.arguments} | ||
| 99 | for tc in (msg.tool_calls or []) | ||
| 100 | ], | ||
| 101 | } | ||
| 102 | ) | ||
| 103 | |||
| 104 | if msg.tool_calls: | ||
| 105 | for tc in msg.tool_calls: | ||
| 106 | if tc.function.name != "calc": | ||
| 107 | result = f"ERROR: unknown tool {tc.function.name}" | ||
| 108 | else: | ||
| 109 | try: | ||
| 110 | args = json.loads(tc.function.arguments) | ||
| 111 | except json.JSONDecodeError: | ||
| 112 | result = "ERROR: bad JSON arguments" | ||
| 113 | else: | ||
| 114 | result = calc(args.get("expression", "")) | ||
| 115 | transcript.append({"tool_result": {"name": tc.function.name, "result": result}}) | ||
| 116 | messages.append( | ||
| 117 | {"role": "tool", "tool_call_id": tc.id, "content": result} | ||
| 118 | ) | ||
| 119 | continue | ||
| 120 | |||
| 121 | # No tool call -> model produced a final answer. | ||
| 122 | return {"answer": (msg.content or "").strip(), "steps": step + 1, "transcript": transcript} | ||
| 123 | |||
| 124 | return {"answer": None, "steps": MAX_STEPS, "note": "MAX_STEPS reached", "transcript": transcript} | ||
| 125 | |||
| 126 | |||
| 127 | class Handler(BaseHTTPRequestHandler): | ||
| 128 | def do_POST(self): # noqa: N802 | ||
| 129 | if self.path != "/ask": | ||
| 130 | self.send_response(404); self.end_headers(); return | ||
| 131 | n = int(self.headers.get("Content-Length", "0")) | ||
| 132 | try: | ||
| 133 | body = json.loads(self.rfile.read(n) or b"{}") | ||
| 134 | except json.JSONDecodeError: | ||
| 135 | self.send_response(400); self.end_headers(); self.wfile.write(b'{"error":"invalid json"}'); return | ||
| 136 | q = body.get("question", "") | ||
| 137 | try: | ||
| 138 | result = run_agent(q) | ||
| 139 | code = 200 | ||
| 140 | except Exception as e: | ||
| 141 | result = {"error": str(e), "type": type(e).__name__} | ||
| 142 | code = 500 | ||
| 143 | payload = json.dumps(result).encode() | ||
| 144 | self.send_response(code) | ||
| 145 | self.send_header("Content-Type", "application/json") | ||
| 146 | self.send_header("Content-Length", str(len(payload))) | ||
| 147 | self.end_headers() | ||
| 148 | self.wfile.write(payload) | ||
| 149 | |||
| 150 | def do_GET(self): # noqa: N802 | ||
| 151 | if self.path == "/health": | ||
| 152 | self.send_response(200); self.end_headers(); self.wfile.write(b"ok"); return | ||
| 153 | self.send_response(404); self.end_headers() | ||
| 154 | |||
| 155 | def log_message(self, fmt, *args): | ||
| 156 | import sys | ||
| 157 | print(f"{self.address_string()} {fmt % args}", file=sys.stderr) | ||
| 158 | |||
| 159 | |||
| 160 | if __name__ == "__main__": | ||
| 161 | print(f"agent starting on :8001, model={MODEL}, backend={os.environ['OPENAI_BASE_URL']}") | ||
| 162 | HTTPServer(("0.0.0.0", 8001), Handler).serve_forever() | ||
diff --git a/charts/llm-app/Chart.yaml b/charts/llm-app/Chart.yaml new file mode 100644 index 0000000..e0747df --- /dev/null +++ b/charts/llm-app/Chart.yaml | |||
| @@ -0,0 +1,6 @@ | |||
| 1 | apiVersion: v2 | ||
| 2 | name: llm-app | ||
| 3 | description: OpenAI-compatible LLM server (vLLM CPU) with Prometheus metrics | ||
| 4 | type: application | ||
| 5 | version: 0.1.0 | ||
| 6 | appVersion: "latest" | ||
diff --git a/charts/llm-app/templates/_helpers.tpl b/charts/llm-app/templates/_helpers.tpl new file mode 100644 index 0000000..8b104de --- /dev/null +++ b/charts/llm-app/templates/_helpers.tpl | |||
| @@ -0,0 +1,8 @@ | |||
| 1 | {{- define "llm-app.fullname" -}} | ||
| 2 | {{- printf "%s-%s" .Release.Name .Chart.Name | trunc 63 | trimSuffix "-" -}} | ||
| 3 | {{- end -}} | ||
| 4 | |||
| 5 | {{- define "llm-app.selectorLabels" -}} | ||
| 6 | app.kubernetes.io/name: {{ .Chart.Name }} | ||
| 7 | app.kubernetes.io/instance: {{ .Release.Name }} | ||
| 8 | {{- end -}} | ||
diff --git a/charts/llm-app/templates/deployment.yaml b/charts/llm-app/templates/deployment.yaml new file mode 100644 index 0000000..12677b5 --- /dev/null +++ b/charts/llm-app/templates/deployment.yaml | |||
| @@ -0,0 +1,76 @@ | |||
| 1 | apiVersion: apps/v1 | ||
| 2 | kind: Deployment | ||
| 3 | metadata: | ||
| 4 | name: {{ include "llm-app.fullname" . }} | ||
| 5 | spec: | ||
| 6 | replicas: {{ .Values.replicaCount }} | ||
| 7 | selector: | ||
| 8 | matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }} | ||
| 9 | template: | ||
| 10 | metadata: | ||
| 11 | labels: {{- include "llm-app.selectorLabels" . | nindent 8 }} | ||
| 12 | spec: | ||
| 13 | containers: | ||
| 14 | - name: vllm-server | ||
| 15 | # Image entrypoint is already `vllm serve`; args start with the model tag. | ||
| 16 | image: "{{ .Values.image.repository }}{{ if .Values.image.digest }}@{{ .Values.image.digest }}{{ else }}:{{ .Values.image.tag }}{{ end }}" | ||
| 17 | imagePullPolicy: {{ .Values.image.pullPolicy }} | ||
| 18 | args: | ||
| 19 | - {{ .Values.model.name | quote }} | ||
| 20 | - "--host" | ||
| 21 | - "0.0.0.0" | ||
| 22 | - "--port" | ||
| 23 | - {{ .Values.server.port | quote }} | ||
| 24 | - "--served-model-name" | ||
| 25 | - {{ .Values.model.alias | quote }} | ||
| 26 | - "--max-model-len" | ||
| 27 | - {{ .Values.model.maxModelLen | quote }} | ||
| 28 | - "--dtype" | ||
| 29 | - {{ .Values.model.dtype | quote }} | ||
| 30 | {{- with .Values.server.extraArgs }} | ||
| 31 | {{- toYaml . | nindent 12 }} | ||
| 32 | {{- end }} | ||
| 33 | env: | ||
| 34 | - name: HF_HOME | ||
| 35 | value: /cache/huggingface | ||
| 36 | - name: VLLM_CPU_KVCACHE_SPACE | ||
| 37 | value: "2" | ||
| 38 | {{- if gt (int .Values.server.ompThreads) 0 }} | ||
| 39 | - name: OMP_NUM_THREADS | ||
| 40 | value: {{ .Values.server.ompThreads | quote }} | ||
| 41 | {{- end }} | ||
| 42 | ports: | ||
| 43 | - name: http | ||
| 44 | containerPort: {{ .Values.server.port }} | ||
| 45 | protocol: TCP | ||
| 46 | readinessProbe: | ||
| 47 | httpGet: | ||
| 48 | path: /health | ||
| 49 | port: http | ||
| 50 | # vLLM CPU cold-start is ~2 min + HF download on first boot. | ||
| 51 | initialDelaySeconds: 60 | ||
| 52 | periodSeconds: 10 | ||
| 53 | timeoutSeconds: 5 | ||
| 54 | failureThreshold: 180 | ||
| 55 | livenessProbe: | ||
| 56 | httpGet: | ||
| 57 | path: /health | ||
| 58 | port: http | ||
| 59 | initialDelaySeconds: 600 | ||
| 60 | periodSeconds: 30 | ||
| 61 | timeoutSeconds: 5 | ||
| 62 | failureThreshold: 6 | ||
| 63 | resources: {{- toYaml .Values.resources | nindent 12 }} | ||
| 64 | volumeMounts: | ||
| 65 | - name: cache | ||
| 66 | mountPath: /cache | ||
| 67 | - name: shm | ||
| 68 | mountPath: /dev/shm | ||
| 69 | volumes: | ||
| 70 | - name: cache | ||
| 71 | emptyDir: | ||
| 72 | sizeLimit: {{ .Values.modelCache.sizeLimit }} | ||
| 73 | - name: shm | ||
| 74 | emptyDir: | ||
| 75 | medium: Memory | ||
| 76 | sizeLimit: 1Gi | ||
diff --git a/charts/llm-app/templates/ingress.yaml b/charts/llm-app/templates/ingress.yaml new file mode 100644 index 0000000..f3a6ded --- /dev/null +++ b/charts/llm-app/templates/ingress.yaml | |||
| @@ -0,0 +1,19 @@ | |||
| 1 | {{- if .Values.ingress.enabled -}} | ||
| 2 | apiVersion: networking.k8s.io/v1 | ||
| 3 | kind: Ingress | ||
| 4 | metadata: | ||
| 5 | name: {{ include "llm-app.fullname" . }} | ||
| 6 | spec: | ||
| 7 | ingressClassName: {{ .Values.ingress.className }} | ||
| 8 | rules: | ||
| 9 | - host: {{ .Values.ingress.host | quote }} | ||
| 10 | http: | ||
| 11 | paths: | ||
| 12 | - path: / | ||
| 13 | pathType: Prefix | ||
| 14 | backend: | ||
| 15 | service: | ||
| 16 | name: {{ include "llm-app.fullname" . }} | ||
| 17 | port: | ||
| 18 | number: {{ .Values.service.port }} | ||
| 19 | {{- end }} | ||
diff --git a/charts/llm-app/templates/service.yaml b/charts/llm-app/templates/service.yaml new file mode 100644 index 0000000..6350996 --- /dev/null +++ b/charts/llm-app/templates/service.yaml | |||
| @@ -0,0 +1,13 @@ | |||
| 1 | apiVersion: v1 | ||
| 2 | kind: Service | ||
| 3 | metadata: | ||
| 4 | name: {{ include "llm-app.fullname" . }} | ||
| 5 | labels: {{- include "llm-app.selectorLabels" . | nindent 4 }} | ||
| 6 | spec: | ||
| 7 | type: {{ .Values.service.type }} | ||
| 8 | ports: | ||
| 9 | - name: http | ||
| 10 | port: {{ .Values.service.port }} | ||
| 11 | targetPort: http | ||
| 12 | protocol: TCP | ||
| 13 | selector: {{- include "llm-app.selectorLabels" . | nindent 4 }} | ||
diff --git a/charts/llm-app/templates/servicemonitor.yaml b/charts/llm-app/templates/servicemonitor.yaml new file mode 100644 index 0000000..264e766 --- /dev/null +++ b/charts/llm-app/templates/servicemonitor.yaml | |||
| @@ -0,0 +1,19 @@ | |||
| 1 | {{- if .Values.monitoring.serviceMonitor.enabled -}} | ||
| 2 | apiVersion: monitoring.coreos.com/v1 | ||
| 3 | kind: ServiceMonitor | ||
| 4 | metadata: | ||
| 5 | name: {{ include "llm-app.fullname" . }} | ||
| 6 | {{- with .Values.monitoring.serviceMonitor.labels }} | ||
| 7 | labels: {{- toYaml . | nindent 4 }} | ||
| 8 | {{- end }} | ||
| 9 | spec: | ||
| 10 | selector: | ||
| 11 | matchLabels: {{- include "llm-app.selectorLabels" . | nindent 6 }} | ||
| 12 | endpoints: | ||
| 13 | - port: http | ||
| 14 | path: /metrics | ||
| 15 | interval: {{ .Values.monitoring.serviceMonitor.interval }} | ||
| 16 | namespaceSelector: | ||
| 17 | matchNames: | ||
| 18 | - {{ .Release.Namespace }} | ||
| 19 | {{- end }} | ||
diff --git a/charts/llm-app/templates/smoketest-job.yaml b/charts/llm-app/templates/smoketest-job.yaml new file mode 100644 index 0000000..ac97f33 --- /dev/null +++ b/charts/llm-app/templates/smoketest-job.yaml | |||
| @@ -0,0 +1,32 @@ | |||
| 1 | apiVersion: batch/v1 | ||
| 2 | kind: Job | ||
| 3 | metadata: | ||
| 4 | name: {{ include "llm-app.fullname" . }}-smoketest | ||
| 5 | annotations: | ||
| 6 | "helm.sh/hook": post-install,post-upgrade | ||
| 7 | "helm.sh/hook-weight": "10" | ||
| 8 | "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded | ||
| 9 | spec: | ||
| 10 | backoffLimit: 2 | ||
| 11 | activeDeadlineSeconds: 240 | ||
| 12 | ttlSecondsAfterFinished: 600 | ||
| 13 | template: | ||
| 14 | spec: | ||
| 15 | restartPolicy: Never | ||
| 16 | containers: | ||
| 17 | - name: curl | ||
| 18 | image: curlimages/curl:8.10.1 | ||
| 19 | command: ["/bin/sh", "-euc"] | ||
| 20 | args: | ||
| 21 | - | | ||
| 22 | ENDPOINT="http://{{ include "llm-app.fullname" . }}:{{ .Values.service.port }}" | ||
| 23 | MODEL={{ .Values.model.alias | quote }} | ||
| 24 | echo "smoketest: GET $ENDPOINT/v1/models" | ||
| 25 | out=$(curl -fsS --max-time 60 "$ENDPOINT/v1/models") | ||
| 26 | echo "$out" | grep -q "\"$MODEL\"" || { echo "FAIL: $MODEL not listed in /v1/models"; echo "$out"; exit 1; } | ||
| 27 | echo "smoketest: POST $ENDPOINT/v1/chat/completions" | ||
| 28 | resp=$(curl -fsS --max-time 90 "$ENDPOINT/v1/chat/completions" \ | ||
| 29 | -H "Content-Type: application/json" \ | ||
| 30 | -d "{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Reply with just: pong\"}],\"max_tokens\":8,\"temperature\":0}") | ||
| 31 | echo "$resp" | grep -q '"content"' || { echo "FAIL: no content in response"; echo "$resp"; exit 1; } | ||
| 32 | echo "OK" | ||
diff --git a/charts/llm-app/values.yaml b/charts/llm-app/values.yaml new file mode 100644 index 0000000..96c5c9a --- /dev/null +++ b/charts/llm-app/values.yaml | |||
| @@ -0,0 +1,51 @@ | |||
| 1 | replicaCount: 1 | ||
| 2 | |||
| 3 | image: | ||
| 4 | # vLLM CPU-only image (no CUDA, works on AVX2+). | ||
| 5 | repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo | ||
| 6 | tag: latest | ||
| 7 | # Optional. If set, used in place of `tag` to pin the image by content. | ||
| 8 | # Example: "sha256:abc123...". Fill via scripts/resolve-digests.sh. | ||
| 9 | digest: "" | ||
| 10 | pullPolicy: IfNotPresent | ||
| 11 | |||
| 12 | # vLLM pulls model weights from HuggingFace at first boot into the cache volume. | ||
| 13 | # `name` is the HF repo id (also used as `--served-model-name` unless `alias` overrides). | ||
| 14 | model: | ||
| 15 | name: "Qwen/Qwen2.5-0.5B-Instruct" | ||
| 16 | alias: "Qwen2.5-0.5B-Instruct" | ||
| 17 | maxModelLen: 2048 | ||
| 18 | dtype: "bfloat16" | ||
| 19 | |||
| 20 | server: | ||
| 21 | port: 8000 | ||
| 22 | # OMP threads for the CPU backend; 0 = autodetect. | ||
| 23 | ompThreads: 0 | ||
| 24 | extraArgs: [] | ||
| 25 | |||
| 26 | resources: | ||
| 27 | requests: | ||
| 28 | cpu: "500m" | ||
| 29 | memory: "1Gi" | ||
| 30 | limits: | ||
| 31 | cpu: "2" | ||
| 32 | memory: "3Gi" | ||
| 33 | |||
| 34 | service: | ||
| 35 | type: ClusterIP | ||
| 36 | port: 8000 | ||
| 37 | |||
| 38 | ingress: | ||
| 39 | enabled: true | ||
| 40 | className: nginx | ||
| 41 | host: llm.localtest.me | ||
| 42 | |||
| 43 | monitoring: | ||
| 44 | serviceMonitor: | ||
| 45 | enabled: true | ||
| 46 | interval: 15s | ||
| 47 | labels: | ||
| 48 | release: kube-prometheus-stack | ||
| 49 | |||
| 50 | modelCache: | ||
| 51 | sizeLimit: 10Gi | ||
diff --git a/cluster/kind-config.yaml b/cluster/kind-config.yaml new file mode 100644 index 0000000..c0306ce --- /dev/null +++ b/cluster/kind-config.yaml | |||
| @@ -0,0 +1,21 @@ | |||
| 1 | kind: Cluster | ||
| 2 | apiVersion: kind.x-k8s.io/v1alpha4 | ||
| 3 | name: llm-local | ||
| 4 | nodes: | ||
| 5 | - role: control-plane | ||
| 6 | image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f | ||
| 7 | kubeadmConfigPatches: | ||
| 8 | - | | ||
| 9 | kind: InitConfiguration | ||
| 10 | nodeRegistration: | ||
| 11 | kubeletExtraArgs: | ||
| 12 | node-labels: "ingress-ready=true" | ||
| 13 | extraPortMappings: | ||
| 14 | - containerPort: 80 | ||
| 15 | hostPort: 8080 | ||
| 16 | protocol: TCP | ||
| 17 | - containerPort: 443 | ||
| 18 | hostPort: 8443 | ||
| 19 | protocol: TCP | ||
| 20 | - role: worker | ||
| 21 | image: kindest/node:v1.35.0@sha256:452d707d4862f52530247495d180205e029056831160e22870e37e3f6c1ac31f | ||
| @@ -0,0 +1,18 @@ | |||
| 1 | ### Task | ||
| 2 | 1. Stand up a local K8s cluster with `kind`, `k3d`, or `minikube`. Document exact versions. | ||
| 3 | 2. Write a Helm chart (or use the upstream vLLM/SGLang chart and extend it) that deploys a small open-weights model — e.g. `Qwen2.5-0.5B-Instruct`, `Llama-3.2-1B-Instruct`, or any model that fits on CPU/small GPU. CPU-only inference is acceptable. | ||
| 4 | 3. Wrap it in Terraform (or OpenTofu) using the `helm` and `kubernetes` providers. | ||
| 5 | 4. Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README. | ||
| 6 | 5. Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization. | ||
| 7 | 6. Two environments — `dev` and `prod` — differ by at least: replica count, resource requests/limits, and model choice. Use Terraform workspaces, tfvars, or environment directories; justify your choice. | ||
| 8 | |||
| 9 | Stretch Goals | ||
| 10 | - Deploy a separate application container containing an agentic system utilizing the deployed vLLM/SGLang as the backend model server. The agent system's use-case is free to you to choose. | ||
| 11 | - HPA based on a custom metric (e.g. queue depth or tokens/sec) | ||
| 12 | - Image digest pinning and an `atlantis.yaml` or equivalent GitOps config | ||
| 13 | - A smoke-test job that runs post-deploy and fails the apply if the endpoint is unhealthy | ||
| 14 | |||
| 15 | You will be assessed on the following criteria: | ||
| 16 | - the correctness of its output (stochastic functions notwithstanding); | ||
| 17 | - how reliable, testable, modular and clean your code is; | ||
| 18 | - other interesting add-ons you can think of. | ||
diff --git a/scripts/resolve-digests.sh b/scripts/resolve-digests.sh new file mode 100755 index 0000000..526d463 --- /dev/null +++ b/scripts/resolve-digests.sh | |||
| @@ -0,0 +1,31 @@ | |||
| 1 | #!/usr/bin/env bash | ||
| 2 | # Resolve an image tag to a content-addressable digest for pinning. | ||
| 3 | # | ||
| 4 | # Usage: | ||
| 5 | # scripts/resolve-digests.sh public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest | ||
| 6 | # scripts/resolve-digests.sh # default image | ||
| 7 | # | ||
| 8 | # Prints three lines: | ||
| 9 | # repo: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo | ||
| 10 | # digest: sha256:abc123... | ||
| 11 | # pin: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo@sha256:abc123... | ||
| 12 | # | ||
| 13 | # Paste the digest into the env's terraform (var.image_digest) to pin. | ||
| 14 | set -euo pipefail | ||
| 15 | |||
| 16 | IMG="${1:-public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest}" | ||
| 17 | |||
| 18 | engine="" | ||
| 19 | if command -v podman >/dev/null 2>&1; then engine=podman | ||
| 20 | elif command -v docker >/dev/null 2>&1; then engine=docker | ||
| 21 | else | ||
| 22 | echo "need podman or docker on PATH" >&2; exit 1 | ||
| 23 | fi | ||
| 24 | |||
| 25 | "$engine" pull --quiet "$IMG" >/dev/null | ||
| 26 | digest="$("$engine" image inspect "$IMG" --format '{{.Digest}}')" | ||
| 27 | repo="${IMG%:*}" | ||
| 28 | |||
| 29 | printf 'repo: %s\n' "$repo" | ||
| 30 | printf 'digest: %s\n' "$digest" | ||
| 31 | printf 'pin: %s@%s\n' "$repo" "$digest" | ||
diff --git a/terraform/envs/agent/.terraform.lock.hcl b/terraform/envs/agent/.terraform.lock.hcl new file mode 100644 index 0000000..605df33 --- /dev/null +++ b/terraform/envs/agent/.terraform.lock.hcl | |||
| @@ -0,0 +1,19 @@ | |||
| 1 | # This file is maintained automatically by "tofu init". | ||
| 2 | # Manual edits may be lost in future updates. | ||
| 3 | |||
| 4 | provider "registry.opentofu.org/hashicorp/kubernetes" { | ||
| 5 | version = "2.38.0" | ||
| 6 | constraints = "~> 2.31" | ||
| 7 | hashes = [ | ||
| 8 | "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", | ||
| 9 | "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", | ||
| 10 | "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", | ||
| 11 | "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", | ||
| 12 | "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", | ||
| 13 | "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", | ||
| 14 | "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", | ||
| 15 | "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", | ||
| 16 | "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", | ||
| 17 | "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", | ||
| 18 | ] | ||
| 19 | } | ||
diff --git a/terraform/envs/agent/backend.tf b/terraform/envs/agent/backend.tf new file mode 100644 index 0000000..3c533e6 --- /dev/null +++ b/terraform/envs/agent/backend.tf | |||
| @@ -0,0 +1,5 @@ | |||
terraform {
  # Single-user local kind cluster: plain local state is sufficient.
  backend "local" {
    path = "terraform.tfstate"
  }
}
diff --git a/terraform/envs/agent/main.tf b/terraform/envs/agent/main.tf new file mode 100644 index 0000000..122eaca --- /dev/null +++ b/terraform/envs/agent/main.tf | |||
| @@ -0,0 +1,27 @@ | |||
# Provider targets the local kind cluster selected by var.kube_context.
provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}

module "agent" {
  source = "../../modules/agent"

  namespace         = "agent"
  agent_source_path = var.agent_source_path

  # Point at the prod LLM. `svc.cluster.local` resolves from any namespace.
  llm_service_url = "http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
  model_alias     = "Qwen2.5-1.5B-Instruct"

  ingress_host = "agent.localtest.me"
}

output "ingress_host" {
  value = module.agent.ingress_host
}

output "service_dns" {
  value = module.agent.service_dns
}

# Ready-to-paste smoke test against the agent's /ask endpoint.
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.agent.ingress_host}:8080/ask \
    -H 'Content-Type: application/json' \
    -d '{"question":"what is 123 * 47?"}'
  EOT
}
diff --git a/terraform/envs/agent/variables.tf b/terraform/envs/agent/variables.tf new file mode 100644 index 0000000..bf005b9 --- /dev/null +++ b/terraform/envs/agent/variables.tf | |||
| @@ -0,0 +1,14 @@ | |||
variable "kubeconfig" {
  type        = string
  default     = "~/.kube/config"
  description = "Path to the kubeconfig file (tilde-expanded via pathexpand in main.tf)."
}

variable "kube_context" {
  type        = string
  default     = "kind-llm-local"
  description = "kubeconfig context of the target kind cluster."
}

variable "agent_source_path" {
  type        = string
  description = "Absolute path to agent/agent.py"
}
diff --git a/terraform/envs/agent/versions.tf b/terraform/envs/agent/versions.tf new file mode 100644 index 0000000..69cf77e --- /dev/null +++ b/terraform/envs/agent/versions.tf | |||
| @@ -0,0 +1,6 @@ | |||
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.31"
    }
  }
}
diff --git a/terraform/envs/bootstrap/.terraform.lock.hcl b/terraform/envs/bootstrap/.terraform.lock.hcl new file mode 100644 index 0000000..baa0088 --- /dev/null +++ b/terraform/envs/bootstrap/.terraform.lock.hcl | |||
| @@ -0,0 +1,37 @@ | |||
| 1 | # This file is maintained automatically by "tofu init". | ||
| 2 | # Manual edits may be lost in future updates. | ||
| 3 | |||
| 4 | provider "registry.opentofu.org/hashicorp/helm" { | ||
| 5 | version = "2.17.0" | ||
| 6 | constraints = "~> 2.17" | ||
| 7 | hashes = [ | ||
| 8 | "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=", | ||
| 9 | "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b", | ||
| 10 | "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a", | ||
| 11 | "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0", | ||
| 12 | "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe", | ||
| 13 | "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4", | ||
| 14 | "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25", | ||
| 15 | "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d", | ||
| 16 | "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978", | ||
| 17 | "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb", | ||
| 18 | "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0", | ||
| 19 | ] | ||
| 20 | } | ||
| 21 | |||
| 22 | provider "registry.opentofu.org/hashicorp/kubernetes" { | ||
| 23 | version = "2.38.0" | ||
| 24 | constraints = "~> 2.31" | ||
| 25 | hashes = [ | ||
| 26 | "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", | ||
| 27 | "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", | ||
| 28 | "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", | ||
| 29 | "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", | ||
| 30 | "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", | ||
| 31 | "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", | ||
| 32 | "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", | ||
| 33 | "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", | ||
| 34 | "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", | ||
| 35 | "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", | ||
| 36 | ] | ||
| 37 | } | ||
diff --git a/terraform/envs/bootstrap/backend.tf b/terraform/envs/bootstrap/backend.tf new file mode 100644 index 0000000..3c533e6 --- /dev/null +++ b/terraform/envs/bootstrap/backend.tf | |||
| @@ -0,0 +1,5 @@ | |||
terraform {
  # Local state: this env only drives a throwaway local kind cluster.
  backend "local" {
    path = "terraform.tfstate"
  }
}
diff --git a/terraform/envs/bootstrap/main.tf b/terraform/envs/bootstrap/main.tf new file mode 100644 index 0000000..07bf04d --- /dev/null +++ b/terraform/envs/bootstrap/main.tf | |||
| @@ -0,0 +1,25 @@ | |||
# Both providers point at the same local kind cluster.
provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}

provider "helm" {
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}

# Cluster-wide observability stack, deployed once per cluster.
module "observability" {
  source                 = "../../modules/observability"
  namespace              = "monitoring"
  grafana_admin_password = var.grafana_admin_password
}

output "grafana" {
  value = module.observability.grafana_service
}

output "prometheus" {
  value = module.observability.prometheus_service
}
diff --git a/terraform/envs/bootstrap/variables.tf b/terraform/envs/bootstrap/variables.tf new file mode 100644 index 0000000..220bed3 --- /dev/null +++ b/terraform/envs/bootstrap/variables.tf | |||
| @@ -0,0 +1,15 @@ | |||
variable "kubeconfig" {
  type        = string
  default     = "~/.kube/config"
  description = "Path to the kubeconfig file."
}

variable "kube_context" {
  type        = string
  default     = "kind-llm-local"
  description = "kubeconfig context of the target kind cluster."
}

variable "grafana_admin_password" {
  type        = string
  default     = "admin" # local-dev default; override via TF_VAR for anything shared
  sensitive   = true
  description = "Grafana admin password handed to the observability module."
}
diff --git a/terraform/envs/bootstrap/versions.tf b/terraform/envs/bootstrap/versions.tf new file mode 100644 index 0000000..0d7f77b --- /dev/null +++ b/terraform/envs/bootstrap/versions.tf | |||
| @@ -0,0 +1,7 @@ | |||
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.17"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.31"
    }
  }
}
diff --git a/terraform/envs/dev/.terraform.lock.hcl b/terraform/envs/dev/.terraform.lock.hcl new file mode 100644 index 0000000..09902a1 --- /dev/null +++ b/terraform/envs/dev/.terraform.lock.hcl | |||
| @@ -0,0 +1,54 @@ | |||
| 1 | # This file is maintained automatically by "tofu init". | ||
| 2 | # Manual edits may be lost in future updates. | ||
| 3 | |||
| 4 | provider "registry.opentofu.org/hashicorp/helm" { | ||
| 5 | version = "2.17.0" | ||
| 6 | constraints = "~> 2.17" | ||
| 7 | hashes = [ | ||
| 8 | "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=", | ||
| 9 | "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b", | ||
| 10 | "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a", | ||
| 11 | "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0", | ||
| 12 | "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe", | ||
| 13 | "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4", | ||
| 14 | "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25", | ||
| 15 | "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d", | ||
| 16 | "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978", | ||
| 17 | "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb", | ||
| 18 | "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0", | ||
| 19 | ] | ||
| 20 | } | ||
| 21 | |||
| 22 | provider "registry.opentofu.org/hashicorp/kubernetes" { | ||
| 23 | version = "2.38.0" | ||
| 24 | constraints = "~> 2.31" | ||
| 25 | hashes = [ | ||
| 26 | "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", | ||
| 27 | "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", | ||
| 28 | "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", | ||
| 29 | "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", | ||
| 30 | "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", | ||
| 31 | "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", | ||
| 32 | "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", | ||
| 33 | "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", | ||
| 34 | "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", | ||
| 35 | "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", | ||
| 36 | ] | ||
| 37 | } | ||
| 38 | |||
| 39 | provider "registry.opentofu.org/hashicorp/random" { | ||
| 40 | version = "3.8.1" | ||
| 41 | constraints = "~> 3.6" | ||
| 42 | hashes = [ | ||
| 43 | "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=", | ||
| 44 | "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8", | ||
| 45 | "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476", | ||
| 46 | "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7", | ||
| 47 | "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3", | ||
| 48 | "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0", | ||
| 49 | "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1", | ||
| 50 | "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f", | ||
| 51 | "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9", | ||
| 52 | "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff", | ||
| 53 | ] | ||
| 54 | } | ||
diff --git a/terraform/envs/dev/backend.tf b/terraform/envs/dev/backend.tf new file mode 100644 index 0000000..3c533e6 --- /dev/null +++ b/terraform/envs/dev/backend.tf | |||
| @@ -0,0 +1,5 @@ | |||
terraform {
  # Per-env local state file, kept next to this configuration.
  backend "local" {
    path = "terraform.tfstate"
  }
}
diff --git a/terraform/envs/dev/main.tf b/terraform/envs/dev/main.tf new file mode 100644 index 0000000..8e1b882 --- /dev/null +++ b/terraform/envs/dev/main.tf | |||
| @@ -0,0 +1,49 @@ | |||
provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}

provider "helm" {
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}

locals {
  env = "dev"
}

# Dev flavour: small model, two fixed replicas, floating `latest` tag.
module "llm" {
  source = "../../modules/llm"

  release_name = "llm"
  namespace    = "llm-${local.env}"
  chart_path   = var.chart_path

  replicas = 2

  model_name    = "Qwen/Qwen2.5-0.5B-Instruct"
  model_alias   = "Qwen2.5-0.5B-Instruct"
  max_model_len = 2048
  dtype         = "bfloat16"
  omp_threads   = 4

  resources = {
    requests = { cpu = "1", memory = "2Gi" }
    limits   = { cpu = "4", memory = "6Gi" }
  }

  ingress_host = "llm.dev.localtest.me"
  image_tag    = "latest"
}

output "ingress_host" {
  value = module.llm.ingress_host
}

output "service_dns" {
  value = module.llm.service_dns
}

# Ready-to-paste smoke test for the OpenAI-compatible endpoint.
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}
diff --git a/terraform/envs/dev/variables.tf b/terraform/envs/dev/variables.tf new file mode 100644 index 0000000..9f1b697 --- /dev/null +++ b/terraform/envs/dev/variables.tf | |||
| @@ -0,0 +1,14 @@ | |||
variable "kubeconfig" {
  type        = string
  default     = "~/.kube/config"
  description = "Path to the kubeconfig file."
}

variable "kube_context" {
  type        = string
  default     = "kind-llm-local"
  description = "kubeconfig context of the target kind cluster."
}

variable "chart_path" {
  type        = string
  description = "Absolute path to charts/llm-app"
}
diff --git a/terraform/envs/dev/versions.tf b/terraform/envs/dev/versions.tf new file mode 100644 index 0000000..6a87674 --- /dev/null +++ b/terraform/envs/dev/versions.tf | |||
| @@ -0,0 +1,8 @@ | |||
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.17"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.31"
    }
    # NOTE(review): no random_* resources are visible in this env's config;
    # confirm the random provider is actually needed before the next lock refresh.
    random = {
      source  = "hashicorp/random"
      version = "~> 3.6"
    }
  }
}
diff --git a/terraform/envs/prod/.terraform.lock.hcl b/terraform/envs/prod/.terraform.lock.hcl new file mode 100644 index 0000000..09902a1 --- /dev/null +++ b/terraform/envs/prod/.terraform.lock.hcl | |||
| @@ -0,0 +1,54 @@ | |||
| 1 | # This file is maintained automatically by "tofu init". | ||
| 2 | # Manual edits may be lost in future updates. | ||
| 3 | |||
| 4 | provider "registry.opentofu.org/hashicorp/helm" { | ||
| 5 | version = "2.17.0" | ||
| 6 | constraints = "~> 2.17" | ||
| 7 | hashes = [ | ||
| 8 | "h1:69PnHoYrrDrm7C8+8PiSvRGPI55taqL14SvQR/FGM+g=", | ||
| 9 | "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b", | ||
| 10 | "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a", | ||
| 11 | "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0", | ||
| 12 | "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe", | ||
| 13 | "zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4", | ||
| 14 | "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25", | ||
| 15 | "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d", | ||
| 16 | "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978", | ||
| 17 | "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb", | ||
| 18 | "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0", | ||
| 19 | ] | ||
| 20 | } | ||
| 21 | |||
| 22 | provider "registry.opentofu.org/hashicorp/kubernetes" { | ||
| 23 | version = "2.38.0" | ||
| 24 | constraints = "~> 2.31" | ||
| 25 | hashes = [ | ||
| 26 | "h1:nY7J9jFXcsRINog0KYagiWZw1GVYF9D2JmtIB7Wnrao=", | ||
| 27 | "zh:1096b41c4e5b2ee6c1980916fb9a8579bc1892071396f7a9432be058aabf3cbc", | ||
| 28 | "zh:2959fde9ae3d1deb5e317df0d7b02ea4977951ee6b9c4beb083c148ca8f3681c", | ||
| 29 | "zh:5082f98fcb3389c73339365f7df39fc6912bf2bd1a46d5f97778f441a67fd337", | ||
| 30 | "zh:620fd5d0fbc2d7a24ac6b420a4922e6093020358162a62fa8cbd37b2bac1d22e", | ||
| 31 | "zh:7f47c2de179bba35d759147c53082cad6c3449d19b0ec0c5a4ca8db5b06393e1", | ||
| 32 | "zh:89c3aa2a87e29febf100fd21cead34f9a4c0e6e7ae5f383b5cef815c677eb52a", | ||
| 33 | "zh:96eecc9f94938a0bc35b8a63d2c4a5f972395e44206620db06760b730d0471fc", | ||
| 34 | "zh:e15567c1095f898af173c281b66bffdc4f3068afdd9f84bb5b5b5521d9f29584", | ||
| 35 | "zh:ecc6b912629734a9a41a7cf1c4c73fb13b4b510afc9e7b2e0011d290bcd6d77f", | ||
| 36 | ] | ||
| 37 | } | ||
| 38 | |||
| 39 | provider "registry.opentofu.org/hashicorp/random" { | ||
| 40 | version = "3.8.1" | ||
| 41 | constraints = "~> 3.6" | ||
| 42 | hashes = [ | ||
| 43 | "h1:EHn3jsqOKhWjbg0X+psk0Ww96yz3N7ASqEKKuFvDFwo=", | ||
| 44 | "zh:25c458c7c676f15705e872202dad7dcd0982e4a48e7ea1800afa5fc64e77f4c8", | ||
| 45 | "zh:2edeaf6f1b20435b2f81855ad98a2e70956d473be9e52a5fdf57ccd0098ba476", | ||
| 46 | "zh:44becb9d5f75d55e36dfed0c5beabaf4c92e0a2bc61a3814d698271c646d48e7", | ||
| 47 | "zh:7699032612c3b16cc69928add8973de47b10ce81b1141f30644a0e8a895b5cd3", | ||
| 48 | "zh:86d07aa98d17703de9fbf402c89590dc1e01dbe5671dd6bc5e487eb8fe87eee0", | ||
| 49 | "zh:8c411c77b8390a49a8a1bc9f176529e6b32369dd33a723606c8533e5ca4d68c1", | ||
| 50 | "zh:a5ecc8255a612652a56b28149994985e2c4dc046e5d34d416d47fa7767f5c28f", | ||
| 51 | "zh:aea3fe1a5669b932eda9c5c72e5f327db8da707fe514aaca0d0ef60cb24892f9", | ||
| 52 | "zh:f56e26e6977f755d7ae56fa6320af96ecf4bb09580d47cb481efbf27f1c5afff", | ||
| 53 | ] | ||
| 54 | } | ||
diff --git a/terraform/envs/prod/backend.tf b/terraform/envs/prod/backend.tf new file mode 100644 index 0000000..3c533e6 --- /dev/null +++ b/terraform/envs/prod/backend.tf | |||
| @@ -0,0 +1,5 @@ | |||
terraform {
  # Local state only — this "prod" is still a local kind environment.
  backend "local" {
    path = "terraform.tfstate"
  }
}
diff --git a/terraform/envs/prod/main.tf b/terraform/envs/prod/main.tf new file mode 100644 index 0000000..04db31d --- /dev/null +++ b/terraform/envs/prod/main.tf | |||
| @@ -0,0 +1,70 @@ | |||
provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}

provider "helm" {
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}

locals {
  env = "prod"
}

# Prod flavour: bigger model, digest-pinned image, HPA instead of fixed replicas.
module "llm" {
  source = "../../modules/llm"

  release_name = "llm"
  namespace    = "llm-${local.env}"
  chart_path   = var.chart_path

  replicas = 1

  model_name    = "Qwen/Qwen2.5-1.5B-Instruct"
  model_alias   = "Qwen2.5-1.5B-Instruct"
  max_model_len = 4096
  dtype         = "bfloat16"
  omp_threads   = 6

  resources = {
    requests = { cpu = "2", memory = "4Gi" }
    limits   = { cpu = "6", memory = "8Gi" }
  }

  ingress_host = "llm.prod.localtest.me"
  image_tag    = "latest"

  # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64.
  # Per-arch digest — re-resolve on a different arch or after an upstream tag move.
  # Dev intentionally runs on `:latest` so new fixes flow in without a PR.
  image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"

  # Enable OpenAI tool-calling so the agent's function-call path works.
  # Qwen 2.5 uses hermes-style tool parsing in vLLM.
  extra_args = [
    "--enable-auto-tool-choice",
    "--tool-call-parser", "hermes",
  ]

  hpa = {
    enabled      = true
    min_replicas = 1
    max_replicas = 3
    # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight
    # requests. 500m == scale up once the per-pod average exceeds 0.5 in-flight.
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }
}

output "ingress_host" {
  value = module.llm.ingress_host
}

output "service_dns" {
  value = module.llm.service_dns
}

# Ready-to-paste smoke test for the OpenAI-compatible endpoint.
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}
diff --git a/terraform/envs/prod/variables.tf b/terraform/envs/prod/variables.tf new file mode 100644 index 0000000..9f1b697 --- /dev/null +++ b/terraform/envs/prod/variables.tf | |||
| @@ -0,0 +1,14 @@ | |||
variable "kubeconfig" {
  type        = string
  default     = "~/.kube/config"
  description = "Path to the kubeconfig file."
}

variable "kube_context" {
  type        = string
  default     = "kind-llm-local"
  description = "kubeconfig context of the target kind cluster."
}

variable "chart_path" {
  type        = string
  description = "Absolute path to charts/llm-app"
}
diff --git a/terraform/envs/prod/versions.tf b/terraform/envs/prod/versions.tf new file mode 100644 index 0000000..6a87674 --- /dev/null +++ b/terraform/envs/prod/versions.tf | |||
| @@ -0,0 +1,8 @@ | |||
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.17"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.31"
    }
    # NOTE(review): no random_* resources are visible in this env's config;
    # confirm the random provider is actually needed before the next lock refresh.
    random = {
      source  = "hashicorp/random"
      version = "~> 3.6"
    }
  }
}
diff --git a/terraform/modules/agent/main.tf b/terraform/modules/agent/main.tf new file mode 100644 index 0000000..f53acdc --- /dev/null +++ b/terraform/modules/agent/main.tf | |||
| @@ -0,0 +1,114 @@ | |||
# Dedicated namespace for the agent, tagged as part of the llm-platform.
resource "kubernetes_namespace_v1" "agent" {
  metadata {
    name = var.namespace

    labels = {
      "app.kubernetes.io/part-of" = "llm-platform"
    }
  }
}
| 9 | |||
# Single-replica agent Deployment. The LLM endpoint and model name are injected
# via OPENAI_BASE_URL / MODEL environment variables.
resource "kubernetes_deployment_v1" "agent" {
  metadata {
    name      = "agent"
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
    labels    = { app = "agent" }
  }

  spec {
    replicas = 1

    selector {
      match_labels = { app = "agent" }
    }

    template {
      metadata {
        labels = { app = "agent" }

        annotations = {
          # Bounce the pod when agent.py changes on disk, even if image tag is unchanged.
          "checksum/code" = substr(sha256(file(var.agent_source_path)), 0, 16)
        }
      }

      spec {
        container {
          name              = "agent"
          image             = var.agent_image
          image_pull_policy = "IfNotPresent"

          env {
            name  = "OPENAI_BASE_URL"
            value = var.llm_service_url
          }

          env {
            name  = "MODEL"
            value = var.model_alias
          }

          port {
            name           = "http"
            container_port = 8001
          }

          # Probes hit the same /health endpoint; readiness polls fast with a
          # generous failure_threshold, liveness is deliberately lazier.
          readiness_probe {
            http_get {
              path = "/health"
              port = "http"
            }
            initial_delay_seconds = 3
            period_seconds        = 5
            failure_threshold     = 10
          }

          liveness_probe {
            http_get {
              path = "/health"
              port = "http"
            }
            initial_delay_seconds = 30
            period_seconds        = 30
          }

          resources {
            requests = { cpu = "100m", memory = "128Mi" }
            limits   = { cpu = "1", memory = "512Mi" }
          }
        }
      }
    }
  }
}
| 72 | |||
# ClusterIP Service in front of the agent pods; exposes the named "http" port.
resource "kubernetes_service_v1" "agent" {
  metadata {
    name      = "agent"
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
    labels    = { app = "agent" }
  }

  spec {
    selector = { app = "agent" }

    port {
      name        = "http"
      port        = 8001
      target_port = "http"
    }
  }
}
| 88 | |||
# Ingress routing var.ingress_host to the agent Service.
resource "kubernetes_ingress_v1" "agent" {
  metadata {
    name      = "agent"
    namespace = kubernetes_namespace_v1.agent.metadata[0].name
  }
  spec {
    ingress_class_name = var.ingress_class
    rule {
      host = var.ingress_host
      http {
        path {
          path      = "/"
          path_type = "Prefix"
          backend {
            service {
              name = kubernetes_service_v1.agent.metadata[0].name
              port {
                # Reference the Service's named port instead of repeating the
                # literal 8001, so the port number lives in exactly one place.
                name = "http"
              }
            }
          }
        }
      }
    }
  }
}
diff --git a/terraform/modules/agent/outputs.tf b/terraform/modules/agent/outputs.tf new file mode 100644 index 0000000..ac9932b --- /dev/null +++ b/terraform/modules/agent/outputs.tf | |||
| @@ -0,0 +1,11 @@ | |||
output "service_dns" {
  value       = "${kubernetes_service_v1.agent.metadata[0].name}.${kubernetes_namespace_v1.agent.metadata[0].name}.svc.cluster.local"
  description = "In-cluster DNS name for the agent Service."
}

output "ingress_host" {
  value       = var.ingress_host
  description = "Hostname served by the agent Ingress."
}

output "namespace" {
  value       = kubernetes_namespace_v1.agent.metadata[0].name
  description = "Namespace the agent was deployed into."
}
diff --git a/terraform/modules/agent/variables.tf b/terraform/modules/agent/variables.tf new file mode 100644 index 0000000..6f525ee --- /dev/null +++ b/terraform/modules/agent/variables.tf | |||
| @@ -0,0 +1,33 @@ | |||
variable "namespace" {
  type        = string
  description = "Kubernetes namespace to deploy the agent into."
}

variable "agent_source_path" {
  type        = string
  description = "Absolute path to agent/agent.py. Used only to bounce pods on code change."
}

variable "agent_image" {
  type        = string
  default     = "localhost/agent:0.1.0"
  description = "Pre-built agent image. Must be loaded into kind with `make agent-build`."
}

variable "llm_service_url" {
  type        = string
  description = "OpenAI-compatible base URL, e.g. http://llm-llm-app.llm-prod.svc.cluster.local:8000/v1"
}

variable "model_alias" {
  type        = string
  default     = "Qwen2.5-1.5B-Instruct"
  description = "Model name the agent sends in OpenAI requests (injected as the MODEL env var)."
}

variable "ingress_host" {
  type        = string
  description = "Hostname for the agent Ingress rule."
}

variable "ingress_class" {
  type        = string
  default     = "nginx"
  description = "IngressClass name."
}
diff --git a/terraform/modules/agent/versions.tf b/terraform/modules/agent/versions.tf new file mode 100644 index 0000000..4242705 --- /dev/null +++ b/terraform/modules/agent/versions.tf | |||
| @@ -0,0 +1,5 @@ | |||
terraform {
  required_providers {
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.31"
    }
  }
}
diff --git a/terraform/modules/llm/main.tf b/terraform/modules/llm/main.tf new file mode 100644 index 0000000..cd22019 --- /dev/null +++ b/terraform/modules/llm/main.tf | |||
| @@ -0,0 +1,99 @@ | |||
# Namespace for the LLM release, tagged as part of the llm-platform.
resource "kubernetes_namespace_v1" "this" {
  metadata {
    name = var.namespace

    labels = {
      "app.kubernetes.io/part-of" = "llm-platform"
    }
  }
}
| 9 | |||
# Optional HPA on a custom per-pod metric; only created when var.hpa.enabled.
resource "kubernetes_horizontal_pod_autoscaler_v2" "llm" {
  count = var.hpa.enabled ? 1 : 0

  metadata {
    name      = "${var.release_name}-llm-app"
    namespace = kubernetes_namespace_v1.this.metadata[0].name
  }

  spec {
    scale_target_ref {
      api_version = "apps/v1"
      kind        = "Deployment"
      # Assumed to match the Deployment name produced by the chart — confirm
      # if the chart's naming template changes.
      name = "${var.release_name}-llm-app"
    }

    min_replicas = var.hpa.min_replicas
    max_replicas = var.hpa.max_replicas

    metric {
      type = "Pods"

      pods {
        metric {
          name = var.hpa.metric_name
        }
        target {
          type          = "AverageValue"
          average_value = var.hpa.target_average_value
        }
      }
    }
  }

  # The target Deployment must exist before the HPA can bind to it.
  depends_on = [helm_release.llm]
}
| 42 | |||
# Deploys the local llm-app chart with values assembled from module inputs.
resource "helm_release" "llm" {
  name             = var.release_name
  chart            = var.chart_path
  namespace        = kubernetes_namespace_v1.this.metadata[0].name
  create_namespace = false
  atomic           = false
  wait             = true
  timeout          = 1800 # seconds (30 minutes)

  values = [
    yamlencode({
      replicaCount = var.replicas

      image = {
        repository = var.image_repository
        tag        = var.image_tag
        digest     = var.image_digest
        pullPolicy = "IfNotPresent"
      }

      model = {
        name        = var.model_name
        alias       = var.model_alias
        maxModelLen = var.max_model_len
        dtype       = var.dtype
      }

      server = {
        port       = 8000
        ompThreads = var.omp_threads
        extraArgs  = var.extra_args
      }

      resources = var.resources

      ingress = {
        enabled   = true
        className = var.ingress_class
        host      = var.ingress_host
      }

      monitoring = {
        serviceMonitor = {
          enabled  = true
          interval = "15s"
          labels = {
            release = var.service_monitor_release_label
          }
        }
      }

      modelCache = {
        sizeLimit = var.model_cache_size
      }
    }),
  ]
}
diff --git a/terraform/modules/llm/outputs.tf b/terraform/modules/llm/outputs.tf new file mode 100644 index 0000000..a953e73 --- /dev/null +++ b/terraform/modules/llm/outputs.tf | |||
| @@ -0,0 +1,12 @@ | |||
output "service_dns" {
  value       = "${var.release_name}-llm-app.${var.namespace}.svc.cluster.local"
  description = "In-cluster DNS name for the LLM Service."
}

output "ingress_host" {
  value       = var.ingress_host
  description = "Hostname served by the chart's Ingress."
}

output "namespace" {
  value       = kubernetes_namespace_v1.this.metadata[0].name
  description = "Namespace the release was deployed into."
}
diff --git a/terraform/modules/llm/variables.tf b/terraform/modules/llm/variables.tf new file mode 100644 index 0000000..3a7d8f7 --- /dev/null +++ b/terraform/modules/llm/variables.tf | |||
| @@ -0,0 +1,112 @@ | |||
# --- Release identity --------------------------------------------------------

variable "release_name" {
  type        = string
  description = "Helm release name."
}

variable "namespace" {
  type        = string
  description = "Kubernetes namespace to deploy into."
}

variable "chart_path" {
  type        = string
  description = "Path to the local llm-app chart."
}
| 15 | |||
variable "replicas" {
  type        = number
  default     = 1
  description = "Number of LLM pod replicas."

  validation {
    condition     = var.replicas >= 1
    error_message = "replicas must be at least 1."
  }
}
| 20 | |||
# --- Model selection ---------------------------------------------------------

variable "model_name" {
  type        = string
  description = "HuggingFace repo id, passed as vLLM model_tag (positional)."
}

variable "model_alias" {
  type        = string
  description = "Value clients pass in the OpenAI 'model' field (maps to --served-model-name)."
}
| 30 | |||
variable "max_model_len" {
  type        = number
  default     = 2048
  description = "Maximum context length in tokens (prompt + generation), forwarded to vLLM."

  validation {
    condition     = var.max_model_len > 0
    error_message = "max_model_len must be a positive number of tokens."
  }
}

variable "dtype" {
  type        = string
  default     = "bfloat16"
  # Not validated against a fixed list: the set of dtypes vLLM accepts
  # depends on the vLLM version/backend in use.
  description = "Weight/activation dtype forwarded to vLLM (e.g. bfloat16, float16, auto)."
}
| 40 | |||
variable "omp_threads" {
  type        = number
  default     = 0
  description = "OMP_NUM_THREADS for vLLM CPU backend. 0 = autodetect."
}

variable "extra_args" {
  type        = list(string)
  default     = []
  description = "Extra CLI args passed to `vllm serve`, appended after the stock set."
}
| 52 | |||
variable "resources" {
  description = "CPU/memory requests and limits for the LLM container."
  type = object({
    requests = object({ cpu = string, memory = string })
    limits   = object({ cpu = string, memory = string })
  })
}

variable "ingress_host" {
  type        = string
  description = "Hostname for the LLM Ingress rule."
}
| 63 | |||
variable "ingress_class" {
  type        = string
  default     = "nginx"
  description = "IngressClass name used by the LLM Ingress."
}

variable "image_repository" {
  type        = string
  default     = "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo"
  description = "Container image repository for the vLLM server."
}
| 73 | |||
variable "image_tag" {
  type        = string
  default     = "latest"
  description = "Used only when image_digest is empty."
}

variable "image_digest" {
  type        = string
  default     = ""
  description = "Optional sha256:abc... content-addressable digest. Takes precedence over image_tag."

  # Fail fast at plan time on malformed digests instead of letting the
  # container runtime reject the image reference at deploy time.
  validation {
    condition     = var.image_digest == "" || can(regex("^sha256:[0-9a-f]{64}$", var.image_digest))
    error_message = "image_digest must be empty or of the form sha256:<64 lowercase hex characters>."
  }
}
| 85 | |||
variable "service_monitor_release_label" {
  type        = string
  default     = "kube-prometheus-stack"
  description = "Must match the release label the Prometheus Operator selects on."
}

variable "model_cache_size" {
  type        = string
  default     = "10Gi"
  description = "Size limit for the model cache volume (Kubernetes quantity string, e.g. 10Gi)."
}
| 96 | |||
variable "hpa" {
  description = "HorizontalPodAutoscaler settings driven by a custom per-pod metric."
  type = object({
    enabled              = bool
    min_replicas         = number
    max_replicas         = number
    metric_name          = string
    target_average_value = string
  })
  default = {
    enabled              = false
    min_replicas         = 1
    max_replicas         = 3
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }

  # Reject nonsensical bounds at plan time rather than letting the HPA
  # object fail admission in-cluster.
  validation {
    condition     = var.hpa.min_replicas >= 1 && var.hpa.min_replicas <= var.hpa.max_replicas
    error_message = "hpa.min_replicas must be >= 1 and <= hpa.max_replicas."
  }
}
diff --git a/terraform/modules/observability/main.tf b/terraform/modules/observability/main.tf new file mode 100644 index 0000000..2f88f2e --- /dev/null +++ b/terraform/modules/observability/main.tf | |||
| @@ -0,0 +1,156 @@ | |||
# Namespace for the monitoring stack (kube-prometheus-stack, prometheus-adapter).
resource "kubernetes_namespace_v1" "monitoring" {
  metadata {
    name = var.namespace
  }
}
| 6 | |||
# Namespace for the ingress controller.
# NOTE(review): the name is hardcoded rather than variable-driven — confirm
# nothing else expects a different namespace before parameterizing it.
resource "kubernetes_namespace_v1" "ingress" {
  metadata {
    name = "ingress-nginx"
  }
}
| 12 | |||
# ingress-nginx controller, pinned to nodes labeled ingress-ready=true and
# exposing the controller via hostPorts 80/443.
# NOTE(review): this layout presumably matches a kind cluster whose node has
# the ingress-ready label and host port mappings — verify against the kind
# cluster config.
resource "helm_release" "ingress_nginx" {
  name       = "ingress-nginx"
  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = var.ingress_nginx_version
  namespace  = kubernetes_namespace_v1.ingress.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      controller = {
        # Bind 80/443 directly on the node instead of relying on a
        # LoadBalancer implementation.
        hostPort = { enabled = true, ports = { http = 80, https = 443 } }
        service  = { type = "NodePort" }
        nodeSelector = {
          "ingress-ready" = "true"
        }
        # Tolerate control-plane taints so the controller can schedule on a
        # single-node (control-plane-only) cluster. "master" is kept for
        # older node taint spellings.
        tolerations = [
          { key = "node-role.kubernetes.io/control-plane", operator = "Equal", effect = "NoSchedule" },
          { key = "node-role.kubernetes.io/master", operator = "Equal", effect = "NoSchedule" },
        ]
        # No cloud LB whose status could be published to Ingress objects.
        publishService = { enabled = false }
        admissionWebhooks = { enabled = false } # speeds up kind cluster installs
        # Cap worker_processes so nginx doesn't try to spawn 14 threads under
        # CPU pressure from vLLM cold-starts. With auto (= one per CPU) it
        # sometimes hits pthread EAGAIN and workers die without respawn.
        config = {
          "worker-processes" = "4"
        }
      }
    }),
  ]
}
| 46 | |||
# kube-prometheus-stack: Prometheus Operator + Prometheus + Grafana.
# fullnameOverride = "kps" shortens generated resource names, so the
# Prometheus service is "kps-prometheus" (referenced by prometheus-adapter
# and the module outputs).
resource "helm_release" "kps" {
  name       = "kube-prometheus-stack"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = var.kps_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 600

  values = [
    yamlencode({
      fullnameOverride = "kps"
      prometheus = {
        prometheusSpec = {
          # Let Prometheus pick up ServiceMonitors from any namespace matching
          # the release=kube-prometheus-stack label (the chart's default).
          serviceMonitorSelectorNilUsesHelmValues = false
          podMonitorSelectorNilUsesHelmValues = false
          ruleSelectorNilUsesHelmValues = false
          retention = "2d"
          resources = {
            requests = { cpu = "100m", memory = "400Mi" }
            limits = { memory = "1Gi" }
          }
        }
        ingress = {
          enabled = true
          ingressClassName = "nginx"
          hosts = ["prom.localtest.me"]
        }
      }
      alertmanager = { enabled = false }
      grafana = {
        # NOTE(review): the password is rendered into Helm values and thus
        # stored in Terraform state — acceptable for local dev only.
        adminPassword = var.grafana_admin_password
        # Auto-load dashboards from ConfigMaps labeled grafana_dashboard=1
        # in any namespace.
        sidecar = {
          dashboards = {
            enabled = true
            label = "grafana_dashboard"
            labelValue = "1"
            searchNamespace = "ALL"
          }
        }
        service = { type = "ClusterIP" }
        ingress = {
          enabled = true
          ingressClassName = "nginx"
          hosts = ["grafana.localtest.me"]
        }
      }
    }),
  ]
}
| 99 | |||
# prometheus-adapter: exposes selected vLLM Prometheus series through the
# Kubernetes custom metrics API so an HPA can scale on them.
resource "helm_release" "prometheus_adapter" {
  name       = "prometheus-adapter"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "prometheus-adapter"
  version    = var.prometheus_adapter_version
  namespace  = kubernetes_namespace_v1.monitoring.metadata[0].name
  wait       = true
  timeout    = 300

  values = [
    yamlencode({
      prometheus = {
        # Points at the kube-prometheus-stack Prometheus ("kps" fullnameOverride).
        url  = "http://kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc"
        port = 9090
      }
      rules = {
        default = false
        # Both vLLM gauges share the same rule shape, so generate the rules
        # from a list instead of hand-copying the mapping twice:
        #   vllm:num_requests_running — in-flight requests per pod (primary
        #                               autoscaling signal)
        #   vllm:num_requests_waiting — queued requests per pod (alternative
        #                               scale signal)
        custom = [
          for series in [
            "vllm:num_requests_running",
            "vllm:num_requests_waiting",
          ] : {
            seriesQuery = "${series}{namespace!=\"\",pod!=\"\"}"
            resources = {
              overrides = {
                namespace = { resource = "namespace" }
                pod       = { resource = "pod" }
              }
            }
            name = {
              # Exact-match rename; exposed under the same name it has in
              # Prometheus. (":" is not a regex metacharacter, so no escaping
              # is needed.)
              matches = "^${series}$"
              as      = series
            }
            metricsQuery = "avg(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
          }
        ]
      }
    }),
  ]

  # Prometheus must exist before the adapter has anything to query.
  depends_on = [helm_release.kps]
}
| 156 | |||
diff --git a/terraform/modules/observability/outputs.tf b/terraform/modules/observability/outputs.tf new file mode 100644 index 0000000..06a507d --- /dev/null +++ b/terraform/modules/observability/outputs.tf | |||
| @@ -0,0 +1,11 @@ | |||
output "namespace" {
  value       = kubernetes_namespace_v1.monitoring.metadata[0].name
  description = "Namespace the monitoring stack is deployed into."
}

# NOTE(review): the Grafana subchart derives its Service name from the Helm
# release name ("kube-prometheus-stack"), not from the parent chart's
# fullnameOverride ("kps") — hence the asymmetry with prometheus_service
# below. Verify the actual Service names if either release setting changes.
output "grafana_service" {
  value       = "kube-prometheus-stack-grafana.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
  description = "In-cluster DNS name of the Grafana Service."
}

output "prometheus_service" {
  value       = "kps-prometheus.${kubernetes_namespace_v1.monitoring.metadata[0].name}.svc.cluster.local"
  description = "In-cluster DNS name of the Prometheus Service."
}
diff --git a/terraform/modules/observability/variables.tf b/terraform/modules/observability/variables.tf new file mode 100644 index 0000000..6aeaca3 --- /dev/null +++ b/terraform/modules/observability/variables.tf | |||
| @@ -0,0 +1,27 @@ | |||
variable "namespace" {
  type        = string
  default     = "monitoring"
  description = "Namespace to install the monitoring stack into."
}

variable "kps_version" {
  type        = string
  default     = "65.5.1"
  description = "kube-prometheus-stack chart version."
}

variable "ingress_nginx_version" {
  type        = string
  default     = "4.11.3"
  description = "ingress-nginx chart version."
}

variable "grafana_admin_password" {
  type        = string
  default     = "admin"
  sensitive   = true
  # The insecure default is deliberate: this module targets a throwaway
  # local cluster. The value is still stored in Terraform state.
  description = "Grafana admin password (default suitable for local dev only)."
}

variable "prometheus_adapter_version" {
  type        = string
  default     = "4.11.0"
  description = "prometheus-adapter chart version."
}
diff --git a/tests/smoke.sh b/tests/smoke.sh new file mode 100755 index 0000000..a5ef23d --- /dev/null +++ b/tests/smoke.sh | |||
| @@ -0,0 +1,38 @@ | |||
#!/usr/bin/env bash
# Smoke test for the OpenAI-compatible LLM endpoint.
#
# Usage:
#   ENDPOINT=http://llm.dev.localtest.me:8080 MODEL=Qwen2.5-0.5B-Instruct ./tests/smoke.sh
#
# Env vars:
#   ENDPOINT  base URL of the OpenAI-compatible server
#   MODEL     served model name (value clients pass in the "model" field)
#   TIMEOUT   per-request curl timeout, seconds
set -euo pipefail

ENDPOINT="${ENDPOINT:-http://llm.dev.localtest.me:8080}"
MODEL="${MODEL:-Qwen2.5-0.5B-Instruct}"
TIMEOUT="${TIMEOUT:-120}"

say()  { printf '\033[1;34m==>\033[0m %s\n' "$*"; }
fail() { printf '\033[1;31mFAIL\033[0m %s\n' "$*" >&2; exit 1; }

say "Endpoint: $ENDPOINT"
say "Model:    $MODEL"

say "GET /v1/models"
models_json="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/models")" || fail "/v1/models unreachable"
# -F: match the model name literally — the dot in e.g. "Qwen2.5" is a regex
# metacharacter and would otherwise match any byte. -- guards leading dashes.
grep -Fq -- "$MODEL" <<<"$models_json" || fail "/v1/models does not list $MODEL"

say "POST /v1/chat/completions"
# Build the request body with json.dumps so a model name containing quotes or
# backslashes cannot break (or inject into) the JSON payload.
payload="$(MODEL="$MODEL" python3 -c '
import json, os
print(json.dumps({
    "model": os.environ["MODEL"],
    "messages": [{"role": "user", "content": "Reply with the single word: pong"}],
    "max_tokens": 8,
    "temperature": 0,
}))
')"
resp="$(curl -fsS --max-time "$TIMEOUT" "$ENDPOINT/v1/chat/completions" \
  -H 'Content-Type: application/json' \
  -d "$payload")" || fail "chat completion request failed"

content="$(python3 -c 'import sys, json; print(json.load(sys.stdin)["choices"][0]["message"]["content"])' <<<"$resp")"
# printf, not echo: the reply is arbitrary model output (could start with -n).
printf 'model reply: %s\n' "$content"
[[ -n "$content" ]] || fail "empty completion content"

say "OK"
