=============================================================================
Local K8s LLM demo — kind + OpenTofu + vLLM
=============================================================================

sudo dnf install -y podman git make jq curl tar

# kind v0.31.0 (node image: kindest/node:v1.35.0, pinned by digest in cluster/kind-config.yaml)
cluster/kind-config.yaml
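
# kind itself isn't packaged in dnf; a likely install path, assuming upstream
# keeps its release URL scheme (kind.sigs.k8s.io/dl/<version>/kind-linux-amd64):
curl -fsSLo /tmp/kind https://kind.sigs.k8s.io/dl/v0.31.0/kind-linux-amd64
sudo install -m 0755 /tmp/kind /usr/local/bin/kind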

# kubectl v1.36.0
curl -fsSLo /tmp/kubectl \
  https://dl.k8s.io/release/v1.36.0/bin/linux/amd64/kubectl
sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl

# Helm 4.1.4
terraform/modules/observability/variables.tf

# OpenTofu 1.11.6
terraform/envs/{dev,prod,bootstrap}/versions.tf


# kind runs each k8s "node" as a long-lived podman container. The default
# pids_limit = 2048 causes ingress-nginx to hit pthread EAGAIN once the control
# plane warms up. Raise it once, then restart podman:
sudo mkdir -p /etc/containers/containers.conf.d
printf '[containers]\npids_limit = 0\n' \
  | sudo tee /etc/containers/containers.conf.d/99-kind-pids.conf
sudo systemctl restart podman.socket podman 2>/dev/null || true
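
# Sanity check: with pids_limit = 0 the container's pids controller should read
# "max" (assumes cgroup v2 and podman's default cgroup namespace):
podman run --rm docker.io/library/alpine cat /sys/fs/cgroup/pids.max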


make help   # lists self-explanatory commands to copy and paste


# Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README.

http://llm.dev.localtest.me:8080
$ curl http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq

http://llm.prod.localtest.me:8080
$ curl http://llm.prod.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq
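
vLLM also serves the standard /v1/models route; listing model IDs is a quick liveness check (assumes the usual OpenAI list response shape):
$ curl -s http://llm.dev.localtest.me:8080/v1/models | jq '.data[].id'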


# Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization.

Fire 10 chat requests against dev to populate metrics:
$ for i in {1..10}; do curl -s http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' >/dev/null & done; wait

Raw /metrics (vLLM exposes these natively):
$ curl -s http://llm.dev.localtest.me:8080/metrics | grep '^vllm:' | head

Request latency p95 (seconds) — via Prometheus:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(vllm:e2e_request_latency_seconds_bucket[5m])))' | jq .data.result

CPU cores in use per vLLM pod (CPU-only inference — no GPU on this stack):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="llm-dev",pod=~"llm-llm-app.*",container!="",container!="POD"}[5m]))' | jq .data.result

In-flight requests per pod (the same metric the prod HPA scales on):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (namespace, pod) (vllm:num_requests_running)' | jq .data.result
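
Generated-token throughput (tokens/s), assuming vLLM's stock counter name vllm:generation_tokens_total:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum(rate(vllm:generation_tokens_total[5m]))' | jq .data.result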


# stretch 1
deployed /agent/agent.py in a container; it uses the LLM backend to calculate the product of two numbers
$ curl http://agent.localtest.me:8080/ask -H 'Content-Type: application/json' -d '{"question":"what is 17 * 23?"}'   # run `make up-agent` first

# stretch 2
horizontal pod autoscaling on the total number of in-flight requests, capped at 3 pods
terminal 1:
$ (trap 'kill 0' INT; for i in {1..5}; do \
    curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":32}' \
      >/dev/null &
  done; wait)
terminal 2:
$ kubectl -n llm-prod get hpa -w   # `watch -n` refreshes faster than -w
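or the faster variant suggested above:
$ watch -n1 kubectl -n llm-prod get hpa,pods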

# stretch 3
image pinning — prod pins repo@sha256:<digest> (resolved via scripts/resolve-digests.sh and wired in through terraform/envs/prod/main.tf), while dev tracks :latest. The chart prefers the digest over the tag when both are set.
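
One plausible by-hand equivalent of the resolve step, using the podman already installed (the image ref below is a placeholder):
$ podman pull docker.io/vllm/vllm-openai:latest
$ podman image inspect --format '{{.Digest}}' docker.io/vllm/vllm-openai:latest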
| 81 | |||
| 82 | #stretch 4 | ||
| 83 | smoke test | ||
| 84 | charts/llm-app/templates/smoketest-job.yaml reruns after every install or upgrade | ||
| 85 | just checks if the response has a content field, no functional thingummy, then passes it | ||
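
The same check by hand (jq -e exits non-zero when the field is missing or empty; the Job's actual script may differ):
$ curl -s http://llm.dev.localtest.me:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' \
  | jq -e '.choices[0].message.content | length > 0'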




All stretch goals are done except the Atlantis-style Git-driven plan/apply workflow; it was skipped because it needs either a local Atlantis install or a GitHub-hosted repo.
