From d3e770254de0bb301815ca87257c8b1a357d06c4 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Sun, 26 Apr 2026 21:02:47 +0800
Subject: hehe
---
 README.txt | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 README.txt

diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..4ce7a8e
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,90 @@

=============================================================================
  Local K8s LLM demo — kind + OpenTofu + vLLM
=============================================================================

sudo dnf install -y podman git make jq curl tar

# kind v0.31.0 (node image: kindest/node:v1.35.0, pinned by digest in cluster/kind-config.yaml)
cluster/kind-config.yaml

# kubectl v1.36.0
curl -fsSLo /tmp/kubectl \
  https://dl.k8s.io/release/v1.36.0/bin/linux/amd64/kubectl
sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl

# Helm 4.1.4
terraform/modules/observability/variables.tf

# OpenTofu 1.11.6
terraform/envs/{dev,prod,bootstrap}/versions.tf


# kind runs each k8s "node" as a long-lived podman container. The default
# pids_limit = 2048 causes ingress-nginx to hit pthread EAGAIN once the
# control plane warms up. Raise the limit once, then restart podman:
sudo mkdir -p /etc/containers/containers.conf.d
printf '[containers]\npids_limit = 0\n' \
  | sudo tee /etc/containers/containers.conf.d/99-kind-pids.conf
sudo systemctl restart podman.socket podman 2>/dev/null || true


make help   # lists the available targets, with copy-and-paste-ready commands for everything below


# Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and
# prove it works with a `curl` example in the README.

http://llm.dev.localtest.me:8080
$ curl http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq

http://llm.prod.localtest.me:8080
$ curl http://llm.prod.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq


# Observability: scrape `/metrics` from the inference pod with Prometheus and
# show at least one dashboard or PromQL query for request latency and GPU/CPU
# utilization.
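
The observability module presumably wires up the scraping already; purely as an
illustration, a hand-rolled equivalent would be a PodMonitor roughly like the
sketch below. It assumes the Prometheus Operator CRDs (e.g. from
kube-prometheus-stack) are installed, and the resource name, pod label, and
port name are guesses for illustration, not values taken from the chart.
$ kubectl apply -f - <<'EOF'
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: llm-app-metrics                  # hypothetical name, illustration only
  namespace: llm-dev
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: llm-app    # assumed pod label
  podMetricsEndpoints:
    - port: http                         # assumed port name; vLLM serves /metrics on its API port
      path: /metrics
EOF
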

Fire 10 chat requests against dev to populate metrics
$ for i in {1..10}; do curl -s http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' >/dev/null & done; wait

Raw /metrics (vLLM exposes these natively)
$ curl -s http://llm.dev.localtest.me:8080/metrics | grep '^vllm:' | head

Request latency p95 (seconds) — via Prometheus
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(vllm:e2e_request_latency_seconds_bucket[5m])))' | jq .data.result

CPU cores in use per vLLM pod (CPU-only inference — no GPU on this stack)
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="llm-dev",pod=~"llm-llm-app.*",container!="",container!="POD"}[5m]))' | jq .data.result

In-flight requests per pod (the same metric the prod HPA scales on)
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (namespace, pod) (vllm:num_requests_running)' | jq .data.result


# stretch 1
Deployed /agent/agent.py in a container; it calls the backend LLM to calculate the product of two numbers.
$ curl http://agent.localtest.me:8080/ask -H 'Content-Type: application/json' -d '{"question":"what is 17 * 23?"}'   # run make up-agent first

# stretch 2
Horizontal pod autoscaling driven by the total number of in-flight requests, capped at 3 pods.
terminal 1 (generate load):
$ (trap 'kill 0' INT; for i in {1..5}; do \
    curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":32}' \
      >/dev/null &
  done; wait)
terminal 2 (watch the HPA):
$ kubectl -n llm-prod get hpa -w   # or: watch -n 1 kubectl -n llm-prod get hpa   (refreshes faster than -w)

# stretch 3
Image pinning — prod pins images by digest, repo@sha256:<digest>, resolved via scripts/resolve-digests.sh:
terraform/envs/prod/main.tf
dev tracks :latest. The chart prefers the digest over the tag when both are set.

# stretch 4
Smoke test
charts/llm-app/templates/smoketest-job.yaml reruns after every install or upgrade.
It only checks that the response contains a content field (no functional assertions) and passes on that, roughly the one-liner sketched at the end of this file.


All stretch goals are done except the Atlantis-style Git-driven apply workflow, since that needs either a fiddly local Git setup or GitHub.
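
For reference, the stretch-4 check boils down to something like the one-liner
below. This is only a sketch, not the actual Job (that lives in
charts/llm-app/templates/smoketest-job.yaml); it reuses the dev endpoint and
model from the examples above, whereas the in-cluster Job presumably talks to
the Service directly. The .choices[0].message.content path is the standard
OpenAI-style response field shown in the curl examples earlier.
$ curl -s http://llm.dev.localtest.me:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"ping"}],"max_tokens":4}' \
    | jq -e '.choices[0].message.content' >/dev/null \
    && echo "smoke: OK" || echo "smoke: FAIL"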