=============================================================================
Local K8s LLM demo — kind + OpenTofu + vLLM
=============================================================================

sudo dnf install -y podman git make jq curl tar

# kind v0.31.0 (node image: kindest/node:v1.35.0, pinned by digest in cluster/kind-config.yaml)
cluster/kind-config.yaml
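
# kind itself isn't packaged in dnf; a likely install path, assuming upstream
# keeps its release URL scheme (kind.sigs.k8s.io/dl/<version>/kind-linux-amd64):
curl -fsSLo /tmp/kind https://kind.sigs.k8s.io/dl/v0.31.0/kind-linux-amd64
sudo install -m 0755 /tmp/kind /usr/local/bin/kind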

# kubectl v1.36.0
curl -fsSLo /tmp/kubectl \
  https://dl.k8s.io/release/v1.36.0/bin/linux/amd64/kubectl
sudo install -m 0755 /tmp/kubectl /usr/local/bin/kubectl

# Helm 4.1.4
terraform/modules/observability/variables.tf

# OpenTofu 1.11.6
terraform/envs/{dev,prod,bootstrap}/versions.tf


# kind runs each k8s "node" as a long-lived podman container. The default
# pids_limit = 2048 causes ingress-nginx to hit pthread EAGAIN once the control
# plane warms up. Raise it once, then restart podman:
sudo mkdir -p /etc/containers/containers.conf.d
printf '[containers]\npids_limit = 0\n' \
  | sudo tee /etc/containers/containers.conf.d/99-kind-pids.conf
sudo systemctl restart podman.socket podman 2>/dev/null || true
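
# Sanity check: with pids_limit = 0 the container's pids controller should read
# "max" (assumes cgroup v2 and podman's default cgroup namespace):
podman run --rm docker.io/library/alpine cat /sys/fs/cgroup/pids.max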


make help   # lists self-explanatory commands to copy and paste


# Expose an OpenAI-compatible endpoint through a K8s Service / Ingress and prove it works with a `curl` example in the README.

http://llm.dev.localtest.me:8080
$ curl http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq

http://llm.prod.localtest.me:8080
$ curl http://llm.prod.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' | jq
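
vLLM also serves the standard /v1/models route; listing model IDs is a quick liveness check (assumes the usual OpenAI list response shape):
$ curl -s http://llm.dev.localtest.me:8080/v1/models | jq '.data[].id'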


# Observability: scrape `/metrics` from the inference pod with Prometheus and show at least one dashboard or PromQL query for request latency and GPU/CPU utilization.

Fire 10 chat requests against dev to populate metrics:
$ for i in {1..10}; do curl -s http://llm.dev.localtest.me:8080/v1/chat/completions -H 'Content-Type: application/json' -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' >/dev/null & done; wait

Raw /metrics (vLLM exposes these natively):
$ curl -s http://llm.dev.localtest.me:8080/metrics | grep '^vllm:' | head

Request latency p95 (seconds) — via Prometheus:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=histogram_quantile(0.95, sum by (le) (rate(vllm:e2e_request_latency_seconds_bucket[5m])))' | jq .data.result

CPU cores in use per vLLM pod (CPU-only inference — no GPU on this stack):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="llm-dev",pod=~"llm-llm-app.*",container!="",container!="POD"}[5m]))' | jq .data.result

In-flight requests per pod (the same metric the prod HPA scales on):
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum by (namespace, pod) (vllm:num_requests_running)' | jq .data.result
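
Generated-token throughput (tokens/s), assuming vLLM's stock counter name vllm:generation_tokens_total:
$ curl -G http://prom.localtest.me:8080/api/v1/query --data-urlencode 'query=sum(rate(vllm:generation_tokens_total[5m]))' | jq .data.result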


# stretch 1
deployed /agent/agent.py in a container; it uses the LLM backend to calculate the product of two numbers
$ curl http://agent.localtest.me:8080/ask -H 'Content-Type: application/json' -d '{"question":"what is 17 * 23?"}'   # run `make up-agent` first

# stretch 2
horizontal pod autoscaling on the total number of in-flight requests, capped at 3 pods
terminal 1:
$ (trap 'kill 0' INT; for i in {1..5}; do \
    curl -s http://llm.prod.localtest.me:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":32}' \
      >/dev/null &
  done; wait)
terminal 2:
$ kubectl -n llm-prod get hpa -w   # `watch -n` refreshes faster than -w
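or the faster variant suggested above:
$ watch -n1 kubectl -n llm-prod get hpa,pods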

# stretch 3
image pinning — prod pins repo@sha256:<digest> (resolved via scripts/resolve-digests.sh and wired in through terraform/envs/prod/main.tf), while dev tracks :latest. The chart prefers the digest over the tag when both are set.
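
One plausible by-hand equivalent of the resolve step, using the podman already installed (the image ref below is a placeholder):
$ podman pull docker.io/vllm/vllm-openai:latest
$ podman image inspect --format '{{.Digest}}' docker.io/vllm/vllm-openai:latest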
| 81 | |||
| 82 | #stretch 4 | ||
| 83 | smoke test | ||
| 84 | charts/llm-app/templates/smoketest-job.yaml reruns after every install or upgrade | ||
| 85 | just checks if the response has a content field, no functional thingummy, then passes it | ||
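
The same check by hand (jq -e exits non-zero when the field is missing or empty; the Job's actual script may differ):
$ curl -s http://llm.dev.localtest.me:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"hi"}],"max_tokens":4}' \
  | jq -e '.choices[0].message.content | length > 0'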




All stretch goals are done except the Atlantis-style Git-driven plan/apply workflow; it was skipped because it needs either a local Atlantis install or a GitHub-hosted repo.
