# Dev environment: deploys the LLM Helm chart to a local/dev Kubernetes
# cluster, addressed via the kubeconfig/context supplied in variables.

terraform {
  required_providers {
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.0"
    }
    # Pinned to the 2.x series: the nested `kubernetes { ... }` block syntax
    # used in the helm provider below was replaced by an attribute
    # (`kubernetes = { ... }`) in helm provider 3.0, so an unpinned init
    # could pull a major version this config does not parse under.
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.0"
    }
  }
}

provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}

provider "helm" {
  # Same cluster credentials as the kubernetes provider above.
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}

locals {
  # Environment name; folded into the namespace below.
  env = "dev"
}

module "llm" {
  source = "../../modules/llm"

  release_name  = "llm"
  namespace     = "llm-${local.env}"
  chart_path    = var.chart_path
  replicas      = 2
  model_name    = "Qwen/Qwen2.5-0.5B-Instruct"
  model_alias   = "Qwen2.5-0.5B-Instruct"
  max_model_len = 2048
  dtype         = "bfloat16"
  omp_threads   = 4

  resources = {
    requests = { cpu = "1", memory = "2Gi" }
    limits   = { cpu = "4", memory = "6Gi" }
  }

  ingress_host = "llm.dev.localtest.me"

  # NOTE(review): "latest" is a mutable tag — deploys are not reproducible and
  # rollbacks are harder to reason about. Consider a versioned tag or digest.
  image_tag = "latest"
}

output "ingress_host" {
  value = module.llm.ingress_host
}

output "service_dns" {
  value = module.llm.service_dns
}

# Ready-to-paste smoke-test request against the deployed ingress.
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}