# Dev environment: deploys the LLM Helm chart to a local/dev Kubernetes
# cluster, addressed via the kubeconfig/context supplied in variables.

terraform {
  required_providers {
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.0"
    }
    # Pinned to the 2.x series: the nested `kubernetes { ... }` block syntax
    # used in the helm provider below was replaced by an attribute
    # (`kubernetes = { ... }`) in helm provider 3.0, so an unpinned init
    # could pull a major version this config does not parse under.
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.0"
    }
  }
}

provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}

provider "helm" {
  # Same cluster credentials as the kubernetes provider above.
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}

locals {
  # Environment name; folded into the namespace below.
  env = "dev"
}

module "llm" {
  source = "../../modules/llm"

  release_name  = "llm"
  namespace     = "llm-${local.env}"
  chart_path    = var.chart_path
  replicas      = 2
  model_name    = "Qwen/Qwen2.5-0.5B-Instruct"
  model_alias   = "Qwen2.5-0.5B-Instruct"
  max_model_len = 2048
  dtype         = "bfloat16"
  omp_threads   = 4

  resources = {
    requests = { cpu = "1", memory = "2Gi" }
    limits   = { cpu = "4", memory = "6Gi" }
  }

  ingress_host = "llm.dev.localtest.me"

  # NOTE(review): "latest" is a mutable tag — deploys are not reproducible and
  # rollbacks are harder to reason about. Consider a versioned tag or digest.
  image_tag = "latest"
}

output "ingress_host" {
  value = module.llm.ingress_host
}

output "service_dns" {
  value = module.llm.service_dns
}

# Ready-to-paste smoke-test request against the deployed ingress.
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-0.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}