provider "kubernetes" {
  config_path    = pathexpand(var.kubeconfig)
  config_context = var.kube_context
}

provider "helm" {
  kubernetes {
    config_path    = pathexpand(var.kubeconfig)
    config_context = var.kube_context
  }
}

locals {
  env = "prod"
}

module "llm" {
  source = "../../modules/llm"

  release_name = "llm"
  namespace    = "llm-${local.env}"
  chart_path   = var.chart_path

  replicas = 1

  model_name    = "Qwen/Qwen2.5-1.5B-Instruct"
  model_alias   = "Qwen2.5-1.5B-Instruct"
  max_model_len = 4096
  dtype         = "bfloat16"
  omp_threads   = 6

  resources = {
    requests = { cpu = "2", memory = "4Gi" }
    limits   = { cpu = "6", memory = "8Gi" }
  }

  ingress_host = "llm.prod.localtest.me"
  image_tag    = "latest"
  # Content-addressable pin, resolved with scripts/resolve-digests.sh on amd64.
  # Per-arch digest — re-resolve on a different arch or after an upstream tag move.
  # Dev intentionally runs on `:latest` so new fixes flow in without a PR.
  image_digest = "sha256:bb7ed9b6c595334d78179e9d8f6490e06bf9220ed4a10b9b4e15064454ddc69e"

  # Enable OpenAI tool-calling so the agent's function-call path works.
  # Qwen 2.5 uses hermes-style tool parsing in vLLM.
  extra_args = [
    "--enable-auto-tool-choice",
    "--tool-call-parser", "hermes",
  ]

  hpa = {
    enabled      = true
    min_replicas = 1
    max_replicas = 3
    # vLLM exposes `vllm:num_requests_running` as a per-pod gauge of in-flight
    # requests. Scale up when >50% of pods are actively serving.
    metric_name          = "vllm:num_requests_running"
    target_average_value = "500m"
  }
}

output "ingress_host" { value = module.llm.ingress_host }
output "service_dns" { value = module.llm.service_dns }
output "curl_example" {
  value = <<-EOT
    curl -s http://${module.llm.ingress_host}:8080/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model":"Qwen2.5-1.5B-Instruct","messages":[{"role":"user","content":"Say hi."}]}'
  EOT
}