helm/values.yaml

# -- Default values for llmstack helm chart
# -- Declare variables to be passed into your templates.

# -- Serving engine configuratoon
servingEngineSpec:
  # -- Customized labels for the serving engine deployment
  labels:
    environment: "test"
    release: "test"

  # modelSpec - configuring multiple serving engines deployments that runs different models
  # Each entry in the modelSpec array should contain the following fields:
  # - name: (string) The name of the model, e.g., "example-model"
  # - repository: (string) The repository of the model, e.g., "vllm/vllm-openai"
  # - tag: (string) The tag of the model, e.g., "latest"
  # - imagePullSecret: (Optional, string) Name of secret with credentials to private container repository, e.g. "secret"
  # - modelURL: (string) The URL of the model, e.g., "facebook/opt-125m"
  # - chatTemplate: (Optional, string) Chat template (Jinga2) specifying tokenizer configuration, e.g. "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n'  + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}"
  #
  # - replicaCount: (int) The number of replicas for the model, e.g. 1
  # - requestCPU: (int) The number of CPUs requested for the model, e.g. 6
  # - requestMemory: (string) The amount of memory requested for the model, e.g., "16Gi"
  # - requestGPU: (int) The number of GPUs requested for the model, e.g., 1
  #
  # - pvcStorage: (Optional, string) The amount of storage requested for the model, e.g., "50Gi".
  # - pvcAccessMode: (Optional, list) The access mode policy for the mounted volume, e.g., ["ReadWriteOnce"]
  # - storageClass: (Optional, String) The storage class of the PVC e.g., "", default is ""
  # - pvcMatchLabels: (Optional, map) The labels to match the PVC, e.g., {model: "opt125m"}
  # - initContainer: (optional, list of objects) The configuration for the init container to be run before the main container.
  #   - name: (string) The name of the init container, e.g., "init"
  #   - image: (string) The Docker image for the init container, e.g., "busybox:latest"
  #   - command: (optional, list) The command to run in the init container, e.g., ["sh", "-c"]
  #   - args: (optional, list) Additional arguments to pass to the command, e.g., ["ls"]
  #   - env: (optional, list) List of environment variables to set in the container, each being a map with:
  #   - resources: (optional, map) The resource requests and limits for the container:
  #   - mountPvcStorage: (optional, bool) Whether to mount the model's volume.
  #
  # - vllmConfig: (optional, map) The configuration for the VLLM model, supported options are:
  #   - enablePrefixCaching: (optional, bool) Enable prefix caching, e.g., false
  #   - enableChunkedPrefill: (optional, bool) Enable chunked prefill, e.g., false
  #   - maxModelLen: (optional, int) The maximum model length, e.g., 16384
  #   - dtype: (optional, string) The data type, e.g., "bfloat16"
  #   - tensorParallelSize: (optional, int) The degree of tensor parallelism, e.g., 2
  #   - extraArgs: (optional, list) Extra command line arguments to pass to vLLM, e.g., ["--disable-log-requests"]
  #
  # - lmcacheConfig: (optional, map) The configuration of the LMCache for KV offloading, supported options are:
  #   - enabled: (optional, bool) Enable LMCache, e.g., true
  #   - cpuOffloadingBufferSize: (optional, string) The CPU offloading buffer size, e.g., "30"
  #
  # - hf_token: (optional) Hugging Face token configuration. Can be either:
  #   - A string containing the token directly (will be stored in a generated secret)
  #   - An object referencing an existing secret:
  #     secretName: "my-existing-secret"
  #     secretKey: "hf-token-key"
  #
  # - env: (optional, list) The environment variables to set in the container, e.g., your HF_TOKEN
  #
  # - nodeSelectorTerms: (optional, list) The node selector terms to match the nodes
  #
  # - shmSize: (optional, string) The size of the shared memory, e.g., "20Gi"
  #
  # Example:
  # modelSpec:
  # - name: "mistral"
  #   repository: "lmcache/vllm-openai"
  #   tag: "latest"
  #   modelURL: "mistralai/Mistral-7B-Instruct-v0.2"
  #   replicaCount: 1
  #
  #   requestCPU: 10
  #   requestMemory: "64Gi"
  #   requestGPU: 1
  #
  #   pvcStorage: "50Gi"
  #   pvcAccessMode:
  #     - ReadWriteOnce
  #   pvcMatchLabels:
  #     model: "mistral"
  #   initContainer:
  #     name: my-container
  #     image: busybox
  #     command: ["sh"]
  #     env: {}
  #     args: []
  #     resources: {}
  #     mountPvcStorage: true
  #
  #   vllmConfig:
  #     enableChunkedPrefill: false
  #     enablePrefixCaching: false
  #     maxModelLen: 16384
  #     dtype: "bfloat16"
  #     extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
  #
  #   lmcacheConfig:
  #     enabled: true
  #     cpuOffloadingBufferSize: "30"
  #
  #   hf_token: "hf_xxxxxxxxxxxxx"
  #
  #   nodeSelectorTerms:
  #     - matchExpressions:
  #       - key: nvidia.com/gpu.product
  #         operator: "In"
  #         values:
  #         - "NVIDIA-RTX-A6000"
  modelSpec: []

  # -- Container port
  containerPort: 8000
  # -- Service port
  servicePort: 80

  # -- Set other environment variables from config map
  configs: {}

  # -- deployment strategy
  strategy: {}

  # -- Readiness probe configuration
  startupProbe:
    # -- Number of seconds after the container has started before startup probe is initiated
    initialDelaySeconds: 15
    # -- How often (in seconds) to perform the startup probe
    periodSeconds: 10
    # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
    failureThreshold:
      60
      # -- Configuration of the Kubelet http request on the server
    httpGet:
      # -- Path to access on the HTTP server
      path: /health
      # -- Name or number of the port to access on the container, on which the server is listening
      port: 8000

  # -- Liveness probe configuration
  livenessProbe:
    # -- Number of seconds after the container has started before liveness probe is initiated
    initialDelaySeconds: 15
    # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
    failureThreshold: 3
    # -- How often (in seconds) to perform the liveness probe
    periodSeconds: 10
    # -- Configuration of the Kubelet http request on the server
    httpGet:
      # -- Path to access on the HTTP server
      path: /health
      # -- Name or number of the port to access on the container, on which the server is listening
      port: 8000

  # -- Disruption Budget Configuration
  maxUnavailablePodDisruptionBudget: ""

  # -- Tolerations configuration (when there are taints on nodes)
  # Example:
  # tolerations:
  #   - key: "node-role.kubernetes.io/control-plane"
  #     operator: "Exists"
  #     effect: "NoSchedule"
  tolerations: []

  # -- RuntimeClassName configuration, set to "nvidia" if the model requires GPU
  runtimeClassName: "nvidia"

routerSpec:
  # -- The docker image of the router. The following values are defaults:
  repository: "lmcache/lmstack-router"
  tag: "latest"
  imagePullPolicy: "Always"

  # -- Whether to enable the router service
  enableRouter: true

  # -- Number of replicas
  replicaCount: 1

  # -- Container port
  containerPort: 8000

  # -- Service port
  servicePort: 80

  # -- Service discovery mode, supports "k8s" or "static". Defaults to "k8s" if not set.
  serviceDiscovery: "k8s"

  # -- If serviceDiscovery is set to "static", the comma-separated values below are required. There needs to be the same number of backends and models
  staticBackends: ""
  staticModels: ""

  # -- routing logic, could be "roundrobin" or "session"
  routingLogic: "roundrobin"

  # -- session key if using "session" routing logic
  sessionKey: ""

  # -- extra router commandline arguments
  extraArgs: []

  # -- Interval in seconds to scrape the serving engine metrics
  engineScrapeInterval: 15

  # -- Window size in seconds to calculate the request statistics
  requestStatsWindow: 60

  # -- deployment strategy
  strategy: {}

  # -- router resource requests and limits
  resources:
    requests:
      cpu: "4"
      memory: "16G"
    limits:
      cpu: "8"
      memory: "32G"

  # -- Customized labels for the router deployment
  labels:
    environment: "router"
    release: "router"

  ingress:
    # -- Enable ingress controller resource
    enabled: false

    # -- IngressClass that will be used to implement the Ingress
    className: ""

    # -- Additional annotations for the Ingress resource
    annotations:
      {}
      # kubernetes.io/ingress.class: alb
      # kubernetes.io/ingress.class: nginx
      # kubernetes.io/tls-acme: "true"

    # The list of hostnames to be covered with this ingress record.
    hosts:
      - host: vllm-router.local
        paths:
          - path: /
            pathType: Prefix

    # --  The tls configuration for hostnames to be covered with this ingress record.
    tls: []
    #  - secretName: vllm-router-tls
    #    hosts:
    #      - vllm-router.local

  # -- TODO: Readiness probe configuration
  #startupProbe:
  #  # -- Number of seconds after the container has started before startup probe is initiated
  #  initialDelaySeconds: 5
  #  # -- How often (in seconds) to perform the startup probe
  #  periodSeconds: 5
  #  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
  #  failureThreshold: 100
  #   # -- Configuration of the Kubelet http request on the server
  #  httpGet:
  #    # -- Path to access on the HTTP server
  #