From 1c9c6914715ccde80ddccec7b9fe978b3632b49a Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Duy Date: Thu, 27 Feb 2025 15:13:28 +0100 Subject: [PATCH] feat: add vllm-api-key Signed-off-by: kbvd623 --- .github/curl-05-secure-vllm.sh | 17 ++ .github/values-05-secure-vllm.yaml | 19 ++ .../workflows/functionality-helm-chart.yml | 36 ++- helm/templates/deployment-router.yaml | 16 + helm/templates/deployment-vllm-multi.yaml | 15 + helm/templates/secrets.yaml | 5 + helm/values.schema.json | 275 ++++++++++++++---- helm/values.yaml | 9 +- src/vllm_router/service_discovery.py | 7 +- tutorials/06-secure-vllm-serve.md | 179 ++++++++++++ 10 files changed, 515 insertions(+), 63 deletions(-) create mode 100644 .github/curl-05-secure-vllm.sh create mode 100644 .github/values-05-secure-vllm.yaml create mode 100644 tutorials/06-secure-vllm-serve.md diff --git a/.github/curl-05-secure-vllm.sh b/.github/curl-05-secure-vllm.sh new file mode 100644 index 00000000..fd39ebb8 --- /dev/null +++ b/.github/curl-05-secure-vllm.sh @@ -0,0 +1,17 @@ +#!/bin/bash +HOST=$1 +PORT=$2 +VLLM_API_KEY=abc123XYZ987 + +# Curl and save output +OUTPUT_DIR="output-05-secure-vllm" +[ ! -d "$OUTPUT_DIR" ] && mkdir $OUTPUT_DIR +chmod -R 777 $OUTPUT_DIR + +# Fetch model list with authentication +curl -s -H "Authorization: Bearer $VLLM_API_KEY" "http://$HOST:$PORT/v1/models" | tee "$OUTPUT_DIR/models-05-secure-vllm.json" +# Run completion query with authentication +curl -s -X POST -H "Authorization: Bearer $VLLM_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "facebook/opt-125m", "prompt": "Once upon a time,", "max_tokens": 10}' \ + "http://$HOST:$PORT/v1/completions" | tee "$OUTPUT_DIR/query-05-secure-vllm.json" diff --git a/.github/values-05-secure-vllm.yaml b/.github/values-05-secure-vllm.yaml new file mode 100644 index 00000000..ac988730 --- /dev/null +++ b/.github/values-05-secure-vllm.yaml @@ -0,0 +1,19 @@ +servingEngineSpec: + runtimeClassName: "" + vllmApiKey: "abc123XYZ987" + modelSpec: + - name: "opt125m" + repository: "vllm/vllm-openai" + tag: "latest" + modelURL: "facebook/opt-125m" + + replicaCount: 1 + + requestCPU: 6 + requestMemory: "16Gi" + requestGPU: 1 + +routerSpec: + repository: "localhost:5000/git-act-router" + imagePullPolicy: "IfNotPresent" + enableRouter: true diff --git a/.github/workflows/functionality-helm-chart.yml b/.github/workflows/functionality-helm-chart.yml index 74962056..78e086b9 100644 --- a/.github/workflows/functionality-helm-chart.yml +++ b/.github/workflows/functionality-helm-chart.yml @@ -52,7 +52,41 @@ jobs: sudo helm uninstall vllm if: always() - run: echo "🍏 This job's status is ${{ job.status }}." - + Secure-Minimal-Example: + runs-on: self-hosted + steps: + - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." + - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" + - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." + - name: Check out repository code + uses: actions/checkout@v4 + - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." + - run: echo "🖥️ The workflow is now ready to test your code on the runner." + - name: Deploy via helm charts + env: + DOCKER_BUILDKIT: 1 + run: | + cd ${{ github.workspace }} + sudo docker build -t localhost:5000/git-act-router -f docker/Dockerfile . 
+ sudo docker push localhost:5000/git-act-router + sudo sysctl fs.protected_regular=0 + sudo minikube image load localhost:5000/git-act-router + sudo helm install vllm ./helm -f .github/values-05-secure-vllm.yaml + - name: Validate the installation and send query to the stack + run: | + sudo bash .github/port-forward.sh curl-05-secure-vllm + timeout-minutes: 2 + - name: Archive functionality results + uses: actions/upload-artifact@v4 + with: + name: output-05-secure-vllm + path: | + output-05-secure-vllm/ + - name: Helm uninstall + run: | + sudo helm uninstall vllm + if: always() + - run: echo "🍏 This job's status is ${{ job.status }}." Two-Pods-Minimal-Example: runs-on: self-hosted needs: Minimal-Example diff --git a/helm/templates/deployment-router.yaml b/helm/templates/deployment-router.yaml index 5dacd67e..7ed3bee6 100644 --- a/helm/templates/deployment-router.yaml +++ b/helm/templates/deployment-router.yaml @@ -22,6 +22,22 @@ spec: - name: router-container image: "{{ .Values.routerSpec.repository | default "lmcache/lmstack-router" }}:{{ .Values.routerSpec.tag | default "latest" }}" imagePullPolicy: "{{ .Values.routerSpec.imagePullPolicy | default "Always" }}" + env: + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} args: - "--host" - "0.0.0.0" diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 06f6a4fc..14da783c 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -93,6 +93,21 @@ spec: key: {{ $modelSpec.hf_token.secretKey }} {{- end }} {{- end }} + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} {{- with $modelSpec.env }} {{- toYaml . 
| nindent 10 }} {{- end }} diff --git a/helm/templates/secrets.yaml b/helm/templates/secrets.yaml index 21c0121e..073aa3d2 100644 --- a/helm/templates/secrets.yaml +++ b/helm/templates/secrets.yaml @@ -5,6 +5,11 @@ metadata: namespace: {{ .Release.Namespace }} type: Opaque data: + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if and $vllmApiKey (kindIs "string" $vllmApiKey) }} + vllmApiKey: {{ $vllmApiKey | b64enc | quote }} + {{- end }} + {{- range $modelSpec := .Values.servingEngineSpec.modelSpec }} {{- with $ -}} {{- if and $modelSpec.hf_token (kindIs "string" $modelSpec.hf_token) }} diff --git a/helm/values.schema.json b/helm/values.schema.json index 0e00e165..0190ae9b 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -8,24 +8,68 @@ "labels": { "type": "object", "properties": { - "environment": { "type": "string" }, - "release": { "type": "string" } + "environment": { + "type": "string" + }, + "release": { + "type": "string" + } } }, + "vllmApiKey": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "secretName": { + "type": "string" + }, + "secretKey": { + "type": "string" + } + }, + "required": [ + "secretName", + "secretKey" + ] + } + ] + }, "modelSpec": { "type": "array", "items": { "type": "object", "properties": { - "name": { "type": "string" }, - "repository": { "type": "string" }, - "tag": { "type": "string" }, - "modelURL": { "type": "string" }, - "replicaCount": { "type": "integer" }, - "requestCPU": { "type": "integer" }, - "requestMemory": { "type": "string" }, - "requestGPU": { "type": "integer" }, - "pvcStorage": { "type": "string" }, + "name": { + "type": "string" + }, + "repository": { + "type": "string" + }, + "tag": { + "type": "string" + }, + "modelURL": { + "type": "string" + }, + "replicaCount": { + "type": "integer" + }, + "requestCPU": { + "type": "integer" + }, + "requestMemory": { + "type": "string" + }, + "requestGPU": { + "type": "integer" + }, + "pvcStorage": { + "type": "string" + }, "pvcMatchLabels": { "type": "object", "additionalProperties": { @@ -35,34 +79,60 @@ "vllmConfig": { "type": "object", "properties": { - "enablePrefixCaching": { "type": "boolean" }, - "enableChunkedPrefill": { "type": "boolean" }, - "maxModelLen": { "type": "integer" }, - "dtype": { "type": "string" }, + "enablePrefixCaching": { + "type": "boolean" + }, + "enableChunkedPrefill": { + "type": "boolean" + }, + "maxModelLen": { + "type": "integer" + }, + "dtype": { + "type": "string" + }, "extraArgs": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } } } }, "lmcacheConfig": { "type": "object", "properties": { - "enabled": { "type": "boolean" }, - "cpuOffloadingBufferSize": { "type": "string" } + "enabled": { + "type": "boolean" + }, + "cpuOffloadingBufferSize": { + "type": "string" + } }, - "required": ["enabled", "cpuOffloadingBufferSize"] + "required": [ + "enabled", + "cpuOffloadingBufferSize" + ] }, "hf_token": { "oneOf": [ - { "type": "string" }, + { + "type": "string" + }, { "type": "object", "properties": { - "secretName": { "type": "string" }, - "secretKey": { "type": "string" } + "secretName": { + "type": "string" + }, + "secretKey": { + "type": "string" + } }, - "required": ["secretName", "secretKey"] + "required": [ + "secretName", + "secretKey" + ] } ] }, @@ -71,10 +141,17 @@ "items": { "type": "object", "properties": { - "name": { "type": "string" }, - "value": { "type": "string" } + "name": { + "type": "string" + }, + "value": { + "type": "string" + } }, - 
"required": ["name", "value"] + "required": [ + "name", + "value" + ] } }, "nodeSelectorTerms": { @@ -87,93 +164,171 @@ "items": { "type": "object", "properties": { - "key": { "type": "string" }, - "operator": { "type": "string" }, + "key": { + "type": "string" + }, + "operator": { + "type": "string" + }, "values": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } } }, - "required": ["key", "operator", "values"] + "required": [ + "key", + "operator", + "values" + ] } } } } } }, - "required": ["name", "repository", "tag", "modelURL", "replicaCount", "requestCPU", "requestMemory", "requestGPU", "pvcStorage"] + "required": [ + "name", + "repository", + "tag", + "modelURL", + "replicaCount", + "requestCPU", + "requestMemory", + "requestGPU", + "pvcStorage" + ] } }, - "containerPort": { "type": "integer" }, - "servicePort": { "type": "integer" }, + "containerPort": { + "type": "integer" + }, + "servicePort": { + "type": "integer" + }, "startupProbe": { "type": "object", "properties": { - "initialDelaySeconds": { "type": "integer" }, - "periodSeconds": { "type": "integer" }, - "failureThreshold": { "type": "integer" }, + "initialDelaySeconds": { + "type": "integer" + }, + "periodSeconds": { + "type": "integer" + }, + "failureThreshold": { + "type": "integer" + }, "httpGet": { "type": "object", "properties": { - "path": { "type": "string" }, - "port": { "type": "integer" } + "path": { + "type": "string" + }, + "port": { + "type": "integer" + } }, - "required": ["path", "port"] + "required": [ + "path", + "port" + ] } } }, "livenessProbe": { "type": "object", "properties": { - "initialDelaySeconds": { "type": "integer" }, - "periodSeconds": { "type": "integer" }, - "failureThreshold": { "type": "integer" }, + "initialDelaySeconds": { + "type": "integer" + }, + "periodSeconds": { + "type": "integer" + }, + "failureThreshold": { + "type": "integer" + }, "httpGet": { "type": "object", "properties": { - "path": { "type": "string" }, - "port": { "type": "integer" } + "path": { + "type": "string" + }, + "port": { + "type": "integer" + } }, - "required": ["path", "port"] + "required": [ + "path", + "port" + ] } } }, - "maxUnavailablePodDisruptionBudget": { "type": "string" }, + "maxUnavailablePodDisruptionBudget": { + "type": "string" + }, "tolerations": { "type": "array", "items": { "type": "object", "properties": { - "key": { "type": "string" }, - "operator": { "type": "string" }, - "effect": { "type": "string" } + "key": { + "type": "string" + }, + "operator": { + "type": "string" + }, + "effect": { + "type": "string" + } } } }, "runtimeClassName": { - "type": "string" + "type": "string" } } }, "routerSpec": { "type": "object", "properties": { - "replicaCount": { "type": "integer" }, - "containerPort": { "type": "integer" }, - "servicePort": { "type": "integer" }, - "routingLogic": { "type": "string" }, - "sessionKey": { "type": "string" }, + "replicaCount": { + "type": "integer" + }, + "containerPort": { + "type": "integer" + }, + "servicePort": { + "type": "integer" + }, + "routingLogic": { + "type": "string" + }, + "sessionKey": { + "type": "string" + }, "extraArgs": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } + }, + "engineScrapeInterval": { + "type": "integer" + }, + "requestStatsWindow": { + "type": "integer" }, - "engineScrapeInterval": { "type": "integer" }, - "requestStatsWindow": { "type": "integer" }, "labels": { "type": "object", "properties": { - "environment": { "type": "string" }, - "release": { "type": 
"string" } + "environment": { + "type": "string" + }, + "release": { + "type": "string" + } } } } diff --git a/helm/values.yaml b/helm/values.yaml index eab4be7e..c299c5db 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -8,7 +8,12 @@ servingEngineSpec: labels: environment: "test" release: "test" - + # vllmApiKey: (optional) api key for securing the VLMM models. Can be either: + # - A string containing the token directly (will be stored in a generated secret) + # - An object referencing an existing secret: + # secretName: "my-existing-secret" + # secretKey: "vllm-api-key" + # # modelSpec - configuring multiple serving engines deployments that runs different models # Each entry in the modelSpec array should contain the following fields: # - name: (string) The name of the model, e.g., "example-model" @@ -51,6 +56,7 @@ servingEngineSpec: # - shmSize: (optional, string) The size of the shared memory, e.g., "20Gi" # # Example: + # vllmApiKey: "vllm_xxxxxxxxxxxxx" # modelSpec: # - name: "mistral" # repository: "lmcache/vllm-openai" @@ -81,6 +87,7 @@ servingEngineSpec: # # hf_token: "hf_xxxxxxxxxxxxx" # + # # nodeSelectorTerms: # - matchExpressions: # - key: nvidia.com/gpu.product diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index 77ad2120..09c9a20f 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -1,5 +1,6 @@ import abc import enum +import os import threading import time from dataclasses import dataclass @@ -133,7 +134,11 @@ def _get_model_name(self, pod_ip) -> Optional[str]: """ url = f"http://{pod_ip}:{self.port}/v1/models" try: - response = requests.get(url) + headers = None + if VLLM_API_KEY := os.getenv("VLLM_API_KEY"): + logger.info(f"Using vllm server authentication") + headers = {"Authorization": f"Bearer {VLLM_API_KEY}"} + response = requests.get(url, headers=headers) response.raise_for_status() model_name = response.json()["data"][0]["id"] except Exception as e: diff --git a/tutorials/06-secure-vllm-serve.md b/tutorials/06-secure-vllm-serve.md new file mode 100644 index 00000000..9cdc24f8 --- /dev/null +++ b/tutorials/06-secure-vllm-serve.md @@ -0,0 +1,179 @@ +# Tutorial: Basic secure vLLM Configurations + +## Introduction + +This tutorial guides you through the basic configurations required to deploy a +vLLM serving engine in a Kubernetes environment with GPU support. You will learn +how to specify the model details, set up necessary environment variables (like +`HF_TOKEN`, `VLLM_API_KEY`), and launch the vLLM serving engine. + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Step 1: Preparing the Configuration File](#step-1-preparing-the-configuration-file) +3. [Step 2: Applying the Configuration](#step-2-applying-the-configuration) +4. [Step 3: Verifying the Deployment](#step-3-verifying-the-deployment) + +## Prerequisites + +- A Kubernetes environment with GPU support, as set up in the + [00-install-kubernetes-env tutorial](00-install-kubernetes-env.md). +- Helm installed on your system. +- Access to a HuggingFace token (`HF_TOKEN`). +- A self-defined api key or an existing secret (`VLLM_API_KEY`). + +## Step 1: Preparing the Configuration File + +1. Locate the example configuration file + `tutorials/assets/values-06-secure-vllm.yaml`. +2. Open the file and update the following fields: + - Write your actual huggingface token in `hf_token: ` in the + yaml file. + - Write your actual vllmApiKey in `vllmApiKey: ` in the + yaml file. 
+
+### Explanation of Key Items in `values-06-secure-vllm-serve.yaml`
+
+- **`vllmApiKey`**: The API key used to secure model serving with vLLM.
+- **`name`**: The unique identifier for your model deployment.
+- **`repository`**: The Docker repository containing the model's serving engine
+  image.
+- **`tag`**: Specifies the version of the model image to use.
+- **`modelURL`**: The URL pointing to the model on Hugging Face or another
+  hosting service.
+- **`replicaCount`**: The number of replicas for the deployment, allowing
+  scaling for load.
+- **`requestCPU`**: The amount of CPU resources requested per replica.
+- **`requestMemory`**: Memory allocation for the deployment; sufficient memory
+  is required to load the model.
+- **`requestGPU`**: Specifies the number of GPUs to allocate for the deployment.
+- **`pvcStorage`**: Defines the Persistent Volume Claim size for model storage.
+- **`vllmConfig`**: Contains model-specific configurations:
+  - `enableChunkedPrefill`: Splits long prompt prefills into smaller chunks so
+    they can be batched together with decode requests, improving throughput.
+  - `enablePrefixCaching`: Speeds up response times for common prefixes in
+    queries.
+  - `maxModelLen`: The maximum sequence length the model can handle.
+  - `dtype`: Data type for computations, e.g., `bfloat16` for faster performance
+    on modern GPUs.
+  - `extraArgs`: Additional arguments passed to the vLLM engine for fine-tuning
+    behavior.
+- **`hf_token`**: The Hugging Face token for authenticating with the Hugging
+  Face model hub.
+- **`env`**: Extra environment variables to pass to the model-serving engine.
+
+### Example Snippet
+
+```yaml
+servingEngineSpec:
+  vllmApiKey: <YOUR_VLLM_API_KEY>
+  modelSpec:
+  - name: "llama3"
+    repository: "vllm/vllm-openai"
+    tag: "latest"
+    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
+    replicaCount: 1
+
+    requestCPU: 10
+    requestMemory: "16Gi"
+    requestGPU: 1
+
+    pvcStorage: "50Gi"
+
+    vllmConfig:
+      enableChunkedPrefill: false
+      enablePrefixCaching: false
+      maxModelLen: 16384
+      dtype: "bfloat16"
+      extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
+
+    hf_token: <YOUR_HF_TOKEN>
+```
+
+## Step 2: Applying the Configuration
+
+Deploy the configuration using Helm:
+
+```bash
+helm repo add vllm https://vllm-project.github.io/production-stack
+helm install vllm vllm/vllm-stack -f tutorials/assets/values-06-secure-vllm-serve.yaml
+```
+
+Expected output:
+
+You should see output indicating the successful deployment of the Helm chart:
+
+```plaintext
+Release "vllm" has been deployed. Happy Helming!
+NAME: vllm
+LAST DEPLOYED: <timestamp>
+NAMESPACE: default
+STATUS: deployed
+REVISION: 1
+```
+
+## Step 3: Verifying the Deployment
+
+1. Check the status of the pods:
+
+   ```bash
+   sudo kubectl get pods
+   ```
+
+   Expected output:
+
+   You should see the following pods:
+
+   ```plaintext
+   NAME                                    READY   STATUS    RESTARTS   AGE
+   vllm-deployment-router-xxxx-xxxx        1/1     Running   0          3m23s
+   vllm-llama3-deployment-vllm-xxxx-xxxx   1/1     Running   0          3m23s
+   ```
+
+   - The `vllm-deployment-router` pod acts as the router, managing requests and
+     routing them to the appropriate model-serving pod.
+   - The `vllm-llama3-deployment-vllm` pod serves the actual model for
+     inference.
+
+2. Verify the service is exposed correctly:
+
+   ```bash
+   sudo kubectl get services
+   ```
+
+   Expected output:
+
+   Ensure there are services for both the serving engine and the router:
+
+   ```plaintext
+   NAME                  TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)   AGE
+   vllm-engine-service   ClusterIP   10.103.98.170    <none>        80/TCP    4m
+   vllm-router-service   ClusterIP   10.103.110.107   <none>        80/TCP    4m
+   ```
+
+   - The `vllm-engine-service` exposes the serving engine.
+   - The `vllm-router-service` handles routing and load balancing across
+     model-serving pods.
+
+3. Test the health endpoint:
+
+   ```bash
+   curl http://<EXTERNAL_IP>/health
+   ```
+
+   Replace `<EXTERNAL_IP>` with the external IP of the service. If everything
+   is configured correctly, you will get:
+
+   ```plaintext
+   {"status":"healthy"}
+   ```
+
+Please refer to Step 3 in the
+[01-minimal-helm-installation](01-minimal-helm-installation.md) tutorial for
+querying the deployed vLLM service; with this configuration, every query must
+also include your API key, as shown in the example below.
+
+## Conclusion
+
+In this tutorial, you configured and deployed a secure vLLM serving engine in a
+Kubernetes environment. You also learned how to verify its deployment and ensure
+it is running as expected. For further customization, refer to the `values.yaml`
+file and the Helm chart documentation.
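+
+### Example: Sending an Authenticated Query
+
+Because the stack now enforces authentication, every request must carry the API
+key in an `Authorization: Bearer` header, just as the CI script
+`.github/curl-05-secure-vllm.sh` in this change does. A minimal sketch (the
+service name and local port follow the outputs above and the port-forward
+pattern of the earlier tutorials; adjust them to your environment):
+
+```bash
+# Forward the router service to a local port (runs in the background)
+sudo kubectl port-forward svc/vllm-router-service 30080:80 &
+
+# Without the key, the request should be rejected by the serving engine
+# (typically 401 Unauthorized)
+curl -s "http://localhost:30080/v1/models"
+
+# With the key, the same request returns the list of served models
+curl -s -H "Authorization: Bearer <YOUR_VLLM_API_KEY>" \
+  "http://localhost:30080/v1/models"
+```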