From 1c9c6914715ccde80ddccec7b9fe978b3632b49a Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Duy Date: Thu, 27 Feb 2025 15:13:28 +0100 Subject: [PATCH] feat: add vllm-api-key Signed-off-by: kbvd623 --- .github/curl-05-secure-vllm.sh | 17 ++ .github/values-05-secure-vllm.yaml | 19 ++ .../workflows/functionality-helm-chart.yml | 36 ++- helm/templates/deployment-router.yaml | 16 + helm/templates/deployment-vllm-multi.yaml | 15 + helm/templates/secrets.yaml | 5 + helm/values.schema.json | 275 ++++++++++++++---- helm/values.yaml | 9 +- src/vllm_router/service_discovery.py | 7 +- tutorials/06-secure-vllm-serve.md | 179 ++++++++++++ 10 files changed, 515 insertions(+), 63 deletions(-) create mode 100644 .github/curl-05-secure-vllm.sh create mode 100644 .github/values-05-secure-vllm.yaml create mode 100644 tutorials/06-secure-vllm-serve.md diff --git a/.github/curl-05-secure-vllm.sh b/.github/curl-05-secure-vllm.sh new file mode 100644 index 00000000..fd39ebb8 --- /dev/null +++ b/.github/curl-05-secure-vllm.sh @@ -0,0 +1,17 @@ +#!/bin/bash +HOST=$1 +PORT=$2 +VLLM_API_KEY=abc123XYZ987 + +# Curl and save output +OUTPUT_DIR="output-05-secure-vllm" +[ ! -d "$OUTPUT_DIR" ] && mkdir $OUTPUT_DIR +chmod -R 777 $OUTPUT_DIR + +# Fetch model list with authentication +curl -s -H "Authorization: Bearer $VLLM_API_KEY" "http://$HOST:$PORT/v1/models" | tee "$OUTPUT_DIR/models-05-secure-vllm.json" +# Run completion query with authentication +curl -s -X POST -H "Authorization: Bearer $VLLM_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model": "facebook/opt-125m", "prompt": "Once upon a time,", "max_tokens": 10}' \ + "http://$HOST:$PORT/v1/completions" | tee "$OUTPUT_DIR/query-05-secure-vllm.json" diff --git a/.github/values-05-secure-vllm.yaml b/.github/values-05-secure-vllm.yaml new file mode 100644 index 00000000..ac988730 --- /dev/null +++ b/.github/values-05-secure-vllm.yaml @@ -0,0 +1,19 @@ +servingEngineSpec: + runtimeClassName: "" + vllmApiKey: "abc123XYZ987" + modelSpec: + - name: "opt125m" + repository: "vllm/vllm-openai" + tag: "latest" + modelURL: "facebook/opt-125m" + + replicaCount: 1 + + requestCPU: 6 + requestMemory: "16Gi" + requestGPU: 1 + +routerSpec: + repository: "localhost:5000/git-act-router" + imagePullPolicy: "IfNotPresent" + enableRouter: true diff --git a/.github/workflows/functionality-helm-chart.yml b/.github/workflows/functionality-helm-chart.yml index 74962056..78e086b9 100644 --- a/.github/workflows/functionality-helm-chart.yml +++ b/.github/workflows/functionality-helm-chart.yml @@ -52,7 +52,41 @@ jobs: sudo helm uninstall vllm if: always() - run: echo "🍏 This job's status is ${{ job.status }}." - + Secure-Minimal-Example: + runs-on: self-hosted + steps: + - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." + - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" + - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." + - name: Check out repository code + uses: actions/checkout@v4 + - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." + - run: echo "🖥️ The workflow is now ready to test your code on the runner." + - name: Deploy via helm charts + env: + DOCKER_BUILDKIT: 1 + run: | + cd ${{ github.workspace }} + sudo docker build -t localhost:5000/git-act-router -f docker/Dockerfile . 
+ sudo docker push localhost:5000/git-act-router + sudo sysctl fs.protected_regular=0 + sudo minikube image load localhost:5000/git-act-router + sudo helm install vllm ./helm -f .github/values-05-secure-vllm.yaml + - name: Validate the installation and send query to the stack + run: | + sudo bash .github/port-forward.sh curl-05-secure-vllm + timeout-minutes: 2 + - name: Archive functionality results + uses: actions/upload-artifact@v4 + with: + name: output-05-secure-vllm + path: | + output-05-secure-vllm/ + - name: Helm uninstall + run: | + sudo helm uninstall vllm + if: always() + - run: echo "🍏 This job's status is ${{ job.status }}." Two-Pods-Minimal-Example: runs-on: self-hosted needs: Minimal-Example diff --git a/helm/templates/deployment-router.yaml b/helm/templates/deployment-router.yaml index 5dacd67e..7ed3bee6 100644 --- a/helm/templates/deployment-router.yaml +++ b/helm/templates/deployment-router.yaml @@ -22,6 +22,22 @@ spec: - name: router-container image: "{{ .Values.routerSpec.repository | default "lmcache/lmstack-router" }}:{{ .Values.routerSpec.tag | default "latest" }}" imagePullPolicy: "{{ .Values.routerSpec.imagePullPolicy | default "Always" }}" + env: + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} args: - "--host" - "0.0.0.0" diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml index 06f6a4fc..14da783c 100644 --- a/helm/templates/deployment-vllm-multi.yaml +++ b/helm/templates/deployment-vllm-multi.yaml @@ -93,6 +93,21 @@ spec: key: {{ $modelSpec.hf_token.secretKey }} {{- end }} {{- end }} + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if $vllmApiKey }} + - name: VLLM_API_KEY + {{- if kindIs "string" $vllmApiKey }} + valueFrom: + secretKeyRef: + name: {{ .Release.Name }}-secrets + key: vllmApiKey + {{- else }} + valueFrom: + secretKeyRef: + name: {{ $vllmApiKey.secretName }} + key: {{ $vllmApiKey.secretKey }} + {{- end }} + {{- end }} {{- with $modelSpec.env }} {{- toYaml . 
| nindent 10 }} {{- end }} diff --git a/helm/templates/secrets.yaml b/helm/templates/secrets.yaml index 21c0121e..073aa3d2 100644 --- a/helm/templates/secrets.yaml +++ b/helm/templates/secrets.yaml @@ -5,6 +5,11 @@ metadata: namespace: {{ .Release.Namespace }} type: Opaque data: + {{- $vllmApiKey := $.Values.servingEngineSpec.vllmApiKey }} + {{- if and $vllmApiKey (kindIs "string" $vllmApiKey) }} + vllmApiKey: {{ $vllmApiKey | b64enc | quote }} + {{- end }} + {{- range $modelSpec := .Values.servingEngineSpec.modelSpec }} {{- with $ -}} {{- if and $modelSpec.hf_token (kindIs "string" $modelSpec.hf_token) }} diff --git a/helm/values.schema.json b/helm/values.schema.json index 0e00e165..0190ae9b 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -8,24 +8,68 @@ "labels": { "type": "object", "properties": { - "environment": { "type": "string" }, - "release": { "type": "string" } + "environment": { + "type": "string" + }, + "release": { + "type": "string" + } } }, + "vllmApiKey": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "secretName": { + "type": "string" + }, + "secretKey": { + "type": "string" + } + }, + "required": [ + "secretName", + "secretKey" + ] + } + ] + }, "modelSpec": { "type": "array", "items": { "type": "object", "properties": { - "name": { "type": "string" }, - "repository": { "type": "string" }, - "tag": { "type": "string" }, - "modelURL": { "type": "string" }, - "replicaCount": { "type": "integer" }, - "requestCPU": { "type": "integer" }, - "requestMemory": { "type": "string" }, - "requestGPU": { "type": "integer" }, - "pvcStorage": { "type": "string" }, + "name": { + "type": "string" + }, + "repository": { + "type": "string" + }, + "tag": { + "type": "string" + }, + "modelURL": { + "type": "string" + }, + "replicaCount": { + "type": "integer" + }, + "requestCPU": { + "type": "integer" + }, + "requestMemory": { + "type": "string" + }, + "requestGPU": { + "type": "integer" + }, + "pvcStorage": { + "type": "string" + }, "pvcMatchLabels": { "type": "object", "additionalProperties": { @@ -35,34 +79,60 @@ "vllmConfig": { "type": "object", "properties": { - "enablePrefixCaching": { "type": "boolean" }, - "enableChunkedPrefill": { "type": "boolean" }, - "maxModelLen": { "type": "integer" }, - "dtype": { "type": "string" }, + "enablePrefixCaching": { + "type": "boolean" + }, + "enableChunkedPrefill": { + "type": "boolean" + }, + "maxModelLen": { + "type": "integer" + }, + "dtype": { + "type": "string" + }, "extraArgs": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } } } }, "lmcacheConfig": { "type": "object", "properties": { - "enabled": { "type": "boolean" }, - "cpuOffloadingBufferSize": { "type": "string" } + "enabled": { + "type": "boolean" + }, + "cpuOffloadingBufferSize": { + "type": "string" + } }, - "required": ["enabled", "cpuOffloadingBufferSize"] + "required": [ + "enabled", + "cpuOffloadingBufferSize" + ] }, "hf_token": { "oneOf": [ - { "type": "string" }, + { + "type": "string" + }, { "type": "object", "properties": { - "secretName": { "type": "string" }, - "secretKey": { "type": "string" } + "secretName": { + "type": "string" + }, + "secretKey": { + "type": "string" + } }, - "required": ["secretName", "secretKey"] + "required": [ + "secretName", + "secretKey" + ] } ] }, @@ -71,10 +141,17 @@ "items": { "type": "object", "properties": { - "name": { "type": "string" }, - "value": { "type": "string" } + "name": { + "type": "string" + }, + "value": { + "type": "string" + } }, - 
"required": ["name", "value"] + "required": [ + "name", + "value" + ] } }, "nodeSelectorTerms": { @@ -87,93 +164,171 @@ "items": { "type": "object", "properties": { - "key": { "type": "string" }, - "operator": { "type": "string" }, + "key": { + "type": "string" + }, + "operator": { + "type": "string" + }, "values": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } } }, - "required": ["key", "operator", "values"] + "required": [ + "key", + "operator", + "values" + ] } } } } } }, - "required": ["name", "repository", "tag", "modelURL", "replicaCount", "requestCPU", "requestMemory", "requestGPU", "pvcStorage"] + "required": [ + "name", + "repository", + "tag", + "modelURL", + "replicaCount", + "requestCPU", + "requestMemory", + "requestGPU", + "pvcStorage" + ] } }, - "containerPort": { "type": "integer" }, - "servicePort": { "type": "integer" }, + "containerPort": { + "type": "integer" + }, + "servicePort": { + "type": "integer" + }, "startupProbe": { "type": "object", "properties": { - "initialDelaySeconds": { "type": "integer" }, - "periodSeconds": { "type": "integer" }, - "failureThreshold": { "type": "integer" }, + "initialDelaySeconds": { + "type": "integer" + }, + "periodSeconds": { + "type": "integer" + }, + "failureThreshold": { + "type": "integer" + }, "httpGet": { "type": "object", "properties": { - "path": { "type": "string" }, - "port": { "type": "integer" } + "path": { + "type": "string" + }, + "port": { + "type": "integer" + } }, - "required": ["path", "port"] + "required": [ + "path", + "port" + ] } } }, "livenessProbe": { "type": "object", "properties": { - "initialDelaySeconds": { "type": "integer" }, - "periodSeconds": { "type": "integer" }, - "failureThreshold": { "type": "integer" }, + "initialDelaySeconds": { + "type": "integer" + }, + "periodSeconds": { + "type": "integer" + }, + "failureThreshold": { + "type": "integer" + }, "httpGet": { "type": "object", "properties": { - "path": { "type": "string" }, - "port": { "type": "integer" } + "path": { + "type": "string" + }, + "port": { + "type": "integer" + } }, - "required": ["path", "port"] + "required": [ + "path", + "port" + ] } } }, - "maxUnavailablePodDisruptionBudget": { "type": "string" }, + "maxUnavailablePodDisruptionBudget": { + "type": "string" + }, "tolerations": { "type": "array", "items": { "type": "object", "properties": { - "key": { "type": "string" }, - "operator": { "type": "string" }, - "effect": { "type": "string" } + "key": { + "type": "string" + }, + "operator": { + "type": "string" + }, + "effect": { + "type": "string" + } } } }, "runtimeClassName": { - "type": "string" + "type": "string" } } }, "routerSpec": { "type": "object", "properties": { - "replicaCount": { "type": "integer" }, - "containerPort": { "type": "integer" }, - "servicePort": { "type": "integer" }, - "routingLogic": { "type": "string" }, - "sessionKey": { "type": "string" }, + "replicaCount": { + "type": "integer" + }, + "containerPort": { + "type": "integer" + }, + "servicePort": { + "type": "integer" + }, + "routingLogic": { + "type": "string" + }, + "sessionKey": { + "type": "string" + }, "extraArgs": { "type": "array", - "items": { "type": "string" } + "items": { + "type": "string" + } + }, + "engineScrapeInterval": { + "type": "integer" + }, + "requestStatsWindow": { + "type": "integer" }, - "engineScrapeInterval": { "type": "integer" }, - "requestStatsWindow": { "type": "integer" }, "labels": { "type": "object", "properties": { - "environment": { "type": "string" }, - "release": { "type": 
"string" } + "environment": { + "type": "string" + }, + "release": { + "type": "string" + } } } } diff --git a/helm/values.yaml b/helm/values.yaml index eab4be7e..c299c5db 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -8,7 +8,12 @@ servingEngineSpec: labels: environment: "test" release: "test" - + # vllmApiKey: (optional) api key for securing the VLMM models. Can be either: + # - A string containing the token directly (will be stored in a generated secret) + # - An object referencing an existing secret: + # secretName: "my-existing-secret" + # secretKey: "vllm-api-key" + # # modelSpec - configuring multiple serving engines deployments that runs different models # Each entry in the modelSpec array should contain the following fields: # - name: (string) The name of the model, e.g., "example-model" @@ -51,6 +56,7 @@ servingEngineSpec: # - shmSize: (optional, string) The size of the shared memory, e.g., "20Gi" # # Example: + # vllmApiKey: "vllm_xxxxxxxxxxxxx" # modelSpec: # - name: "mistral" # repository: "lmcache/vllm-openai" @@ -81,6 +87,7 @@ servingEngineSpec: # # hf_token: "hf_xxxxxxxxxxxxx" # + # # nodeSelectorTerms: # - matchExpressions: # - key: nvidia.com/gpu.product diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py index 77ad2120..09c9a20f 100644 --- a/src/vllm_router/service_discovery.py +++ b/src/vllm_router/service_discovery.py @@ -1,5 +1,6 @@ import abc import enum +import os import threading import time from dataclasses import dataclass @@ -133,7 +134,11 @@ def _get_model_name(self, pod_ip) -> Optional[str]: """ url = f"http://{pod_ip}:{self.port}/v1/models" try: - response = requests.get(url) + headers = None + if VLLM_API_KEY := os.getenv("VLLM_API_KEY"): + logger.info(f"Using vllm server authentication") + headers = {"Authorization": f"Bearer {VLLM_API_KEY}"} + response = requests.get(url, headers=headers) response.raise_for_status() model_name = response.json()["data"][0]["id"] except Exception as e: diff --git a/tutorials/06-secure-vllm-serve.md b/tutorials/06-secure-vllm-serve.md new file mode 100644 index 00000000..9cdc24f8 --- /dev/null +++ b/tutorials/06-secure-vllm-serve.md @@ -0,0 +1,179 @@ +# Tutorial: Basic secure vLLM Configurations + +## Introduction + +This tutorial guides you through the basic configurations required to deploy a +vLLM serving engine in a Kubernetes environment with GPU support. You will learn +how to specify the model details, set up necessary environment variables (like +`HF_TOKEN`, `VLLM_API_KEY`), and launch the vLLM serving engine. + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Step 1: Preparing the Configuration File](#step-1-preparing-the-configuration-file) +3. [Step 2: Applying the Configuration](#step-2-applying-the-configuration) +4. [Step 3: Verifying the Deployment](#step-3-verifying-the-deployment) + +## Prerequisites + +- A Kubernetes environment with GPU support, as set up in the + [00-install-kubernetes-env tutorial](00-install-kubernetes-env.md). +- Helm installed on your system. +- Access to a HuggingFace token (`HF_TOKEN`). +- A self-defined api key or an existing secret (`VLLM_API_KEY`). + +## Step 1: Preparing the Configuration File + +1. Locate the example configuration file + `tutorials/assets/values-06-secure-vllm.yaml`. +2. Open the file and update the following fields: + - Write your actual huggingface token in `hf_token: ` in the + yaml file. + - Write your actual vllmApiKey in `vllmApiKey: ` in the + yaml file. 
+
+### Explanation of Key Items in `values-06-secure-vllm-serve.yaml`
+
+- **`vllmApiKey`**: The API key used to secure model serving with vLLM.
+- **`name`**: The unique identifier for your model deployment.
+- **`repository`**: The Docker repository containing the model's serving engine
+  image.
+- **`tag`**: Specifies the version of the model image to use.
+- **`modelURL`**: The URL pointing to the model on Hugging Face or another
+  hosting service.
+- **`replicaCount`**: The number of replicas for the deployment, allowing
+  scaling for load.
+- **`requestCPU`**: The amount of CPU resources requested per replica.
+- **`requestMemory`**: Memory allocation for the deployment; sufficient memory
+  is required to load the model.
+- **`requestGPU`**: Specifies the number of GPUs to allocate for the deployment.
+- **`pvcStorage`**: Defines the Persistent Volume Claim size for model storage.
+- **`vllmConfig`**: Contains model-specific configurations:
+  - `enableChunkedPrefill`: Splits long prompt prefills into smaller chunks so
+    they can be batched together with decode requests, improving throughput.
+  - `enablePrefixCaching`: Speeds up response times for common prefixes in
+    queries.
+  - `maxModelLen`: The maximum sequence length the model can handle.
+  - `dtype`: Data type for computations, e.g., `bfloat16` for faster performance
+    on modern GPUs.
+  - `extraArgs`: Additional arguments passed to the vLLM engine for fine-tuning
+    behavior.
+- **`hf_token`**: The Hugging Face token for authenticating with the Hugging
+  Face model hub.
+- **`env`**: Extra environment variables to pass to the model-serving engine.
+
+### Example Snippet
+
+```yaml
+servingEngineSpec:
+  vllmApiKey: <YOUR_VLLM_API_KEY>
+  modelSpec:
+  - name: "llama3"
+    repository: "vllm/vllm-openai"
+    tag: "latest"
+    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
+    replicaCount: 1
+
+    requestCPU: 10
+    requestMemory: "16Gi"
+    requestGPU: 1
+
+    pvcStorage: "50Gi"
+
+    vllmConfig:
+      enableChunkedPrefill: false
+      enablePrefixCaching: false
+      maxModelLen: 16384
+      dtype: "bfloat16"
+      extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
+
+    hf_token: <YOUR_HF_TOKEN>
+```
+
+## Step 2: Applying the Configuration
+
+Deploy the configuration using Helm:
+
+```bash
+helm repo add vllm https://vllm-project.github.io/production-stack
+helm install vllm vllm/vllm-stack -f tutorials/assets/values-06-secure-vllm-serve.yaml
+```
+
+Expected output:
+
+You should see output indicating the successful deployment of the Helm chart:
+
+```plaintext
+Release "vllm" has been deployed. Happy Helming!
+NAME: vllm
+LAST DEPLOYED: <timestamp>
+NAMESPACE: default
+STATUS: deployed
+REVISION: 1
+```
+
+## Step 3: Verifying the Deployment
+
+1. Check the status of the pods:
+
+   ```bash
+   sudo kubectl get pods
+   ```
+
+   Expected output:
+
+   You should see the following pods:
+
+   ```plaintext
+   NAME                                    READY   STATUS    RESTARTS   AGE
+   vllm-deployment-router-xxxx-xxxx        1/1     Running   0          3m23s
+   vllm-llama3-deployment-vllm-xxxx-xxxx   1/1     Running   0          3m23s
+   ```
+
+   - The `vllm-deployment-router` pod acts as the router, managing requests and
+     routing them to the appropriate model-serving pod.
+   - The `vllm-llama3-deployment-vllm` pod serves the actual model for
+     inference.
+
+2. Verify the service is exposed correctly:
+
+   ```bash
+   sudo kubectl get services
+   ```
+
+   Expected output:
+
+   Ensure there are services for both the serving engine and the router:
+
+   ```plaintext
+   NAME                  TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)   AGE
+   vllm-engine-service   ClusterIP   10.103.98.170    <none>        80/TCP    4m
+   vllm-router-service   ClusterIP   10.103.110.107   <none>        80/TCP    4m
+   ```
+
+   - The `vllm-engine-service` exposes the serving engine.
+   - The `vllm-router-service` handles routing and load balancing across
+     model-serving pods.
+
+3. Test the health endpoint:
+
+   ```bash
+   curl http://<EXTERNAL_IP>/health
+   ```
+
+   Replace `<EXTERNAL_IP>` with the external IP of the service. If everything
+   is configured correctly, you will get:
+
+   ```plaintext
+   {"status":"healthy"}
+   ```
+
+Please refer to Step 3 in the
+[01-minimal-helm-installation](01-minimal-helm-installation.md) tutorial for
+querying the deployed vLLM service; with this configuration, every query must
+also include your API key, as shown in the example below.
+
+## Conclusion
+
+In this tutorial, you configured and deployed a secure vLLM serving engine in a
+Kubernetes environment. You also learned how to verify its deployment and ensure
+it is running as expected. For further customization, refer to the `values.yaml`
+file and the Helm chart documentation.
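+
+### Example: Sending an Authenticated Query
+
+Because the stack now enforces authentication, every request must carry the API
+key in an `Authorization: Bearer` header, just as the CI script
+`.github/curl-05-secure-vllm.sh` in this change does. A minimal sketch (the
+service name and local port follow the outputs above and the port-forward
+pattern of the earlier tutorials; adjust them to your environment):
+
+```bash
+# Forward the router service to a local port (runs in the background)
+sudo kubectl port-forward svc/vllm-router-service 30080:80 &
+
+# Without the key, the request should be rejected by the serving engine
+# (typically 401 Unauthorized)
+curl -s "http://localhost:30080/v1/models"
+
+# With the key, the same request returns the list of served models
+curl -s -H "Authorization: Bearer <YOUR_VLLM_API_KEY>" \
+  "http://localhost:30080/v1/models"
+```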