From 09d5c10943aa369ee3f0f36b7e3761fc7fc2b7f7 Mon Sep 17 00:00:00 2001
From: Jiayi Yao <82156730+YaoJiayi@users.noreply.github.com>
Date: Sat, 1 Mar 2025 01:15:01 -0600
Subject: [PATCH] [Feat] Add remote shared storage with LMCache (#188)

* add remote server

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* add yaml

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* add tutorial md

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* fix replica count

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* Update 06-remote-shared-kv-cache.md

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* add signature

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

---------

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>
---
 .gitignore                                  |   2 +
 helm/templates/_helpers.tpl                 |  17 +++
 helm/templates/deployment-cache-server.yaml |  52 +++++++
 helm/templates/deployment-vllm-multi.yaml   |   6 +
 helm/templates/service-cache-server.yaml    |  18 +++
 tutorials/06-remote-shared-kv-cache.md      | 139 ++++++++++++++++++
 .../assets/values-06-shared-storage.yaml    |  54 +++++++
 7 files changed, 288 insertions(+)
 create mode 100644 helm/templates/deployment-cache-server.yaml
 create mode 100644 helm/templates/service-cache-server.yaml
 create mode 100644 tutorials/06-remote-shared-kv-cache.md
 create mode 100644 tutorials/assets/values-06-shared-storage.yaml

diff --git a/.gitignore b/.gitignore
index c4d9a7a9..5b4aef42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,5 @@ helm/examples

 # version files
 src/vllm_router/_version.py
+
+/tutorials/assets/private.yaml
diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl
index 43ef2e59..3488ede3 100644
--- a/helm/templates/_helpers.tpl
+++ b/helm/templates/_helpers.tpl
@@ -128,6 +128,15 @@ limits:
 {{- end }}
 {{- end }}

+{{/*
+Define labels for cache server and its service
+*/}}
+{{- define "chart.cacheserverLabels" -}}
+{{- with .Values.cacheserverSpec.labels -}}
+{{ toYaml . }}
+{{- end }}
+{{- end }}
+
 {{/*
 Define helper function to convert labels to a comma separated list
 */}}
@@ -140,3 +149,11 @@ limits:
 {{- $result = "," -}}
 {{- end -}}
 {{- end -}}
+
+
+{{/*
+Define helper function to format remote cache url
+*/}}
+{{- define "cacheserver.formatRemoteUrl" -}}
+lm://{{ .service_name }}:{{ .port }}
+{{- end -}}
diff --git a/helm/templates/deployment-cache-server.yaml b/helm/templates/deployment-cache-server.yaml
new file mode 100644
index 00000000..8f9a778b
--- /dev/null
+++ b/helm/templates/deployment-cache-server.yaml
@@ -0,0 +1,52 @@
+{{- if .Values.cacheserverSpec -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: "{{ .Release.Name }}-deployment-cache-server"
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chart.cacheserverLabels" . | nindent 4 }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      {{- include "chart.cacheserverLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "chart.cacheserverLabels" . | nindent 8 }}
+    spec:
+      containers:
+      - name: "lmcache-server"
+        image: "{{ required "Required value 'cacheserverSpec.repository' must be defined !" .Values.cacheserverSpec.repository }}:{{ required "Required value 'cacheserverSpec.tag' must be defined !" .Values.cacheserverSpec.tag }}"
+        command:
+          - "lmcache_experimental_server"
+          - "0.0.0.0"
+          - "{{ .Values.cacheserverSpec.containerPort }}"
+        {{- if .Values.cacheserverSpec.resources }}
+        resources:
+          {{- if .Values.cacheserverSpec.resources.requests }}
+          requests:
+            cpu: "{{ .Values.cacheserverSpec.resources.requests.cpu }}"
+            memory: "{{ .Values.cacheserverSpec.resources.requests.memory }}"
+          {{- end }}
+          {{- if .Values.cacheserverSpec.resources.limits }}
+          limits:
+            cpu: "{{ .Values.cacheserverSpec.resources.limits.cpu }}"
+            memory: "{{ .Values.cacheserverSpec.resources.limits.memory }}"
+          {{- end }}
+        {{- end }}
+        ports:
+          - name: "caserver-cport"
+            containerPort: {{ .Values.cacheserverSpec.containerPort }}
+        imagePullPolicy: IfNotPresent
+
+        # TODO(Jiayi): add health check for lmcache server
+        # livenessProbe:
+        #   initialDelaySeconds: 30
+        #   periodSeconds: 5
+        #   failureThreshold: 3
+        #   httpGet:
+        #     path: /health
+        #     port: {{ .Values.cacheserverSpec.containerPort }}
+{{- end -}}
diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml
index 825bfe3e..4075efc2 100644
--- a/helm/templates/deployment-vllm-multi.yaml
+++ b/helm/templates/deployment-vllm-multi.yaml
@@ -119,6 +119,12 @@ spec:
           - name: LMCACHE_MAX_LOCAL_DISK_SIZE
             value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}"
           {{- end }}
+          {{- if .Values.cacheserverSpec }}
+          - name: LMCACHE_REMOTE_URL
+            value: "{{ include "cacheserver.formatRemoteUrl" (dict "service_name" (print .Release.Name "-cache-server-service") "port" .Values.cacheserverSpec.servicePort) }}"
+          - name: LMCACHE_REMOTE_SERDE
+            value: "{{ .Values.cacheserverSpec.serde }}"
+          {{- end }}
         {{- end }}
         {{- if .Values.servingEngineSpec.configs }}
         envFrom:
diff --git a/helm/templates/service-cache-server.yaml b/helm/templates/service-cache-server.yaml
new file mode 100644
index 00000000..fa09782c
--- /dev/null
+++ b/helm/templates/service-cache-server.yaml
@@ -0,0 +1,18 @@
+{{- if .Values.cacheserverSpec -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: "{{ .Release.Name }}-cache-server-service"
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chart.cacheserverLabels" . | nindent 4 }}
+spec:
+  type: ClusterIP
+  ports:
+    - name: "cacheserver-sport"
+      port: {{ .Values.cacheserverSpec.servicePort }}
+      targetPort: {{ .Values.cacheserverSpec.containerPort }}
+      protocol: TCP
+  selector:
+    {{- include "chart.cacheserverLabels" . | nindent 4 }}
+{{- end -}}
diff --git a/tutorials/06-remote-shared-kv-cache.md b/tutorials/06-remote-shared-kv-cache.md
new file mode 100644
index 00000000..cdcdcef9
--- /dev/null
+++ b/tutorials/06-remote-shared-kv-cache.md
@@ -0,0 +1,139 @@
+# Tutorial: Shared Remote KV Cache Storage with LMCache
+
+## Introduction
+
+This tutorial demonstrates how to enable remote KV cache storage using LMCache in a vLLM deployment. Remote KV cache sharing offloads large KV caches from GPU memory to shared remote storage, so that all serving engine replicas can reuse each other's caches; this increases the KV cache hit rate and can make the deployment more fault tolerant.
+vLLM Production Stack uses LMCache for remote KV cache sharing. For more details, see the [LMCache GitHub repository](https://github.com/LMCache/LMCache).
+
+## Table of Contents
+
+1. [Prerequisites](#prerequisites)
+2. [Step 1: Configuring KV Cache Shared Storage](#step-1-configuring-kv-cache-shared-storage)
+3. [Step 2: Deploying the Helm Chart](#step-2-deploying-the-helm-chart)
+4. [Step 3: Verifying the Installation](#step-3-verifying-the-installation)
+5. [Benchmark the Performance Gain of Remote Shared Storage (Work in Progress)](#benchmark-the-performance-gain-of-remote-shared-storage-work-in-progress)
+
+## Prerequisites
+
+- Completion of the following tutorials:
+  - [00-install-kubernetes-env.md](00-install-kubernetes-env.md)
+  - [01-minimal-helm-installation.md](01-minimal-helm-installation.md)
+  - [02-basic-vllm-config.md](02-basic-vllm-config.md)
+- A Kubernetes environment with GPU support.
+
+## Step 1: Configuring KV Cache Shared Storage
+
+Locate the file `tutorials/assets/values-06-shared-storage.yaml` with the following content:
+
+```yaml
+servingEngineSpec:
+  runtimeClassName: ""
+  modelSpec:
+  - name: "mistral"
+    repository: "lmcache/vllm-openai"
+    tag: "latest"
+    modelURL: "mistralai/Mistral-7B-Instruct-v0.2"
+    replicaCount: 2
+    requestCPU: 10
+    requestMemory: "40Gi"
+    requestGPU: 1
+    pvcStorage: "50Gi"
+    vllmConfig:
+      enableChunkedPrefill: false
+      enablePrefixCaching: false
+      maxModelLen: 16384
+
+    lmcacheConfig:
+      enabled: true
+      cpuOffloadingBufferSize: "20"
+
+    hf_token: <YOUR_HF_TOKEN>
+
+cacheserverSpec:
+  replicaCount: 1
+  containerPort: 8080
+  servicePort: 81
+  serde: "naive"
+
+  repository: "lmcache/vllm-openai"
+  tag: "latest"
+  resources:
+    requests:
+      cpu: "4"
+      memory: "8G"
+    limits:
+      cpu: "4"
+      memory: "10G"
+
+  labels:
+    environment: "cacheserver"
+    release: "cacheserver"
+
+```
+
+> **Note:** Replace `<YOUR_HF_TOKEN>` with your actual Hugging Face token.
+
+The `cacheserverSpec` section starts a remote shared KV cache server that every serving engine replica reads from and writes to.
+
+## Step 2: Deploying the Helm Chart
+
+Deploy the Helm chart using the customized values file:
+
+```bash
+sudo helm install vllm vllm/vllm-stack -f tutorials/assets/values-06-shared-storage.yaml
+```
+
+## Step 3: Verifying the Installation
+
+1. Check the pod logs to verify LMCache is active:
+
+   ```bash
+   sudo kubectl get pods
+   ```
+
+   Identify the pod name for the vLLM deployment (e.g., `vllm-mistral-deployment-vllm-xxxx-xxxx`). Then run:
+
+   ```bash
+   sudo kubectl logs -f <pod-name>
+   ```
+
+   Look for entries in the log indicating LMCache is enabled and operational. An example output (indicating KV cache is stored) is:
+
+   ```plaintext
+   INFO 01-21 20:16:58 lmcache_connector.py:41] Initializing LMCacheConfig under kv_transfer_config kv_connector='LMCacheConnector' kv_buffer_device='cuda' kv_buffer_size=1000000000.0 kv_role='kv_both' kv_rank=None kv_parallel_size=1 kv_ip='127.0.0.1' kv_port=14579
+   INFO LMCache: Creating LMCacheEngine instance vllm-instance [2025-01-21 20:16:58,732] -- /usr/local/lib/python3.12/dist-packages/lmcache/experimental/cache_engine.py:237
+   ```
+
+2. Forward the router service port to access the stack locally:
+
+   ```bash
+   sudo kubectl port-forward svc/vllm-router-service 30080:80
+   ```
+
+3. Send a request to the stack and observe the logs:
+
+   ```bash
+   curl -X POST http://localhost:30080/v1/completions \
+     -H "Content-Type: application/json" \
+     -d '{
+       "model": "mistralai/Mistral-7B-Instruct-v0.2",
+       "prompt": "Explain the significance of KV cache in language models.",
+       "max_tokens": 10
+     }'
+   ```
+
+   Expected output:
+
+   The response from the stack should contain the completion result, and the logs should show LMCache activity, for example:
+
+   ```plaintext
+   DEBUG LMCache: Store skips 0 tokens and then stores 13 tokens [2025-01-21 20:23:45,113] -- /usr/local/lib/python3.12/dist-packages/lmcache/integration/vllm/vllm_adapter.py:490
+   ```
+
+## Benchmark the Performance Gain of Remote Shared Storage (Work in Progress)
+
+In this section, we will benchmark the performance improvement from using LMCache as remote shared KV cache storage. Stay tuned for updates.
+
+## Conclusion
+
+This tutorial demonstrated how to enable shared KV cache storage across multiple vLLM serving engines using LMCache. By storing KV caches in remote shared storage, you can improve the KV cache hit rate and potentially make the deployment more fault tolerant. Explore further configurations to tailor LMCache to your workloads.
diff --git a/tutorials/assets/values-06-shared-storage.yaml b/tutorials/assets/values-06-shared-storage.yaml
new file mode 100644
index 00000000..31052b11
--- /dev/null
+++ b/tutorials/assets/values-06-shared-storage.yaml
@@ -0,0 +1,54 @@
+servingEngineSpec:
+  runtimeClassName: ""
+  modelSpec:
+  - name: "mistral"
+    repository: "lmcache/vllm-openai"
+    tag: "latest"
+    modelURL: "mistralai/Mistral-7B-Instruct-v0.2"
+    replicaCount: 2
+    requestCPU: 10
+    requestMemory: "40Gi"
+    requestGPU: 1
+    pvcStorage: "50Gi"
+    vllmConfig:
+      enableChunkedPrefill: false
+      enablePrefixCaching: false
+      maxModelLen: 16384
+
+    lmcacheConfig:
+      enabled: true
+      cpuOffloadingBufferSize: "20"
+
+    hf_token: <YOUR_HF_TOKEN>
+
+cacheserverSpec:
+  # -- Number of replicas
+  replicaCount: 1
+
+  # -- Container port
+  containerPort: 8080
+
+  # -- Service port
+  servicePort: 81
+
+  # -- Serializer/Deserializer type
+  serde: "naive"
+
+  # -- Cache server image (reusing the vllm image)
+  repository: "lmcache/vllm-openai"
+  tag: "latest"
+
+  # TODO (Jiayi): please adjust this once we have evictor
+  # -- Cache server resource requests and limits
+  resources:
+    requests:
+      cpu: "4"
+      memory: "8G"
+    limits:
+      cpu: "4"
+      memory: "10G"
+
+  # -- Customized labels for the cache server deployment
+  labels:
+    environment: "cacheserver"
+    release: "cacheserver"
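
As a quick sanity check of the wiring above, you can render the chart offline and inspect the remote-cache environment variables that the `cacheserver.formatRemoteUrl` helper injects into the serving engine pods. This is a sketch that assumes the `vllm` Helm repo is already added and the release is named `vllm` as in Step 2 of the tutorial:

```bash
# Render the templates locally (no cluster required) and show the env vars
# the vLLM containers will receive. With release name "vllm" and
# servicePort 81, the helper should emit lm://vllm-cache-server-service:81.
helm template vllm vllm/vllm-stack \
  -f tutorials/assets/values-06-shared-storage.yaml \
  | grep -A 1 -E "LMCACHE_REMOTE_(URL|SERDE)"
```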
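
Because the cache server does not yet expose a health endpoint (hence the commented-out `livenessProbe` TODO in `deployment-cache-server.yaml`), a rough post-deploy check, again assuming release name `vllm`, is to confirm the service has endpoints and that the server process is logging:

```bash
# An empty ENDPOINTS column usually means cacheserverSpec.labels did not
# match the cache server pod, since the same labels drive the Service selector.
sudo kubectl get endpoints vllm-cache-server-service

# Tail the cache server logs; the deployment name follows the
# "<release>-deployment-cache-server" pattern from the template.
sudo kubectl logs -f deployment/vllm-deployment-cache-server
```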