From 09d5c10943aa369ee3f0f36b7e3761fc7fc2b7f7 Mon Sep 17 00:00:00 2001
From: Jiayi Yao <82156730+YaoJiayi@users.noreply.github.com>
Date: Sat, 1 Mar 2025 01:15:01 -0600
Subject: [PATCH] [Feat] Add remote shared storage with LMCache (#188)

* add remote server

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* add yaml

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* add tutorial md

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* fix replica count

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* Update 06-remote-shared-kv-cache.md

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

* add signature

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>

---------

Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>
---
 .gitignore                                  |   2 +
 helm/templates/_helpers.tpl                 |  17 +++
 helm/templates/deployment-cache-server.yaml |  52 +++++++
 helm/templates/deployment-vllm-multi.yaml   |   6 +
 helm/templates/service-cache-server.yaml    |  18 +++
 tutorials/06-remote-shared-kv-cache.md      | 139 ++++++++++++++++++
 .../assets/values-06-shared-storage.yaml    |  54 +++++++
 7 files changed, 288 insertions(+)
 create mode 100644 helm/templates/deployment-cache-server.yaml
 create mode 100644 helm/templates/service-cache-server.yaml
 create mode 100644 tutorials/06-remote-shared-kv-cache.md
 create mode 100644 tutorials/assets/values-06-shared-storage.yaml

diff --git a/.gitignore b/.gitignore
index c4d9a7a9..5b4aef42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,5 @@ helm/examples

 # version files
 src/vllm_router/_version.py
+
+/tutorials/assets/private.yaml
diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl
index 43ef2e59..3488ede3 100644
--- a/helm/templates/_helpers.tpl
+++ b/helm/templates/_helpers.tpl
@@ -128,6 +128,15 @@ limits:
 {{- end }}
 {{- end }}

+{{/*
+Define labels for cache server and its service
+*/}}
+{{- define "chart.cacheserverLabels" -}}
+{{- with .Values.cacheserverSpec.labels -}}
+{{ toYaml . }}
+{{- end }}
+{{- end }}
+
 {{/*
 Define helper function to convert labels to a comma separated list
 */}}
@@ -140,3 +149,11 @@ limits:
 {{- $result = "," -}}
 {{- end -}}
 {{- end -}}
+
+
+{{/*
+Define helper function to format remote cache url
+*/}}
+{{- define "cacheserver.formatRemoteUrl" -}}
+lm://{{ .service_name }}:{{ .port }}
+{{- end -}}
diff --git a/helm/templates/deployment-cache-server.yaml b/helm/templates/deployment-cache-server.yaml
new file mode 100644
index 00000000..8f9a778b
--- /dev/null
+++ b/helm/templates/deployment-cache-server.yaml
@@ -0,0 +1,52 @@
+{{- if .Values.cacheserverSpec -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: "{{ .Release.Name }}-deployment-cache-server"
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chart.cacheserverLabels" . | nindent 4 }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      {{- include "chart.cacheserverLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "chart.cacheserverLabels" . | nindent 8 }}
+    spec:
+      containers:
+      - name: "lmcache-server"
+        image: "{{ required "Required value 'cacheserverSpec.repository' must be defined !" .Values.cacheserverSpec.repository }}:{{ required "Required value 'cacheserverSpec.tag' must be defined !" .Values.cacheserverSpec.tag }}"
+        command:
+          - "lmcache_experimental_server"
+          - "0.0.0.0"
+          - "{{ .Values.cacheserverSpec.containerPort }}"
+        {{- if .Values.cacheserverSpec.resources }}
+        resources:
+          {{- if .Values.cacheserverSpec.resources.requests }}
+          requests:
+            cpu: "{{ .Values.cacheserverSpec.resources.requests.cpu }}"
+            memory: "{{ .Values.cacheserverSpec.resources.requests.memory }}"
+          {{- end }}
+          {{- if .Values.cacheserverSpec.resources.limits }}
+          limits:
+            cpu: "{{ .Values.cacheserverSpec.resources.limits.cpu }}"
+            memory: "{{ .Values.cacheserverSpec.resources.limits.memory }}"
+          {{- end }}
+        {{- end }}
+        ports:
+          - name: "caserver-cport"
+            containerPort: {{ .Values.cacheserverSpec.containerPort }}
+        imagePullPolicy: IfNotPresent
+
+        # TODO(Jiayi): add health check for lmcache server
+        # livenessProbe:
+        #   initialDelaySeconds: 30
+        #   periodSeconds: 5
+        #   failureThreshold: 3
+        #   httpGet:
+        #     path: /health
+        #     port: {{ .Values.cacheserverSpec.containerPort }}
+{{- end -}}
diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml
index 825bfe3e..4075efc2 100644
--- a/helm/templates/deployment-vllm-multi.yaml
+++ b/helm/templates/deployment-vllm-multi.yaml
@@ -119,6 +119,12 @@ spec:
           - name: LMCACHE_MAX_LOCAL_DISK_SIZE
             value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}"
           {{- end }}
+          {{- if .Values.cacheserverSpec }}
+          - name: LMCACHE_REMOTE_URL
+            value: "{{ include "cacheserver.formatRemoteUrl" (dict "service_name" (print .Release.Name "-cache-server-service") "port" .Values.cacheserverSpec.servicePort) }}"
+          - name: LMCACHE_REMOTE_SERDE
+            value: "{{ .Values.cacheserverSpec.serde }}"
+          {{- end }}
         {{- end }}
         {{- if .Values.servingEngineSpec.configs }}
         envFrom:
diff --git a/helm/templates/service-cache-server.yaml b/helm/templates/service-cache-server.yaml
new file mode 100644
index 00000000..fa09782c
--- /dev/null
+++ b/helm/templates/service-cache-server.yaml
@@ -0,0 +1,18 @@
+{{- if .Values.cacheserverSpec -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: "{{ .Release.Name }}-cache-server-service"
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "chart.cacheserverLabels" . | nindent 4 }}
+spec:
+  type: ClusterIP
+  ports:
+    - name: "cacheserver-sport"
+      port: {{ .Values.cacheserverSpec.servicePort }}
+      targetPort: {{ .Values.cacheserverSpec.containerPort }}
+      protocol: TCP
+  selector:
+    {{- include "chart.cacheserverLabels" . | nindent 4 }}
+{{- end -}}
diff --git a/tutorials/06-remote-shared-kv-cache.md b/tutorials/06-remote-shared-kv-cache.md
new file mode 100644
index 00000000..cdcdcef9
--- /dev/null
+++ b/tutorials/06-remote-shared-kv-cache.md
@@ -0,0 +1,139 @@
+# Tutorial: Shared Remote KV Cache Storage with LMCache
+
+## Introduction
+
+This tutorial demonstrates how to enable remote KV cache storage using LMCache in a vLLM deployment. Remote KV cache sharing offloads large KV caches from GPU memory to shared remote storage, so that all serving engine replicas can reuse each other's caches; this increases the KV cache hit rate and can make the deployment more fault tolerant.
+vLLM Production Stack uses LMCache for remote KV cache sharing. For more details, see the [LMCache GitHub repository](https://github.com/LMCache/LMCache).
+
+## Table of Contents
+
+1. [Prerequisites](#prerequisites)
+2. [Step 1: Configuring KV Cache Shared Storage](#step-1-configuring-kv-cache-shared-storage)
+3. [Step 2: Deploying the Helm Chart](#step-2-deploying-the-helm-chart)
+4. [Step 3: Verifying the Installation](#step-3-verifying-the-installation)
+5. [Benchmark the Performance Gain of Remote Shared Storage (Work in Progress)](#benchmark-the-performance-gain-of-remote-shared-storage-work-in-progress)
+
+## Prerequisites
+
+- Completion of the following tutorials:
+  - [00-install-kubernetes-env.md](00-install-kubernetes-env.md)
+  - [01-minimal-helm-installation.md](01-minimal-helm-installation.md)
+  - [02-basic-vllm-config.md](02-basic-vllm-config.md)
+- A Kubernetes environment with GPU support.
+
+## Step 1: Configuring KV Cache Shared Storage
+
+Locate the file `tutorials/assets/values-06-shared-storage.yaml` with the following content:
+
+```yaml
+servingEngineSpec:
+  runtimeClassName: ""
+  modelSpec:
+  - name: "mistral"
+    repository: "lmcache/vllm-openai"
+    tag: "latest"
+    modelURL: "mistralai/Mistral-7B-Instruct-v0.2"
+    replicaCount: 2
+    requestCPU: 10
+    requestMemory: "40Gi"
+    requestGPU: 1
+    pvcStorage: "50Gi"
+    vllmConfig:
+      enableChunkedPrefill: false
+      enablePrefixCaching: false
+      maxModelLen: 16384
+
+    lmcacheConfig:
+      enabled: true
+      cpuOffloadingBufferSize: "20"
+
+    hf_token: <YOUR_HF_TOKEN>
+
+cacheserverSpec:
+  replicaCount: 1
+  containerPort: 8080
+  servicePort: 81
+  serde: "naive"
+
+  repository: "lmcache/vllm-openai"
+  tag: "latest"
+  resources:
+    requests:
+      cpu: "4"
+      memory: "8G"
+    limits:
+      cpu: "4"
+      memory: "10G"
+
+  labels:
+    environment: "cacheserver"
+    release: "cacheserver"
+
+```
+
+> **Note:** Replace `<YOUR_HF_TOKEN>` with your actual Hugging Face token.
+
+The `cacheserverSpec` section starts a remote shared KV cache server that every serving engine replica reads from and writes to.
+
+## Step 2: Deploying the Helm Chart
+
+Deploy the Helm chart using the customized values file:
+
+```bash
+sudo helm install vllm vllm/vllm-stack -f tutorials/assets/values-06-shared-storage.yaml
+```
+
+## Step 3: Verifying the Installation
+
+1. Check the pod logs to verify LMCache is active:
+
+   ```bash
+   sudo kubectl get pods
+   ```
+
+   Identify the pod name for the vLLM deployment (e.g., `vllm-mistral-deployment-vllm-xxxx-xxxx`). Then run:
+
+   ```bash
+   sudo kubectl logs -f <pod-name>
+   ```
+
+   Look for entries in the log indicating LMCache is enabled and operational. An example output (indicating KV cache is stored) is:
+
+   ```plaintext
+   INFO 01-21 20:16:58 lmcache_connector.py:41] Initializing LMCacheConfig under kv_transfer_config kv_connector='LMCacheConnector' kv_buffer_device='cuda' kv_buffer_size=1000000000.0 kv_role='kv_both' kv_rank=None kv_parallel_size=1 kv_ip='127.0.0.1' kv_port=14579
+   INFO LMCache: Creating LMCacheEngine instance vllm-instance [2025-01-21 20:16:58,732] -- /usr/local/lib/python3.12/dist-packages/lmcache/experimental/cache_engine.py:237
+   ```
+
+2. Forward the router service port to access the stack locally:
+
+   ```bash
+   sudo kubectl port-forward svc/vllm-router-service 30080:80
+   ```
+
+3. Send a request to the stack and observe the logs:
+
+   ```bash
+   curl -X POST http://localhost:30080/v1/completions \
+     -H "Content-Type: application/json" \
+     -d '{
+       "model": "mistralai/Mistral-7B-Instruct-v0.2",
+       "prompt": "Explain the significance of KV cache in language models.",
+       "max_tokens": 10
+     }'
+   ```
+
+   Expected output:
+
+   The response from the stack should contain the completion result, and the logs should show LMCache activity, for example:
+
+   ```plaintext
+   DEBUG LMCache: Store skips 0 tokens and then stores 13 tokens [2025-01-21 20:23:45,113] -- /usr/local/lib/python3.12/dist-packages/lmcache/integration/vllm/vllm_adapter.py:490
+   ```
+
+## Benchmark the Performance Gain of Remote Shared Storage (Work in Progress)
+
+In this section, we will benchmark the performance improvement from using LMCache as remote shared KV cache storage. Stay tuned for updates.
+
+## Conclusion
+
+This tutorial demonstrated how to enable shared KV cache storage across multiple vLLM serving engines using LMCache. By storing KV caches in remote shared storage, you can improve the KV cache hit rate and potentially make the deployment more fault tolerant. Explore further configurations to tailor LMCache to your workloads.
diff --git a/tutorials/assets/values-06-shared-storage.yaml b/tutorials/assets/values-06-shared-storage.yaml
new file mode 100644
index 00000000..31052b11
--- /dev/null
+++ b/tutorials/assets/values-06-shared-storage.yaml
@@ -0,0 +1,54 @@
+servingEngineSpec:
+  runtimeClassName: ""
+  modelSpec:
+  - name: "mistral"
+    repository: "lmcache/vllm-openai"
+    tag: "latest"
+    modelURL: "mistralai/Mistral-7B-Instruct-v0.2"
+    replicaCount: 2
+    requestCPU: 10
+    requestMemory: "40Gi"
+    requestGPU: 1
+    pvcStorage: "50Gi"
+    vllmConfig:
+      enableChunkedPrefill: false
+      enablePrefixCaching: false
+      maxModelLen: 16384
+
+    lmcacheConfig:
+      enabled: true
+      cpuOffloadingBufferSize: "20"
+
+    hf_token: <YOUR_HF_TOKEN>
+
+cacheserverSpec:
+  # -- Number of replicas
+  replicaCount: 1
+
+  # -- Container port
+  containerPort: 8080
+
+  # -- Service port
+  servicePort: 81
+
+  # -- Serializer/Deserializer type
+  serde: "naive"
+
+  # -- Cache server image (reusing the vllm image)
+  repository: "lmcache/vllm-openai"
+  tag: "latest"
+
+  # TODO (Jiayi): please adjust this once we have evictor
+  # -- Cache server resource requests and limits
+  resources:
+    requests:
+      cpu: "4"
+      memory: "8G"
+    limits:
+      cpu: "4"
+      memory: "10G"
+
+  # -- Customized labels for the cache server deployment
+  labels:
+    environment: "cacheserver"
+    release: "cacheserver"
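
As a quick sanity check of the wiring above, you can render the chart offline and inspect the remote-cache environment variables that the `cacheserver.formatRemoteUrl` helper injects into the serving engine pods. This is a sketch that assumes the `vllm` Helm repo is already added and the release is named `vllm` as in Step 2 of the tutorial:

```bash
# Render the templates locally (no cluster required) and show the env vars
# the vLLM containers will receive. With release name "vllm" and
# servicePort 81, the helper should emit lm://vllm-cache-server-service:81.
helm template vllm vllm/vllm-stack \
  -f tutorials/assets/values-06-shared-storage.yaml \
  | grep -A 1 -E "LMCACHE_REMOTE_(URL|SERDE)"
```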
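
Because the cache server does not yet expose a health endpoint (hence the commented-out `livenessProbe` TODO in `deployment-cache-server.yaml`), a rough post-deploy check, again assuming release name `vllm`, is to confirm the service has endpoints and that the server process is logging:

```bash
# An empty ENDPOINTS column usually means cacheserverSpec.labels did not
# match the cache server pod, since the same labels drive the Service selector.
sudo kubectl get endpoints vllm-cache-server-service

# Tail the cache server logs; the deployment name follows the
# "<release>-deployment-cache-server" pattern from the template.
sudo kubectl logs -f deployment/vllm-deployment-cache-server
```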