# [Doc] Adding benchmark tutorial (#197)
Adding benchmark tutorial. Signed-off-by: YuhanLiu11 <[email protected]>
1 parent b2e84ee, commit 3b860ce. Showing 8 changed files with 571 additions and 40 deletions.
One hunk updates an existing multi-round QA sweep script:

```diff
@@ -1,46 +1,42 @@
 #!/bin/bash

-# Ensure correct number of arguments
-if [[ $# -ne 3 ]]; then
-    echo "Usage: $0 <model> <base url> <output file key>"
+if [[ $# -ne 4 ]]; then
+    echo "Usage: $0 <model> <base url> <save file key>"
     exit 1
 fi

 MODEL=$1
 BASE_URL=$2

 # CONFIGURATION
-NUM_USERS=15
-NUM_ROUNDS=20
+NUM_USERS=320
+NUM_ROUNDS=10

-SYSTEM_PROMPT=1000 # Shared system prompt length
-CHAT_HISTORY=20000 # User-specific chat history length
-ANSWER_LEN=100 # Generation length per round
+SYSTEM_PROMPT=1000 # Shared system prompt length
+CHAT_HISTORY=20000 # User specific chat history length
+ANSWER_LEN=100 # Generation length per round

-# Function to run the benchmark
 run_benchmark() {
-    local qps=$1
-    local output_file=$2
-
+    # $1: qps
+    # $2: output file
     python3 ./multi-round-qa.py \
-        --num-users "$NUM_USERS" \
-        --num-rounds "$NUM_ROUNDS" \
-        --qps "$qps" \
+        --num-users $NUM_USERS \
+        --num-rounds $NUM_ROUNDS \
+        --qps "$1" \
         --shared-system-prompt "$SYSTEM_PROMPT" \
         --user-history-prompt "$CHAT_HISTORY" \
-        --answer-len "$ANSWER_LEN" \
+        --answer-len $ANSWER_LEN \
         --model "$MODEL" \
         --base-url "$BASE_URL" \
-        --output "$output_file" \
+        --output "$2" \
         --log-interval 30 \
         --time 100
 }

-
-key=$3
+KEY=$3

 # Run benchmarks for different QPS values
 for qps in 0.1 0.3 0.5 0.7 0.9 1.1; do
-    output_file="${key}_output_${qps}.csv"
+    output_file="${KEY}_output_${qps}.csv"
     run_benchmark "$qps" "$output_file"
 done
```
Another hunk adds a new copy of the sweep script:

```bash
#!/bin/bash

# Expect exactly three arguments: model, base URL, and output-file key
if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <model> <base url> <save file key>"
    exit 1
fi

MODEL=$1
BASE_URL=$2

# CONFIGURATION
NUM_USERS=15
NUM_ROUNDS=20

SYSTEM_PROMPT=1000   # Shared system prompt length
CHAT_HISTORY=20000   # User-specific chat history length
ANSWER_LEN=100       # Generation length per round

run_benchmark() {
    # $1: qps
    # $2: output file
    python3 ./multi-round-qa.py \
        --num-users $NUM_USERS \
        --num-rounds $NUM_ROUNDS \
        --qps "$1" \
        --shared-system-prompt "$SYSTEM_PROMPT" \
        --user-history-prompt "$CHAT_HISTORY" \
        --answer-len $ANSWER_LEN \
        --model "$MODEL" \
        --base-url "$BASE_URL" \
        --output "$2" \
        --log-interval 30 \
        --time 100
}

KEY=$3

# Run benchmarks for different QPS values
for qps in 0.1 0.3 0.5 0.7 0.9 1.1; do
    output_file="${KEY}_output_${qps}.csv"
    run_benchmark "$qps" "$output_file"
done
```
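For reference, a typical invocation of this script looks as follows; the endpoint assumes a router port-forwarded to `localhost:30080`, as in the tutorial reproduced below:

```bash
# Sweep six request rates against the router; "stack" is the output-file key,
# so results land in stack_output_<qps>.csv
bash run.sh meta-llama/Llama-3.1-8B-Instruct http://localhost:30080/v1/ stack
```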
A third hunk updates an existing warmup script:

```diff
@@ -1,38 +1,29 @@
 #!/bin/bash

-# Ensure correct number of arguments
-if [[ $# -ne 2 ]]; then
-    echo "Usage: $0 <model> <base url>"
-    exit 1
-fi
-
 MODEL=$1
 BASE_URL=$2

 # CONFIGURATION
-SYSTEM_PROMPT=1000 # Shared system prompt length
-CHAT_HISTORY=20000 # User-specific chat history length
-ANSWER_LEN=100 # Generation length per round
+NUM_USERS_WARMUP=400

-# Function to warm up the vLLM
-warmup() {
-    # Calculate warmup time
-    local warmup_time=$((NUM_USERS / 2 + 2))
+SYSTEM_PROMPT=1000 # Shared system prompt length
+CHAT_HISTORY=20000 # User specific chat history length
+ANSWER_LEN=100 # Generation length per round
+
+warmup() {
+    # Warm up the vLLM with a lot of user queries
     python3 ./multi-round-qa.py \
         --num-users 1 \
         --num-rounds 2 \
         --qps 2 \
-        --shared-system-prompt "$SYSTEM_PROMPT" \
-        --user-history-prompt "$CHAT_HISTORY" \
-        --answer-len "$ANSWER_LEN" \
+        --shared-system-prompt $SYSTEM_PROMPT \
+        --user-history-prompt $CHAT_HISTORY \
+        --answer-len $ANSWER_LEN \
         --model "$MODEL" \
         --base-url "$BASE_URL" \
         --output /tmp/warmup.csv \
         --log-interval 30 \
-        --time "$warmup_time"
+        --time $((NUM_USERS_WARMUP / 2))
 }

-# Run the warmup function
 warmup
```
And a fourth adds a new warmup script:

```bash
#!/bin/bash

MODEL=$1
BASE_URL=$2

# CONFIGURATION
NUM_USERS_WARMUP=20

SYSTEM_PROMPT=1000   # Shared system prompt length
CHAT_HISTORY=20000   # User-specific chat history length
ANSWER_LEN=100       # Generation length per round

warmup() {
    # Warm up the vLLM with a lot of user queries
    python3 ./multi-round-qa.py \
        --num-users 1 \
        --num-rounds 2 \
        --qps 2 \
        --shared-system-prompt $SYSTEM_PROMPT \
        --user-history-prompt $CHAT_HISTORY \
        --answer-len $ANSWER_LEN \
        --model "$MODEL" \
        --base-url "$BASE_URL" \
        --output /tmp/warmup.csv \
        --log-interval 30 \
        --time $((NUM_USERS_WARMUP / 2))
}

warmup
```
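A typical warmup invocation follows the same pattern; with `NUM_USERS_WARMUP=20`, the `--time $((NUM_USERS_WARMUP / 2))` expression caps the warmup run at 10 seconds:

```bash
# Populate the serving engines' caches before taking measurements
bash warmup.sh meta-llama/Llama-3.1-8B-Instruct http://localhost:30080/v1/
```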
Finally, the commit adds the tutorial itself as a new 159-line markdown file, reproduced below.
# Tutorial: Multi-Round QA Benchmark (Multi-GPU)

## Introduction

This tutorial provides a step-by-step guide to setting up and running benchmarks that compare the vLLM Production Stack, naive Kubernetes, and AIBrix on a multi-round QA workload, using 8 A100 GPUs (`gpu_8x_a100_80gb_sxm4`) from Lambda Labs.
## Table of Contents

1. [Prerequisites](#prerequisites)
2. [Step 1: Running Benchmarks with vLLM Production Stack](#step-1-running-benchmarks-with-vllm-production-stack)
3. [Step 2: Running Benchmarks with Naive Kubernetes](#step-2-running-benchmarks-with-naive-kubernetes)
4. [Step 3: Running Benchmarks with AIBrix](#step-3-running-benchmarks-with-aibrix)
## Prerequisites

- Completion of the following tutorials:
  - [00-install-kubernetes-env.md](00-install-kubernetes-env.md)
  - [01-minimal-helm-installation.md](01-minimal-helm-installation.md)
- In `benchmarks/multi-round-qa/`, install the Python packages needed by the multi-round QA benchmark script with `pip install -r requirements.txt` (see the snippet below).
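A minimal setup sequence for that last step might look like this (the checkout path is illustrative):

```bash
# Install the benchmark's Python dependencies; the directory assumes
# a local clone of the production-stack repository
cd production-stack/benchmarks/multi-round-qa
pip install -r requirements.txt
```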
## Step 1: Running Benchmarks with vLLM Production Stack

First, start a vLLM Production Stack server.

To begin with, create a `stack.yaml` configuration file:
```yaml
servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
  - name: "llama3"
    repository: "lmcache/vllm-openai"
    tag: "latest"
    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
    replicaCount: 8
    requestCPU: 10
    requestMemory: "150Gi"
    requestGPU: 1
    pvcStorage: "50Gi"
    pvcAccessMode:
    - ReadWriteOnce
    vllmConfig:
      enableChunkedPrefill: false
      enablePrefixCaching: false
      maxModelLen: 32000
      dtype: "bfloat16"
      extraArgs: ["--disable-log-requests", "--swap-space", 0]
    lmcacheConfig:
      enabled: true
      cpuOffloadingBufferSize: "120"
    hf_token: <YOUR_HUGGINGFACE_TOKEN>
```
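As a rough sanity check, this `maxModelLen` leaves headroom for the benchmark's inputs: a shared system prompt of about 1,000 tokens plus a per-user chat history of about 20,000 tokens and roughly 100 generated tokens per round comes to around 23,000 tokens after 20 rounds (token counts are approximate), comfortably under 32,000.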
Deploy the vLLM Production Stack server:

```bash
sudo helm repo add vllm https://vllm-project.github.io/production-stack
sudo helm install vllm vllm/vllm-stack -f stack.yaml
```
Then verify that the pods are ready:

```bash
kubectl get pods
```
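If you prefer to block until everything is up rather than polling, `kubectl wait` works too (the timeout here is an arbitrary choice; first startup includes downloading model weights):

```bash
# Block until every pod in the default namespace reports Ready
kubectl wait --for=condition=Ready pods --all --timeout=1200s
```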
Once the pods are ready, run the port forwarding:

```bash
sudo kubectl port-forward svc/vllm-router-service 30080:80
```
Finally, run the benchmarking code:

```bash
bash warmup.sh meta-llama/Llama-3.1-8B-Instruct http://localhost:30080/v1/
bash run.sh meta-llama/Llama-3.1-8B-Instruct http://localhost:30080/v1/ stack
```
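The sweep writes one CSV per request rate (`stack_output_0.1.csv` through `stack_output_1.1.csv`). To peek at the recorded metrics (the exact column set is defined by `multi-round-qa.py`):

```bash
# Pretty-print the header and first rows of one result file
head -n 3 stack_output_0.1.csv | column -s, -t
```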
## Step 2: Running Benchmarks with Naive Kubernetes

First, start a naive Kubernetes server.

To begin with, create a `naive.yaml` configuration file:
```yaml
servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
  - name: "llama3"
    repository: "lmcache/vllm-openai"
    tag: "latest"
    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
    replicaCount: 8
    requestCPU: 10
    requestMemory: "150Gi"
    requestGPU: 1
    pvcStorage: "50Gi"
    pvcAccessMode:
    - ReadWriteOnce
    vllmConfig:
      enableChunkedPrefill: false
      enablePrefixCaching: true
      maxModelLen: 32000
      dtype: "bfloat16"
      extraArgs: ["--disable-log-requests", "--swap-space", 0]
    lmcacheConfig:
      enabled: false
    hf_token: <YOUR_HUGGINGFACE_TOKEN>
```
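The intent is that `naive.yaml` differs from `stack.yaml` only in the caching settings, which a quick diff confirms:

```bash
diff stack.yaml naive.yaml
# Expected differences:
#   enablePrefixCaching: false -> true   (use vLLM's built-in prefix caching)
#   lmcacheConfig: enabled true -> false, with no cpuOffloadingBufferSize
```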
Deploy the naive Kubernetes server:

```bash
sudo helm repo add vllm https://vllm-project.github.io/production-stack
sudo helm install vllm vllm/vllm-stack -f naive.yaml
```
Then verify that the pods are ready:

```bash
kubectl get pods
```
Once the pods are ready, run the port forwarding:

```bash
sudo kubectl port-forward svc/vllm-router-service 30080:80
```
Finally, run the benchmarking code:

```bash
bash warmup.sh meta-llama/Llama-3.1-8B-Instruct http://localhost:30080/v1/
bash run.sh meta-llama/Llama-3.1-8B-Instruct http://localhost:30080/v1/ native
```
## Step 3: Running Benchmarks with AIBrix

We followed the installation steps documented in [AIBrix's official repo](https://aibrix.readthedocs.io/latest/getting_started/installation/lambda.html) to install the packages needed to run AIBrix on the Lambda server.
To align with the configurations used to benchmark the vLLM Production Stack and naive K8s, we changed the settings documented in [AIBrix's official repo](https://aibrix.readthedocs.io/latest/features/distributed-kv-cache.html) to enable AIBrix's KV-cache CPU offloading.
Specifically, in their [deployment configuration yaml file](https://aibrix.readthedocs.io/latest/features/distributed-kv-cache.html) we changed the model name at lines #4, #6, #17, #21, #38, #81, #86, and #99 from `deepseek-coder-7b-instruct` to `llama3-1-8b`; line #36 from `deepseek-ai/deepseek-coder-6.7b-instruct` to `meta-llama/Llama-3.1-8B-Instruct`; line #57 from `/var/run/vineyard-kubernetes/default/deepseek-coder-7b-kvcache` to `/var/run/vineyard-kubernetes/default/llama3-1-8b-kvcache`; and line #73 from `deepseek-coder-7b-kvcache-rpc:9600` to `llama3-1-8b-kvcache-rpc:9600`.
We also changed the CPU offload memory limit at line #47 from `10` to `120` to match the configuration used in [Step 1](#step-1-running-benchmarks-with-vllm-production-stack).
Finally, we changed the replica number at line #9 from `1` to `8`.
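If you want to script those renames rather than edit by hand, a `sed` sketch along these lines can help; the file name `deployment.yaml` is hypothetical (use whatever you saved AIBrix's manifest as), and the listed line numbers should still be verified by eye:

```bash
# Hypothetical helper for the renames described above; review the result,
# since it rewrites every occurrence rather than specific lines
sed -i \
    -e 's/deepseek-coder-7b-instruct/llama3-1-8b/g' \
    -e 's#deepseek-ai/deepseek-coder-6.7b-instruct#meta-llama/Llama-3.1-8B-Instruct#g' \
    -e 's/deepseek-coder-7b-kvcache/llama3-1-8b-kvcache/g' \
    deployment.yaml
```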
We made matching changes in AIBrix's KV cache server config: at line #4 from `deepseek-coder-7b-kvcache` to `llama3-1-8b-kvcache`; at line #7 from `deepseek-coder-7b-instruct` to `llama3-1-8b`; and, to align the CPU memory limit with the configuration used in [Step 1](#step-1-running-benchmarks-with-vllm-production-stack), at line #17 from `4Gi` to `150Gi`.
Finally, we followed the steps in [AIBrix's official repo](https://aibrix.readthedocs.io/latest/getting_started/installation/lambda.html) to start the AIBrix server and then ran the benchmarking code:

```bash
bash warmup.sh llama3 http://localhost:8888/v1/
bash run.sh llama3 http://localhost:8888/v1/ aibrix
```
## Conclusion

This tutorial provides a comprehensive guide to setting up and benchmarking the vLLM Production Stack, naive Kubernetes, and AIBrix. By following these steps, you can effectively evaluate their performance in your environment.