Skip to content

Commit

Permalink
Merge pull request #137 from Shaoting-Feng/test-router
Browse files Browse the repository at this point in the history
[CI/Build] Fix static router in github actions
  • Loading branch information
YuhanLiu11 authored Feb 18, 2025
2 parents 9c350ff + 116680e commit a01852b
Show file tree
Hide file tree
Showing 12 changed files with 86 additions and 23 deletions.
4 changes: 2 additions & 2 deletions .github/curl-01-minimal-example.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
[ ! -d "output-01-minimal-example" ] && mkdir output-01-minimal-example
chmod -R 777 output-01-minimal-example
# shellcheck disable=SC2034 # result_model appears unused. Verify it or export it.
result_model=$(curl -s http://"$1":"$2"/models | tee output-01-minimal-example/models-01-minimal-example.json)
result_model=$(curl -s http://"$1":"$2"/v1/models | tee output-01-minimal-example/models-01-minimal-example.json)
# shellcheck disable=SC2034 # result_query appears unused. Verify it or export it.
result_query=$(curl -X POST http://"$1":"$2"/completions -H "Content-Type: application/json" -d '{"model": "facebook/opt-125m", "prompt": "Once upon a time,", "max_tokens": 10}' | tee output-01-minimal-example/query-01-minimal-example.json)
result_query=$(curl -X POST http://"$1":"$2"/v1/completions -H "Content-Type: application/json" -d '{"model": "facebook/opt-125m", "prompt": "Once upon a time,", "max_tokens": 10}' | tee output-01-minimal-example/query-01-minimal-example.json)
4 changes: 2 additions & 2 deletions .github/curl-02-two-pods.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
[ ! -d "output-02-two-pods" ] && mkdir output-02-two-pods
chmod -R 777 output-02-two-pods
# shellcheck disable=SC2034 # result_model appears unused. Verify it or export it.
result_model=$(curl -s http://"$1":"$2"/models | tee output-02-two-pods/output-02-two-pods.json)
result_model=$(curl -s http://"$1":"$2"/v1/models | tee output-02-two-pods/models-02-two-pods.json)
# shellcheck disable=SC2034 # result_query appears unused. Verify it or export it.
result_query=$(curl -X POST http://"$1":"$2"/completions -H "Content-Type: application/json" -d '{"model": "facebook/opt-125m", "prompt": "Once upon a time,", "max_tokens": 10}' | tee output-02-two-pods/output-02-two-pods.json)
result_query=$(curl -X POST http://"$1":"$2"/v1/completions -H "Content-Type: application/json" -d '{"model": "facebook/opt-125m", "prompt": "Once upon a time,", "max_tokens": 10}' | tee output-02-two-pods/query-02-two-pods.json)
4 changes: 2 additions & 2 deletions .github/curl-04-multiple-models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
[ ! -d "output-04-multiple-models" ] && mkdir output-04-multiple-models
chmod -R 777 output-04-multiple-models
# shellcheck disable=SC2034 # result_model appears unused. Verify it or export it.
result_model=$(curl -s "http://$1:$2/models" | tee output-04-multiple-models/models-04-multiple-models.json)
result_model=$(curl -s http://"$1":"$2"/v1/models | tee output-04-multiple-models/models-04-multiple-models.json)

# shellcheck disable=SC1091 # Not following: /usr/local/bin/conda-init was not specified as input
source /usr/local/bin/conda-init
conda activate llmstack

# shellcheck disable=SC2034 # result_query appears unused. Verify it or export it.
result_query=$(python3 tutorials/assets/example-04-openai.py --openai_api_base http://"$1":"$2"/ | tee output-04-multiple-models/query-04-multiple-models.json)
result_query=$(python3 tutorials/assets/example-04-openai.py --openai_api_base http://"$1":"$2"/v1/ | tee output-04-multiple-models/query-04-multiple-models.json)
28 changes: 28 additions & 0 deletions .github/values-01-2pods-minimal-example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
servingEngineSpec:
  # Recreate (not RollingUpdate) so both replicas never contend for the
  # fractional GPU during an upgrade on the single-GPU CI runner.
  strategy:
    type: Recreate
  # Empty string: do not request the "nvidia" runtime class on the CI host.
  runtimeClassName: ""
  modelSpec:
    - name: "opt125m"
      repository: "vllm/vllm-openai"
      tag: "latest"
      modelURL: "facebook/opt-125m"

      # Two pods sharing one GPU (0.5 each) to exercise the router's
      # multi-backend load balancing.
      replicaCount: 2

      requestCPU: 6
      requestMemory: "16Gi"
      requestGPU: 0.5

      pvcStorage: "10Gi"
      # ReadWriteMany: both replicas mount the same model-cache volume.
      pvcAccessMode:
        - ReadWriteMany

      vllmConfig:
        maxModelLen: 1024
        extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.4"]

routerSpec:
  # Router image built in-workflow and loaded into minikube's local registry.
  repository: "localhost:5000/git-act-router"
  # IfNotPresent: use the image pre-loaded via `minikube image load`,
  # never pull from a remote registry.
  imagePullPolicy: "IfNotPresent"
  enableRouter: true
22 changes: 22 additions & 0 deletions .github/values-01-minimal-example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
servingEngineSpec:
  # Empty string: do not request the "nvidia" runtime class on the CI host.
  runtimeClassName: ""
  modelSpec:
    - name: "opt125m"
      repository: "vllm/vllm-openai"
      tag: "latest"
      modelURL: "facebook/opt-125m"

      # Single replica: the minimal smoke-test deployment.
      replicaCount: 1

      requestCPU: 6
      requestMemory: "16Gi"
      requestGPU: 1

      pvcStorage: "10Gi"
      # ReadWriteOnce is sufficient with a single replica.
      pvcAccessMode:
        - ReadWriteOnce

routerSpec:
  # Router image built in-workflow and loaded into minikube's local registry.
  repository: "localhost:5000/git-act-router"
  # IfNotPresent: use the image pre-loaded via `minikube image load`,
  # never pull from a remote registry.
  imagePullPolicy: "IfNotPresent"
  enableRouter: true
5 changes: 5 additions & 0 deletions .github/values-04-multiple-models.yaml
Original file line number	Diff line number	Diff line change
Expand Up @@ -24,3 +24,8 @@ servingEngineSpec:
pvcStorage: "10Gi"
pvcAccessMode:
- ReadWriteOnce

routerSpec:
repository: "localhost:5000/git-act-router"
imagePullPolicy: "IfNotPresent"
enableRouter: true
24 changes: 11 additions & 13 deletions .github/workflows/functionality-helm-chart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ jobs:
- name: Deploy via helm charts
run: |
cd ${{ github.workspace }}
sudo helm install vllm ./helm -f tutorials/assets/values-01-minimal-example.yaml
sudo docker build -t localhost:5000/git-act-router -f docker/Dockerfile .
sudo docker push localhost:5000/git-act-router
sudo sysctl fs.protected_regular=0
sudo minikube image load localhost:5000/git-act-router
sudo helm install vllm ./helm -f .github/values-01-minimal-example.yaml
- name: Validate the installation and send query to the stack
run: |
sudo bash .github/port-forward.sh curl-01-minimal-example
Expand All @@ -43,24 +47,18 @@ jobs:
output-01-minimal-example/
- name: Helm uninstall
run: |
sudo helm uninstall vllm
sudo helm uninstall vllm
if: always()
- run: echo "🍏 This job's status is ${{ job.status }}."

Two-Pods-Minimal-Example:
runs-on: self-hosted
needs: Minimal-Example
steps:
- run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
- run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!"
- run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
- name: Check out repository code
uses: actions/checkout@v4
- run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
- run: echo "🖥️ The workflow is now ready to test your code on the runner."
- name: Deploy via helm charts
run: |
cd ${{ github.workspace }}
sudo helm install vllm ./helm -f tutorials/assets/values-01-2pods-minimal-example.yaml
sudo helm install vllm ./helm -f .github/values-01-2pods-minimal-example.yaml
- name: Validate the installation and send query to the stack
run: |
sudo bash .github/port-forward.sh curl-02-two-pods
Expand All @@ -79,11 +77,11 @@ jobs:

Multiple-Models:
runs-on: self-hosted
needs: Minimal-Example
needs: Two-Pods-Minimal-Example
steps:
- name: Deploy via helm charts
run: |
sudo helm install vllm ./helm -f .github/multiple-models.yaml
sudo helm install vllm ./helm -f .github/values-04-multiple-models.yaml
- name: Validate the installation and send query to the stack
run: |
sudo bash .github/port-forward.sh curl-04-multiple-models
Expand All @@ -96,6 +94,6 @@ jobs:
output-04-multiple-models/
- name: Helm uninstall
run: |
sudo helm uninstall vllm
sudo helm uninstall vllm
if: always()
- run: echo "🍏 This job's status is ${{ job.status }}."
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ repos:
- id: check-json
- id: check-toml
- id: check-yaml
exclude: ^helm/templates/
exclude: |
(?x)(
^helm/templates/|
.github/deployment-router.yaml
)
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: trailing-whitespace
Expand Down
2 changes: 1 addition & 1 deletion helm/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.7
version: 0.0.8

maintainers:
- name: apostac
3 changes: 2 additions & 1 deletion helm/templates/deployment-router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ spec:
serviceAccountName: {{ .Release.Name }}-router-service-account
containers:
- name: router-container
image: lmcache/lmstack-router:latest
image: "{{ .Values.routerSpec.repository | default "lmcache/lmstack-router" }}:{{ .Values.routerSpec.tag | default "latest" }}"
imagePullPolicy: "{{ .Values.routerSpec.imagePullPolicy | default "Always" }}"
args:
- "--host"
- "0.0.0.0"
Expand Down
5 changes: 5 additions & 0 deletions helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@ servingEngineSpec:
runtimeClassName: "nvidia"

routerSpec:
# -- The docker image of the router. The following values are defaults:
repository: "lmcache/lmstack-router"
tag: "latest"
imagePullPolicy: "Always"

# -- Whether to enable the router service
enableRouter: true

Expand Down
2 changes: 1 addition & 1 deletion tutorials/assets/example-04-openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
parser.add_argument(
"--openai_api_base",
type=str,
default="http://localhost:30080/",
default="http://localhost:30080/v1/",
help="The base URL for the OpenAI API",
)
parser.add_argument(
Expand Down

0 comments on commit a01852b

Please sign in to comment.