vllm-project · Yikun · Mar 6, 2025
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
@@ -47,9 +47,30 @@ defaults:
     shell: bash -el {0}
 
 jobs:
+  dispatch:
+    name: vLLM Ascend test (dispatch)
+    runs-on: ascend-03-arm64  # actionlint-ignore: runner-label
+    outputs:
+      number: ${{ steps.dispatch-device.outputs.number }}
+    steps:
+      - name: vLLM Ascend test (dispatch)
+        id: dispatch-device
+        run: |
+          # Acquire the dispatch lock; the test job removes it once its container has started
+          lockfile /tmp/dispatch.lock
+          # Print npu info
+          npu-list /dev/null 2>&1
+          # Select first available device (exclude davinci1 and davinci0)
+          NUMBER=$(npu-list /dev/null 2>&1 | grep None | grep -v davinci1 | grep -v davinci0 |head -1 | cut -b 15)
+          # No usable device: release the lock here, since the test container would never start to do it
+          if ! [[ "$NUMBER" =~ ^[0-9]+$ ]]; then rm -f /tmp/dispatch.lock; echo "Failed to pick a free NPU device (got '$NUMBER')" >&2; exit 1; fi
+          echo "Dispatch to /dev/davinci$NUMBER"
+          echo "number=$NUMBER" >> "$GITHUB_OUTPUT"
+
   test:
+    needs: [dispatch]
     name: vLLM Ascend test (self-host)
-    runs-on: ascend-arm64  # actionlint-ignore: runner-label
+    runs-on: ascend-03-arm64  # actionlint-ignore: runner-label
 
     container:
       image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
@@ -58,9 +79,11 @@ jobs:
         - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
         - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
         # Use self-host cache speed up pip and model download
-        - /home/action/actions-runner/_work/cache:/github/home/.cache/
+        - /home/action/cache:/github/home/.cache/
+        # Share host /tmp so this container can remove the dispatch lock (/tmp/dispatch.lock)
+        - /tmp/:/tmp/
       options: >-
-        --device /dev/davinci6
+        --device /dev/davinci${{ needs.dispatch.outputs.number }}
         --device /dev/davinci_manager
         --device /dev/devmm_svm
         --device /dev/hisi_hdc
@@ -71,6 +94,8 @@ jobs:
         run: |
+          # Unlock first: under `bash -e` a failing command below would leave the dispatch lock held
+          rm -f /tmp/dispatch.lock
           npu-smi info
           cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
 
       - name: Config mirrors
         run: |

diff --git a/tests/test_offline_inference.py b/tests/test_offline_inference.py
@@ -32,6 +32,7 @@
     "Qwen/Qwen2.5-0.5B-Instruct",
 ]
 os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")