Skip to content

Commit

Permalink
Enable multi-GPU inference in vLLM with tensor parallelism (#105)
Browse files Browse the repository at this point in the history
* Add support for tensor parallelism (TP > 1) and make the shared-memory (shm) size a configurable parameter

Signed-off-by: YuhanLiu11 <[email protected]>
  • Loading branch information
YuhanLiu11 authored Feb 11, 2025
1 parent d109e77 commit 8d4b05a
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 3 deletions.
2 changes: 1 addition & 1 deletion helm/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.3
version: 0.0.4

maintainers:
- name: apostac
20 changes: 18 additions & 2 deletions helm/templates/deployment-vllm-multi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ spec:
- "--dtype"
- {{ .dtype | quote }}
{{- end }}
{{- if hasKey . "tensorParallelSize" }}
- "--tensor-parallel-size"
- {{ .tensorParallelSize | quote }}
{{- end }}
{{- if .extraArgs }}
{{- range .extraArgs }}
- {{ . | quote }}
Expand Down Expand Up @@ -108,12 +112,24 @@ spec:
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data

{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
mountPath: /dev/shm
{{- end}}
{{- end}}
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim"

{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
emptyDir:
medium: Memory
sizeLimit: {{ default "20Gi" $modelSpec.shmSize }}
{{- end}}
{{- end}}
{{- if .Values.servingEngineSpec.tolerations }}
{{- with .Values.servingEngineSpec.tolerations }}
tolerations:
Expand Down
3 changes: 3 additions & 0 deletions helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ servingEngineSpec:
# - enableChunkedPrefill: (optional, bool) Enable chunked prefill, e.g., false
# - maxModelLen: (optional, int) The maximum model length, e.g., 16384
# - dtype: (optional, string) The data type, e.g., "bfloat16"
# - tensorParallelSize: (optional, int) The degree of tensor parallelism, e.g., 2
# - extraArgs: (optional, list) Extra command line arguments to pass to vLLM, e.g., ["--disable-log-requests"]
#
# - lmcacheConfig: (optional, map) The configuration of the LMCache for KV offloading, supported options are:
Expand All @@ -41,6 +42,8 @@ servingEngineSpec:
#
# - nodeSelectorTerms: (optional, list) The node selector terms to match the nodes
#
# - shmSize: (optional, string) The size of the shared-memory (/dev/shm) volume mounted when tensorParallelSize is set; defaults to "20Gi", e.g., "20Gi"
#
# Example:
# modelSpec:
# - name: "mistral"
Expand Down

0 comments on commit 8d4b05a

Please sign in to comment.