Skip to content

Commit

Permalink
Enable multi-GPU inference in vLLM with tensor parallelism (#105)
Browse files Browse the repository at this point in the history
* Add support for tensor parallelism (TP > 1) and make the shared-memory (shm) size a configurable parameter

Signed-off-by: YuhanLiu11 <[email protected]>
  • Loading branch information
YuhanLiu11 authored Feb 11, 2025
1 parent d109e77 commit 8d4b05a
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 3 deletions.
2 changes: 1 addition & 1 deletion helm/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.3
version: 0.0.4

maintainers:
- name: apostac
20 changes: 18 additions & 2 deletions helm/templates/deployment-vllm-multi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ spec:
- "--dtype"
- {{ .dtype | quote }}
{{- end }}
{{- if hasKey . "tensorParallelSize" }}
- "--tensor-parallel-size"
- {{ .tensorParallelSize | quote }}
{{- end }}
{{- if .extraArgs }}
{{- range .extraArgs }}
- {{ . | quote }}
Expand Down Expand Up @@ -108,12 +112,24 @@ spec:
volumeMounts:
- name: {{ .Release.Name }}-storage
mountPath: /data

{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
mountPath: /dev/shm
{{- end}}
{{- end}}
volumes:
- name: {{ .Release.Name }}-storage
persistentVolumeClaim:
claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim"

{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
emptyDir:
medium: Memory
sizeLimit: {{ default "20Gi" $modelSpec.shmSize }}
{{- end}}
{{- end}}
{{- if .Values.servingEngineSpec.tolerations }}
{{- with .Values.servingEngineSpec.tolerations }}
tolerations:
Expand Down
3 changes: 3 additions & 0 deletions helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ servingEngineSpec:
# - enableChunkedPrefill: (optional, bool) Enable chunked prefill, e.g., false
# - maxModelLen: (optional, int) The maximum model length, e.g., 16384
# - dtype: (optional, string) The data type, e.g., "bfloat16"
# - tensorParallelSize: (optional, int) The degree of tensor parallelism, e.g., 2
# - extraArgs: (optional, list) Extra command line arguments to pass to vLLM, e.g., ["--disable-log-requests"]
#
# - lmcacheConfig: (optional, map) The configuration of the LMCache for KV offloading, supported options are:
Expand All @@ -41,6 +42,8 @@ servingEngineSpec:
#
# - nodeSelectorTerms: (optional, list) The node selector terms to match the nodes
#
# - shmSize: (optional, string) The size of the shared-memory (/dev/shm) volume mounted when tensorParallelSize is set; defaults to "20Gi", e.g., "20Gi"
#
# Example:
# modelSpec:
# - name: "mistral"
Expand Down

0 comments on commit 8d4b05a

Please sign in to comment.