diff --git a/helm/Chart.yaml b/helm/Chart.yaml
index 677a5ef7..e3bfce62 100644
--- a/helm/Chart.yaml
+++ b/helm/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.0.3
+version: 0.0.4
 
 maintainers:
   - name: apostac
diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml
index 212d1ad8..048c9718 100644
--- a/helm/templates/deployment-vllm-multi.yaml
+++ b/helm/templates/deployment-vllm-multi.yaml
@@ -47,6 +47,10 @@ spec:
           - "--dtype"
           - {{ .dtype | quote }}
           {{- end }}
+          {{- if hasKey . "tensorParallelSize" }}
+          - "--tensor-parallel-size"
+          - {{ .tensorParallelSize | quote }}
+          {{- end }}
           {{- if .extraArgs }}
           {{- range .extraArgs }}
           - {{ . | quote }}
@@ -108,12 +112,26 @@ spec:
           volumeMounts:
           - name: {{ .Release.Name }}-storage
             mountPath: /data
-
+          {{- /* Mount the memory-backed "shm" volume at /dev/shm only when tensorParallelSize is configured. */}}
+          {{- with $modelSpec.vllmConfig }}
+          {{- if hasKey . "tensorParallelSize" }}
+          - name: shm
+            mountPath: /dev/shm
+          {{- end }}
+          {{- end }}
       volumes:
         - name: {{ .Release.Name }}-storage
           persistentVolumeClaim:
             claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim"
-
+      {{- /* Memory-backed emptyDir for /dev/shm; size comes from modelSpec.shmSize and falls back to "20Gi". */}}
+      {{- with $modelSpec.vllmConfig }}
+      {{- if hasKey . "tensorParallelSize" }}
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: {{ default "20Gi" $modelSpec.shmSize | quote }}
+      {{- end }}
+      {{- end }}
       {{- if .Values.servingEngineSpec.tolerations }}
       {{- with .Values.servingEngineSpec.tolerations }}
       tolerations:
diff --git a/helm/values.yaml b/helm/values.yaml
index daa80181..f1f64e4b 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -29,6 +29,7 @@ servingEngineSpec:
   #   - enableChunkedPrefill: (optional, bool) Enable chunked prefill, e.g., false
   #   - maxModelLen: (optional, int) The maximum model length, e.g., 16384
   #   - dtype: (optional, string) The data type, e.g., "bfloat16"
+  #   - tensorParallelSize: (optional, int) The degree of tensor parallelism, e.g., 2
   #   - extraArgs: (optional, list) Extra command line arguments to pass to vLLM, e.g., ["--disable-log-requests"]
   #
   # - lmcacheConfig: (optional, map) The configuration of the LMCache for KV offloading, supported options are:
@@ -41,6 +42,8 @@ servingEngineSpec:
   #
   # - nodeSelectorTerms: (optional, list) The node selector terms to match the nodes
   #
+  # - shmSize: (optional, string) The size of the shared memory mounted at /dev/shm, e.g., "20Gi" (the default); only used when vllmConfig.tensorParallelSize is set
+  #
   # Example:
   # modelSpec:
   # - name: "mistral"