-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #146 from EaminC/ps-gke
[Tutorial] Deployment on Google GKE
- Loading branch information
Showing
5 changed files
with
458 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Setting up GKE vLLM stack with one command | ||
|
||
This script automatically configures a GKE LLM inference cluster. | ||
Make sure the gcloud CLI is installed, logged in, and has a project and region configured. You also need kubectl and helm installed. | ||
|
||
Modify the fields in production_stack_specification.yaml as needed, then run: | ||
|
||
```bash | ||
sudo bash entry_point.sh YAML_FILE_PATH | ||
``` | ||
|
||
Pods for the vllm deployment should transition to Ready and the Running state. | ||
|
||
Expected output: | ||
|
||
```plaintext | ||
NAME READY STATUS RESTARTS AGE | ||
vllm-deployment-router-69b7f9748d-xrkvn 1/1 Running 0 75s | ||
vllm-opt125m-deployment-vllm-696c998c6f-mvhg4 1/1 Running 0 75s | ||
``` | ||
|
||
Clean up the service with: | ||
|
||
```bash | ||
bash clean_up.sh production-stack | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/bash
#
# Tear down a GKE cluster together with the Kubernetes resources running in
# it and any unattached persistent disks whose name matches the cluster name.
#
# Usage: bash clean_up.sh <CLUSTER_NAME>
#
# Requires gcloud (authenticated, with a project configured) and kubectl.
#
# The in-cluster cleanup is deliberately best-effort (no `set -e`): a failure
# to delete one resource must not prevent the cluster itself from being
# removed. The script does stop, however, if the cluster still exists after
# the delete request, so that it never reports a successful cleanup falsely.

# Ensure the cluster name is provided
if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  echo "Usage: $0 <CLUSTER_NAME>" >&2
  exit 1
fi

# Set variables
CLUSTER_NAME=$1

# Automatically get the location of the GKE cluster. The `location` field is
# a zone for zonal clusters and a region for regional ones, which is why the
# commands below pass it with --location rather than --zone.
ZONE=$(gcloud container clusters list --filter="name=$CLUSTER_NAME" --format="value(location)")

if [ -z "$ZONE" ]; then
  echo "Cluster $CLUSTER_NAME not found."
  exit 1
fi

# More than one line means several clusters share this name in different
# locations; refuse to guess which one should be destroyed.
if [[ "$ZONE" == *$'\n'* ]]; then
  echo "Multiple clusters named $CLUSTER_NAME found; delete them individually." >&2
  exit 1
fi

echo "Starting cleanup for GKE cluster: $CLUSTER_NAME in zone: $ZONE"

# Check if the cluster is still active
CLUSTER_STATUS=$(gcloud container clusters describe "$CLUSTER_NAME" --location "$ZONE" --format="value(status)")

if [[ "$CLUSTER_STATUS" == "RUNNING" ]]; then
  # Point kubectl at the cluster being removed. Without this, the kubectl
  # commands below would act on whatever context happens to be current,
  # which may be an unrelated cluster.
  if gcloud container clusters get-credentials "$CLUSTER_NAME" --location "$ZONE"; then
    # Delete Load Balancers first, while their Service objects still exist.
    # A Service must be deleted by name inside its own namespace: kubectl
    # rejects a resource name combined with --all-namespaces.
    echo "Deleting Load Balancers..."
    kubectl get services --all-namespaces \
      -o jsonpath='{range .items[?(@.spec.type=="LoadBalancer")]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' |
      while read -r LB_NAMESPACE LB_NAME; do
        [ -n "$LB_NAME" ] || continue
        # </dev/null keeps kubectl from consuming the loop's input.
        kubectl delete service "$LB_NAME" --namespace "$LB_NAMESPACE" </dev/null
      done

    # Delete all namespaces except the built-in ones. The pattern is anchored
    # on both ends so that e.g. "default-foo" is not mistaken for "default".
    echo "Deleting all custom namespaces..."
    kubectl get ns --no-headers | awk '{print $1}' |
      grep -vE '^(default|kube-system|kube-public|kube-node-lease)$' |
      xargs -r kubectl delete ns

    # Delete all workloads
    echo "Deleting all workloads..."
    kubectl delete deployments,statefulsets,daemonsets,services,ingresses,configmaps,secrets,persistentvolumeclaims,jobs,cronjobs --all --all-namespaces
    kubectl delete persistentvolumes --all
  else
    echo "Could not fetch credentials for $CLUSTER_NAME; skipping in-cluster cleanup." >&2
  fi

  # Delete GKE node pools
  echo "Checking for node pools..."
  NODE_POOLS=$(gcloud container node-pools list --cluster "$CLUSTER_NAME" --location "$ZONE" --format="value(name)")
  if [ -n "$NODE_POOLS" ]; then
    while read -r NODE_POOL; do
      [ -n "$NODE_POOL" ] || continue
      echo "Deleting node pool: $NODE_POOL"
      gcloud container node-pools delete "$NODE_POOL" --cluster "$CLUSTER_NAME" --location "$ZONE" --quiet </dev/null
    done <<<"$NODE_POOLS"
  else
    echo "No node pools found."
  fi
else
  echo "Cluster $CLUSTER_NAME is not running or has already been deleted."
fi

# Delete GKE cluster
echo "Deleting GKE cluster..."
gcloud container clusters delete "$CLUSTER_NAME" --location "$ZONE" --quiet

# Wait for the cluster deletion to complete. The status is checked before
# sleeping so that an already-finished deletion does not cost an extra wait.
echo "Waiting for cluster $CLUSTER_NAME to be deleted..."
while true; do
  # describe fails (empty output) once the cluster no longer exists.
  CLUSTER_STATUS=$(gcloud container clusters describe "$CLUSTER_NAME" --location "$ZONE" --format="value(status)" 2>/dev/null)
  [[ "$CLUSTER_STATUS" == "DELETING" ]] || break
  sleep 10
done

# A non-empty status here means the cluster is still present in some other
# state (for example the delete request failed); do not claim success.
if [ -n "$CLUSTER_STATUS" ]; then
  echo "Cluster $CLUSTER_NAME still exists (status: $CLUSTER_STATUS); aborting cleanup." >&2
  exit 1
fi

echo "Cluster $CLUSTER_NAME deleted."

# Delete persistent disks. Only unattached disks (no `users`) are considered,
# and each one is deleted in its own zone or region so gcloud never has to
# guess or prompt for the location.
# NOTE(review): the name filter is a substring/regex match on the cluster
# name, so unrelated disks with a similar name would match too - confirm this
# is acceptable for the project the script is run against.
echo "Deleting persistent disks..."
gcloud compute disks list \
  --filter="name~'$CLUSTER_NAME' AND status='READY' AND NOT users:*" \
  --format="csv[no-heading](name,zone.basename(),region.basename())" |
  while IFS=, read -r DISK_NAME DISK_ZONE DISK_REGION; do
    [ -n "$DISK_NAME" ] || continue
    if [ -n "$DISK_ZONE" ]; then
      gcloud compute disks delete "$DISK_NAME" --zone "$DISK_ZONE" --quiet </dev/null
    elif [ -n "$DISK_REGION" ]; then
      gcloud compute disks delete "$DISK_NAME" --region "$DISK_REGION" --quiet </dev/null
    else
      echo "Skipping disk $DISK_NAME: could not determine its zone or region." >&2
    fi
  done

echo "GKE cluster $CLUSTER_NAME cleanup completed successfully!"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#!/bin/bash
#
# Create a GKE cluster and deploy the vLLM production stack on it with Helm.
#
# Usage: bash entry_point.sh <SETUP_YAML>
#   SETUP_YAML  Helm values file passed to the vllm-stack chart.
#
# Requires gcloud (authenticated, with a project configured), kubectl and helm.

CLUSTER_NAME="production-stack"
ZONE="us-central1-a"
# Region containing ZONE: the zone name without its trailing "-<letter>".
# The subnetwork path below is regional, so it must follow ZONE.
REGION="${ZONE%-*}"

# Ensure a parameter is provided
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <SETUP_YAML>"
  exit 1
fi

SETUP_YAML=$1

# Fail before creating any cloud resources if the values file is missing.
if [ ! -f "$SETUP_YAML" ]; then
  echo "Error: values file '$SETUP_YAML' not found." >&2
  exit 1
fi

# Get the current GCP project ID
GCP_PROJECT=$(gcloud config get-value project)

# Ensure the project ID is retrieved correctly
if [ -z "$GCP_PROJECT" ]; then
  echo "Error: No GCP project ID found. Please set your project with 'gcloud config set project <PROJECT_ID>'."
  exit 1
fi

# OAuth scopes granted to the nodes. They are joined with commas into one
# word here so the --scopes value cannot be split by line-continuation
# whitespace.
NODE_SCOPES=(
  "https://www.googleapis.com/auth/devstorage.read_only"
  "https://www.googleapis.com/auth/logging.write"
  "https://www.googleapis.com/auth/monitoring"
  "https://www.googleapis.com/auth/servicecontrol"
  "https://www.googleapis.com/auth/service.management.readonly"
  "https://www.googleapis.com/auth/trace.append"
)
SCOPES=$(IFS=,; echo "${NODE_SCOPES[*]}")

# Create the GKE cluster. Stop if creation fails: installing the chart would
# otherwise target whatever cluster kubectl currently points at.
if ! gcloud beta container --project "$GCP_PROJECT" clusters create "$CLUSTER_NAME" \
  --zone "$ZONE" \
  --tier "standard" \
  --no-enable-basic-auth \
  --cluster-version "1.31.5-gke.1023000" \
  --release-channel "regular" \
  --machine-type "n2d-standard-8" \
  --image-type "COS_CONTAINERD" \
  --disk-type "pd-balanced" \
  --disk-size "100" \
  --metadata disable-legacy-endpoints=true \
  --scopes "$SCOPES" \
  --max-pods-per-node "110" \
  --num-nodes "1" \
  --logging=SYSTEM,WORKLOAD \
  --monitoring=SYSTEM,STORAGE,POD,DEPLOYMENT,STATEFULSET,DAEMONSET,HPA,CADVISOR,KUBELET \
  --enable-ip-alias \
  --network "projects/$GCP_PROJECT/global/networks/default" \
  --subnetwork "projects/$GCP_PROJECT/regions/$REGION/subnetworks/default" \
  --no-enable-intra-node-visibility \
  --default-max-pods-per-node "110" \
  --enable-ip-access \
  --security-posture=standard \
  --workload-vulnerability-scanning=disabled \
  --no-enable-master-authorized-networks \
  --no-enable-google-cloud-access \
  --addons HorizontalPodAutoscaling,HttpLoadBalancing,GcePersistentDiskCsiDriver \
  --enable-autoupgrade \
  --enable-autorepair \
  --max-surge-upgrade 1 \
  --max-unavailable-upgrade 0 \
  --binauthz-evaluation-mode=DISABLED \
  --enable-managed-prometheus \
  --enable-shielded-nodes \
  --node-locations "$ZONE"; then
  echo "Error: failed to create GKE cluster $CLUSTER_NAME." >&2
  exit 1
fi

# Deploy the application using Helm. Helm runs as the invoking user (no
# sudo) so that it reads the same kubeconfig that cluster creation just
# populated for that user.
if ! helm repo add vllm https://vllm-project.github.io/production-stack; then
  echo "Error: failed to add the vllm Helm repository." >&2
  exit 1
fi
helm install vllm vllm/vllm-stack -f "$SETUP_YAML"
27 changes: 27 additions & 0 deletions
27
deployment_on_cloud/gke/production_stack_specification.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
servingEngineSpec: | ||
runtimeClassName: "" | ||
modelSpec: | ||
- name: "opt125m" | ||
repository: "eaminchan/opt-125m-cpu" | ||
tag: "latest" | ||
modelURL: "facebook/opt-125m" | ||
|
||
replicaCount: 1 | ||
|
||
requestCPU: 1.5 | ||
requestMemory: "6Gi" | ||
requestGPU: 0 | ||
|
||
pvcStorage: "10Gi" | ||
pvcAccessMode: | ||
- ReadWriteOnce | ||
device: "cpu" | ||
|
||
routerSpec: | ||
resources: | ||
requests: | ||
cpu: "1" | ||
memory: "6G" | ||
limits: | ||
cpu: "1" | ||
memory: "6G" |
Oops, something went wrong.