[Tutorial] Added EKS tutorial for production stack (#142)
* [Feat] Added support for modifying PVC through values.yaml

Signed-off-by: hanchenli <[email protected]>

* fix pre commit

Signed-off-by: hanchenli <[email protected]>

* [WIP] AWS EKS deployment and tutorial

Signed-off-by: hanchenli <[email protected]>

* [FIX] Fixed precommit errors

Signed-off-by: hanchenli <[email protected]>

* Added readme

Signed-off-by: hanchenli <[email protected]>

* changed tutorial

Signed-off-by: hanchenli <[email protected]>

---------

Signed-off-by: hanchenli <[email protected]>
Hanchenli authored Feb 17, 2025
1 parent f125a14 commit 639edc0
Showing 7 changed files with 560 additions and 1 deletion.
16 changes: 16 additions & 0 deletions deployment_on_cloud/aws/Readme.md
@@ -0,0 +1,16 @@
# Setting up EKS vLLM stack with one command

This script automatically configures an EKS LLM inference cluster.
Make sure your AWS CLI is installed, logged in, and configured with a region, and that eksctl, kubectl, and helm are installed.

Modify the fields in `production_stack_specification.yaml` as needed, then execute:

```bash
bash entry_point.sh YOUR_AWSREGION YAML_FILE_PATH
```

Clean up the service (not the VPC) with:

```bash
bash clean_up.sh production-stack YOUR_AWSREGION
```
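
A quick sanity check of the prerequisites before running the entry point (a minimal sketch; the exact version output will vary):

```bash
# Confirm AWS credentials and a default region are configured
aws sts get-caller-identity
aws configure get region

# Confirm the required tools are installed
eksctl version
kubectl version --client
helm version
```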
144 changes: 144 additions & 0 deletions deployment_on_cloud/aws/clean_up.sh
@@ -0,0 +1,144 @@
#!/bin/bash

# Set variables
CLUSTER_NAME=$1
REGION=$2

echo "Starting cleanup for EKS cluster: $CLUSTER_NAME in region: $REGION"

# Delete all namespaces except default, kube-system, kube-public
echo "Deleting all custom namespaces..."
kubectl get ns --no-headers | awk '{print $1}' | grep -vE '^(default|kube-system|kube-public)$' | xargs -r kubectl delete ns

# Delete all workloads
echo "Deleting all workloads..."
kubectl delete deployments,statefulsets,daemonsets,services,ingresses,configmaps,secrets,persistentvolumeclaims,jobs,cronjobs --all --all-namespaces
kubectl delete persistentvolumes --all

# Delete managed node groups (if any exist)
echo "Checking for managed node groups..."
NODEGROUPS=$(aws eks list-nodegroups --cluster-name "$CLUSTER_NAME" --region "$REGION" --query "nodegroups[]" --output text)
if [ -n "$NODEGROUPS" ]; then
for NODEGROUP in $NODEGROUPS; do
echo "Deleting node group: $NODEGROUP"
aws eks delete-nodegroup --cluster-name "$CLUSTER_NAME" --nodegroup-name "$NODEGROUP" --region "$REGION"
echo "Waiting for node group $NODEGROUP to be deleted..."
aws eks wait nodegroup-deleted --cluster-name "$CLUSTER_NAME" --nodegroup-name "$NODEGROUP" --region "$REGION"
done
else
echo "No managed node groups found."
fi

# Delete self-managed EC2 instances
echo "Deleting self-managed nodes..."
INSTANCE_IDS=$(aws ec2 describe-instances --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" --query "Reservations[].Instances[].InstanceId" --output text --region "$REGION")
if [ -n "$INSTANCE_IDS" ]; then
# shellcheck disable=SC2086  # word splitting is intentional: $INSTANCE_IDS may hold several IDs
aws ec2 terminate-instances --instance-ids $INSTANCE_IDS --region "$REGION"
echo "Waiting for EC2 instances to terminate..."
aws ec2 wait instance-terminated --instance-ids $INSTANCE_IDS --region "$REGION"
fi

# Delete associated load balancers
echo "Deleting load balancers..."
LB_ARNs=$(aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, '$CLUSTER_NAME')].LoadBalancerArn" --output text --region "$REGION")
for LB_ARN in $LB_ARNs; do
aws elbv2 delete-load-balancer --load-balancer-arn "$LB_ARN" --region "$REGION"
echo "Waiting for load balancer $LB_ARN to be deleted..."
aws elbv2 wait load-balancers-deleted --load-balancer-arns "$LB_ARN" --region "$REGION"
done

# Delete target groups
echo "Deleting target groups..."
TG_ARNs=$(aws elbv2 describe-target-groups --query "TargetGroups[?contains(TargetGroupName, '$CLUSTER_NAME')].TargetGroupArn" --output text --region "$REGION")
for TG_ARN in $TG_ARNs; do
aws elbv2 delete-target-group --target-group-arn "$TG_ARN" --region "$REGION"
done

# Delete NAT Gateways
echo "Deleting NAT Gateways..."
NAT_GATEWAYS=$(aws ec2 describe-nat-gateways --filter "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" --query "NatGateways[].NatGatewayId" --output text --region "$REGION")
for NAT_ID in $NAT_GATEWAYS; do
aws ec2 delete-nat-gateway --nat-gateway-id "$NAT_ID" --region "$REGION"
echo "Waiting for NAT Gateway $NAT_ID to be deleted..."
aws ec2 wait nat-gateway-deleted --nat-gateway-ids "$NAT_ID" --region "$REGION"
done

# Release Elastic IPs (note: this releases every unassociated EIP in the region, not only those created for this cluster)
echo "Releasing Elastic IPs..."
EIP_ALLOCS=$(aws ec2 describe-addresses --query "Addresses[?AssociationId==null].AllocationId" --output text --region "$REGION")
for EIP in $EIP_ALLOCS; do
aws ec2 release-address --allocation-id "$EIP" --region "$REGION"
echo "Released Elastic IP $EIP"
done

# Delete the EFS file system(s) recorded by set_up_efs.sh and the security group it created
while read -r fs_id; do
echo "Processing File System: $fs_id"

# Get the list of mount targets
mount_targets=$(aws efs describe-mount-targets --file-system-id "$fs_id" --query "MountTargets[*].MountTargetId" --output text)

# Delete each mount target
for mt_id in $mount_targets; do
echo "Deleting Mount Target: $mt_id"
aws efs delete-mount-target --mount-target-id "$mt_id"
done

# Wait for mount targets to be deleted (the file system cannot be deleted while mount targets exist)
while [[ -n $(aws efs describe-mount-targets --file-system-id "$fs_id" --query "MountTargets[*].MountTargetId" --output text) ]]; do
echo "Waiting for mount targets to be deleted..."
sleep 10
done

# Delete the file system
echo "Deleting File System: $fs_id"
aws efs delete-file-system --file-system-id "$fs_id"

done < temp.text  # file system ID recorded by set_up_efs.sh


for sg in $(aws ec2 describe-security-groups --filters "Name=group-name,Values=efs-sg" --query "SecurityGroups[*].GroupId" --output text); do

echo "Deleting Security Group: $sg"
for eni in $(aws ec2 describe-network-interfaces --filters "Name=group-id,Values=$sg" --query "NetworkInterfaces[*].NetworkInterfaceId" --output text); do
echo "$eni"
aws ec2 delete-network-interface --network-interface-id "$eni"
done
aws ec2 delete-security-group --group-id "$sg"
done

# Delete the EKS cluster
echo "Deleting EKS cluster..."
aws eks delete-cluster --name "$CLUSTER_NAME" --region "$REGION"
echo "Waiting for cluster $CLUSTER_NAME to be deleted..."
aws eks wait cluster-deleted --name "$CLUSTER_NAME" --region "$REGION"

# Delete CloudFormation Stack
echo "Checking if CloudFormation stack exists for EKS cluster..."
STACK_NAME="eksctl-${CLUSTER_NAME}-cluster"
STACK_STATUS=$(aws cloudformation describe-stacks --stack-name "$STACK_NAME" --region "$REGION" --query "Stacks[0].StackStatus" --output text 2>/dev/null)

if [ -n "$STACK_STATUS" ]; then
echo "Deleting CloudFormation stack: $STACK_NAME"
aws cloudformation delete-stack --stack-name "$STACK_NAME" --region "$REGION"
echo "Waiting for CloudFormation stack $STACK_NAME to be deleted..."
aws cloudformation wait stack-delete-complete --stack-name "$STACK_NAME" --region "$REGION"
echo "CloudFormation stack $STACK_NAME has been deleted successfully!"
else
echo "CloudFormation stack $STACK_NAME not found, skipping..."
fi

STACK_NAME="eksctl-${CLUSTER_NAME}-cluster-nodegroup-gpu-nodegroup"
STACK_STATUS=$(aws cloudformation describe-stacks --stack-name "$STACK_NAME" --region "$REGION" --query "Stacks[0].StackStatus" --output text 2>/dev/null)

if [ -n "$STACK_STATUS" ]; then
echo "Deleting CloudFormation stack: $STACK_NAME"
aws cloudformation delete-stack --stack-name "$STACK_NAME" --region "$REGION"
echo "Waiting for CloudFormation stack $STACK_NAME to be deleted..."
aws cloudformation wait stack-delete-complete --stack-name "$STACK_NAME" --region "$REGION"
echo "CloudFormation stack $STACK_NAME has been deleted successfully!"
else
echo "CloudFormation stack $STACK_NAME not found, skipping..."
fi

echo "EKS cluster $CLUSTER_NAME cleanup completed successfully!"
64 changes: 64 additions & 0 deletions deployment_on_cloud/aws/entry_point.sh
@@ -0,0 +1,64 @@
#!/bin/bash

AWS_REGION=$1
SETUP_YAML=$2
CLUSTER_NAME="production-stack"
# This script assumes the latest AWS CLI is installed and logged in.

# Set up EKS cluster
eksctl create cluster \
--name "$CLUSTER_NAME" \
--region "$AWS_REGION" \
--version 1.27 \
--nodegroup-name gpu-nodegroup \
--node-type g6e.4xlarge \
--nodes 2 \
--nodes-min 2 \
--nodes-max 2 \
--managed


# Create EFS (needs to be in the same VPC as the EKS cluster)
bash set_up_efs.sh "$CLUSTER_NAME" "$AWS_REGION"

# Create the EFS CSI driver
eksctl utils associate-iam-oidc-provider --region "$AWS_REGION" --cluster "$CLUSTER_NAME" --approve

kubectl apply -k "github.com/kubernetes-sigs/aws-efs-csi-driver/deploy/kubernetes/overlays/stable/ecr/?ref=release-1.6"
kubectl get pods -n kube-system | grep efs
eksctl create iamserviceaccount \
--region "$AWS_REGION" \
--name efs-csi-controller-sa \
--namespace kube-system \
--cluster "$CLUSTER_NAME" \
--attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \
--approve


# Create the PV, substituting in the EFS file system ID written by set_up_efs.sh
# The storage request should be sized for the model weights
EFS_ID=$(cat temp.text)

cat <<EOF > efs-pv.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-pv
spec:
  capacity:
    storage: 40Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  csi:
    driver: efs.csi.aws.com
    volumeHandle: $EFS_ID
EOF

kubectl apply -f efs-pv.yaml
kubectl get pv

# Now deploy the production stack.
helm repo add vllm https://vllm-project.github.io/production-stack
helm install vllm vllm/vllm-stack -f "$SETUP_YAML"
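
Once the Helm release is installed, a quick way to check that the stack is serving (a sketch; the router service name assumes the release name `vllm` used above and may differ across chart versions):

```bash
# Wait for the router and serving-engine pods to become Ready
kubectl get pods

# Port-forward the router service and query the OpenAI-compatible API
kubectl port-forward svc/vllm-router-service 30080:80 &
curl http://localhost:30080/v1/models
```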
18 changes: 18 additions & 0 deletions deployment_on_cloud/aws/production_stack_specification.yaml
@@ -0,0 +1,18 @@
servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
  - name: "llama8b"
    repository: "vllm/vllm-openai"
    tag: "latest"
    modelURL: "meta-llama/Llama-3.1-8B"

    replicaCount: 2

    requestCPU: 6
    requestMemory: "16Gi"
    requestGPU: 1
    hf_token: HUGGINGFACE_TOKEN  # placeholder: replace with your Hugging Face access token
    pvcStorage: "40Gi"
    pvcAccessMode:
    - ReadWriteMany
    storageClass: "efs_static_storage_class_indicator" # indicator telling the Helm PVC template to render an empty storageClassName (static EFS provisioning)
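
Rather than committing a real token into the YAML, the placeholder can be overridden at install time (a sketch; the exact `--set` path assumes the values layout above):

```bash
# Hypothetical override of the Hugging Face token placeholder at install time
helm install vllm vllm/vllm-stack \
  -f production_stack_specification.yaml \
  --set "servingEngineSpec.modelSpec[0].hf_token=<YOUR_HF_TOKEN>"
```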
73 changes: 73 additions & 0 deletions deployment_on_cloud/aws/set_up_efs.sh
@@ -0,0 +1,73 @@
#!/bin/bash
set -e # Exit on error
set -o pipefail # Exit on first command failure in a pipeline

CLUSTER_NAME=$1
REGION=$2
EFS_NAME="efs_for_eks"

# Fetch VPC ID
VPC_ID=$(aws eks describe-cluster --name "$CLUSTER_NAME" --query "cluster.resourcesVpcConfig.vpcId" --output text)

# Fetch Subnet IDs
read -r -a SUBNET_IDS <<< "$(aws eks describe-cluster --name "$CLUSTER_NAME" --query "cluster.resourcesVpcConfig.subnetIds" --output text)"

# Fetch Security Group used by EKS
INSTANCE_ID=$(aws ec2 describe-instances --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" --query "Reservations[*].Instances[*].InstanceId" --output text | head -n 1)
EKS_SG_ID=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" --query "Reservations[0].Instances[0].SecurityGroups[*].GroupId" --output text)
echo "EKS Security Group ID: $EKS_SG_ID"

# Create Security Group for EFS
EFS_SG_ID=$(aws ec2 create-security-group \
--group-name efs-sg \
--description "Allow NFS from EKS" \
--vpc-id "$VPC_ID" \
--query "GroupId" --output text --region "$REGION")
echo "Created Security Group for EFS: $EFS_SG_ID"

# Allow NFS traffic (port 2049) from EKS nodes
aws ec2 authorize-security-group-ingress \
--group-id "$EFS_SG_ID" \
--protocol tcp \
--port 2049 \
--source-group "$EKS_SG_ID"

echo "Security group updated to allow NFS traffic."

# Create EFS File System
EFS_ID=$(aws efs create-file-system \
--region "$REGION" \
--performance-mode generalPurpose \
--throughput-mode bursting \
--tags Key=Name,Value="$EFS_NAME" \
--query "FileSystemId" --output text)

echo "Created EFS File System: $EFS_ID"

# Wait for EFS to be available
echo "Waiting for EFS to become available..."
while true; do
STATE=$(aws efs describe-file-systems --file-system-id "$EFS_ID" --query "FileSystems[0].LifeCycleState" --output text)
echo "Current EFS state: $STATE"

if [[ "$STATE" == "available" ]]; then
echo "EFS is now available!"
break
fi

echo "Waiting for EFS to become available..."
sleep 10 # Wait 10 seconds before checking again
done

# Create Mount Targets in each subnet
for SUBNET_ID in "${SUBNET_IDS[@]}"; do
echo "Creating mount target in subnet: $SUBNET_ID"
aws efs create-mount-target \
--file-system-id "$EFS_ID" \
--subnet-id "$SUBNET_ID" \
--security-groups "$EFS_SG_ID"
done

echo "EFS setup complete!"
echo "File System ID: $EFS_ID"
echo "$EFS_ID" > temp.text
2 changes: 1 addition & 1 deletion helm/templates/pvc.yaml
@@ -14,7 +14,7 @@ spec:
{{- end }}
resources:
requests:
storage: {{ $modelSpec.pvcStorage | default "40Gi" }} # Default to 40Gi if not set
storage: {{ $modelSpec.pvcStorage | default "20Gi" }} # Default to 20Gi if not set
{{- if $modelSpec.storageClass }}
{{- if eq $modelSpec.storageClass "efs_static_storage_class_indicator" }}
storageClassName: ""
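
The `efs_static_storage_class_indicator` value makes this template render an empty `storageClassName`, which binds the claim to the statically provisioned `efs-pv` rather than a dynamic storage class. A quick local check of the rendered output (a sketch; assumes the `vllm` Helm repo added in entry_point.sh):

```bash
# Render the chart locally (no cluster changes) and inspect the PVC it will create
helm template vllm vllm/vllm-stack -f production_stack_specification.yaml \
  | grep -B2 -A8 "kind: PersistentVolumeClaim"
# Expect storageClassName: "" in the output, which binds the claim to the static efs-pv
```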
