-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Tutorial] Added EKS tutorial for production stack (#142)
* [Feat] Added support for modifying PVC through values.yaml Signed-off-by: hanchenli <[email protected]> * fix pre commit Signed-off-by: hanchenli <[email protected]> * [WIP] AWS EKS deployment and tutorial Signed-off-by: hanchenli <[email protected]> * [FIX] Fixed precommit errors Signed-off-by: hanchenli <[email protected]> * Added readme Signed-off-by: hanchenli <[email protected]> * changed tutorial Signed-off-by: hanchenli <[email protected]> --------- Signed-off-by: hanchenli <[email protected]>
- Loading branch information
Showing
7 changed files
with
560 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Setting up EKS vLLM stack with one command | ||
|
||
This script automatically configures an EKS LLM inference cluster. | ||
Before running it, make sure the AWS CLI is installed, logged in, and configured with a region, and that eksctl, kubectl, and helm are installed. | ||
|
||
Modify the fields in production_stack_specification.yaml and execute as: | ||
|
||
```bash | ||
bash entry_point.sh YOUR_AWSREGION YAML_FILE_PATH | ||
``` | ||
|
||
Clean up the service (not the VPC) with: | ||
|
||
```bash | ||
bash clean_up.sh production-stack YOUR_AWSREGION | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
#!/bin/bash
#
# Best-effort teardown of an EKS cluster and the AWS resources around it:
# Kubernetes workloads, managed node groups, tagged EC2 instances, load
# balancers, target groups, NAT gateways, Elastic IPs, the EFS file system
# recorded by set_up_efs.sh, the "efs-sg" security group, the cluster itself
# and its eksctl CloudFormation stacks.
#
# Usage: bash clean_up.sh CLUSTER_NAME AWS_REGION
#
# `set -e` is intentionally NOT enabled: every step is attempted even when an
# earlier one fails, so a partially created stack can still be cleaned up.
# The kubectl commands act on whatever cluster the current kubeconfig context
# points at.

USAGE="usage: bash clean_up.sh CLUSTER_NAME AWS_REGION"

# Both arguments are mandatory. An empty cluster name would turn the
# contains(..., '') filters below into "match everything" and delete unrelated
# load balancers and target groups in the region.
CLUSTER_NAME=${1:?$USAGE}
REGION=${2:?$USAGE}

# File in which set_up_efs.sh stores the ID of the EFS file system it created.
EFS_ID_FILE="temp.text"

# Delete a CloudFormation stack and wait for completion, or report that it
# does not exist.
# Arguments: $1 - stack name
delete_stack_if_exists() {
  local stack_name=$1
  local stack_status

  # describe-stacks fails for an unknown stack; the error text is suppressed
  # on purpose and an empty status is treated as "not found".
  stack_status=$(aws cloudformation describe-stacks --stack-name "$stack_name" --region "$REGION" --query "Stacks[0].StackStatus" --output text 2>/dev/null)

  if [ -n "$stack_status" ]; then
    echo "Deleting CloudFormation stack: $stack_name"
    aws cloudformation delete-stack --stack-name "$stack_name" --region "$REGION"
    echo "Waiting for CloudFormation stack $stack_name to be deleted..."
    aws cloudformation wait stack-delete-complete --stack-name "$stack_name" --region "$REGION"
    echo "CloudFormation stack $stack_name has been deleted successfully!"
  else
    echo "CloudFormation stack $stack_name not found, skipping..."
  fi
}

echo "Starting cleanup for EKS cluster: $CLUSTER_NAME in region: $REGION"

# Delete all namespaces except default, kube-system, kube-public
echo "Deleting all custom namespaces..."
kubectl get ns --no-headers | awk '{print $1}' | grep -vE '^(default|kube-system|kube-public)$' | xargs -r kubectl delete ns

# Delete all workloads
echo "Deleting all workloads..."
kubectl delete deployments,statefulsets,daemonsets,services,ingresses,configmaps,secrets,persistentvolumeclaims,jobs,cronjobs --all --all-namespaces
kubectl delete persistentvolumes --all

# Delete managed node groups (if any exist)
echo "Checking for managed node groups..."
NODEGROUPS=$(aws eks list-nodegroups --cluster-name "$CLUSTER_NAME" --region "$REGION" --query "nodegroups[]" --output text)
if [ -n "$NODEGROUPS" ]; then
  # Unquoted on purpose: the text output is a whitespace-separated list.
  for NODEGROUP in $NODEGROUPS; do
    echo "Deleting node group: $NODEGROUP"
    aws eks delete-nodegroup --cluster-name "$CLUSTER_NAME" --nodegroup-name "$NODEGROUP" --region "$REGION"
    echo "Waiting for node group $NODEGROUP to be deleted..."
    aws eks wait nodegroup-deleted --cluster-name "$CLUSTER_NAME" --nodegroup-name "$NODEGROUP" --region "$REGION"
  done
else
  echo "No managed node groups found."
fi

# Delete self-managed EC2 instances
echo "Deleting self-managed nodes..."
INSTANCE_IDS=$(aws ec2 describe-instances --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" --query "Reservations[].Instances[].InstanceId" --output text --region "$REGION")
if [ -n "$INSTANCE_IDS" ]; then
  # Split the whitespace-separated IDs into an array so that each ID is passed
  # as its own argument; a single quoted string holding several IDs is
  # rejected by the CLI. `read -d ''` returns non-zero at end of input, which
  # is harmless here.
  read -r -d '' -a INSTANCE_ID_LIST <<< "$INSTANCE_IDS"
  aws ec2 terminate-instances --instance-ids "${INSTANCE_ID_LIST[@]}" --region "$REGION"
  echo "Waiting for EC2 instances to terminate..."
  aws ec2 wait instance-terminated --instance-ids "${INSTANCE_ID_LIST[@]}" --region "$REGION"
fi

# Delete associated load balancers
echo "Deleting load balancers..."
LB_ARNs=$(aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(LoadBalancerName, '$CLUSTER_NAME')].LoadBalancerArn" --output text --region "$REGION")
for LB_ARN in $LB_ARNs; do
  aws elbv2 delete-load-balancer --load-balancer-arn "$LB_ARN" --region "$REGION"
  echo "Waiting for load balancer $LB_ARN to be deleted..."
  aws elbv2 wait load-balancers-deleted --load-balancer-arns "$LB_ARN" --region "$REGION"
done

# Delete target groups
echo "Deleting target groups..."
TG_ARNs=$(aws elbv2 describe-target-groups --query "TargetGroups[?contains(TargetGroupName, '$CLUSTER_NAME')].TargetGroupArn" --output text --region "$REGION")
for TG_ARN in $TG_ARNs; do
  aws elbv2 delete-target-group --target-group-arn "$TG_ARN" --region "$REGION"
done

# Delete NAT Gateways
echo "Deleting NAT Gateways..."
NAT_GATEWAYS=$(aws ec2 describe-nat-gateways --filter "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" --query "NatGateways[].NatGatewayId" --output text --region "$REGION")
for NAT_ID in $NAT_GATEWAYS; do
  aws ec2 delete-nat-gateway --nat-gateway-id "$NAT_ID" --region "$REGION"
  echo "Waiting for NAT Gateway $NAT_ID to be deleted..."
  aws ec2 wait nat-gateway-deleted --nat-gateway-ids "$NAT_ID" --region "$REGION"
done

# Release Elastic IPs
# NOTE(review): this releases EVERY unassociated Elastic IP in the region, not
# only addresses that belonged to this cluster. Confirm that is acceptable for
# the account before running.
echo "Releasing Elastic IPs..."
EIP_ALLOCS=$(aws ec2 describe-addresses --query "Addresses[?AssociationId==null].AllocationId" --output text --region "$REGION")
for EIP in $EIP_ALLOCS; do
  aws ec2 release-address --allocation-id "$EIP" --region "$REGION"
  echo "Released Elastic IP $EIP"
done

# Release EFS and the created security group
if [ -f "$EFS_ID_FILE" ]; then
  while read -r fs_id; do
    # Skip blank lines so no AWS call is made with an empty ID.
    [ -n "$fs_id" ] || continue
    echo "Processing File System: $fs_id"

    # Get the list of mount targets
    mount_targets=$(aws efs describe-mount-targets --file-system-id "$fs_id" --query "MountTargets[*].MountTargetId" --output text --region "$REGION")

    # Delete each mount target
    for mt_id in $mount_targets; do
      echo "Deleting Mount Target: $mt_id"
      aws efs delete-mount-target --mount-target-id "$mt_id" --region "$REGION"
    done

    # Wait for mount targets to be deleted; the file system cannot be removed
    # while any still exist.
    while [[ -n $(aws efs describe-mount-targets --file-system-id "$fs_id" --query "MountTargets[*].MountTargetId" --output text --region "$REGION") ]]; do
      echo "Waiting for mount targets to be deleted..."
      sleep 10
    done

    # Delete the file system
    echo "Deleting File System: $fs_id"
    aws efs delete-file-system --file-system-id "$fs_id" --region "$REGION"
  done < "$EFS_ID_FILE"
else
  echo "$EFS_ID_FILE not found, skipping EFS file system deletion..."
fi

for sg in $(aws ec2 describe-security-groups --filters "Name=group-name,Values=efs-sg" --query "SecurityGroups[*].GroupId" --output text --region "$REGION"); do
  echo "Deleting Security Group: $sg"
  # Network interfaces still attached to the group block its deletion.
  for eni in $(aws ec2 describe-network-interfaces --filters "Name=group-id,Values=$sg" --query "NetworkInterfaces[*].NetworkInterfaceId" --output text --region "$REGION"); do
    echo "$eni"
    aws ec2 delete-network-interface --network-interface-id "$eni" --region "$REGION"
  done
  aws ec2 delete-security-group --group-id "$sg" --region "$REGION"
done

# Delete the EKS cluster
echo "Deleting EKS cluster..."
aws eks delete-cluster --name "$CLUSTER_NAME" --region "$REGION"
echo "Waiting for cluster $CLUSTER_NAME to be deleted..."
aws eks wait cluster-deleted --name "$CLUSTER_NAME" --region "$REGION"

# Delete CloudFormation Stacks
# NOTE(review): the node-group stack name below is kept as originally written;
# verify it against the stack names eksctl actually created for this cluster.
echo "Checking if CloudFormation stack exists for EKS cluster..."
delete_stack_if_exists "eksctl-${CLUSTER_NAME}-cluster"
delete_stack_if_exists "eksctl-${CLUSTER_NAME}-cluster-nodegroup-gpu-nodegroup"

echo "EKS cluster $CLUSTER_NAME cleanup completed successfully!"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#!/bin/bash
#
# Create an EKS cluster with a GPU node group, attach an EFS-backed
# PersistentVolume to it, and install the vLLM production stack with Helm.
#
# Usage: bash entry_point.sh AWS_REGION YAML_FILE_PATH
#   AWS_REGION      region in which the cluster is created
#   YAML_FILE_PATH  Helm values file passed to `helm install`
#
# This script assumes you have latest AWS cli logged in.
# It writes efs-pv.yaml to, and reads temp.text (produced by set_up_efs.sh)
# from, the current directory.

# Stop at the first failure so that later steps never run against a cluster
# or file system that was not actually created.
set -euo pipefail

USAGE="usage: bash entry_point.sh AWS_REGION YAML_FILE_PATH"
AWS_REGION=${1:?$USAGE}
SETUP_YAML=${2:?$USAGE}
CLUSTER_NAME="production-stack"
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# Validate the values file before any cloud resource is created.
if [[ ! -f "$SETUP_YAML" ]]; then
  echo "Values file not found: $SETUP_YAML" >&2
  exit 1
fi

# Set up EKS cluster
eksctl create cluster \
  --name "$CLUSTER_NAME" \
  --region "$AWS_REGION" \
  --version 1.27 \
  --nodegroup-name gpu-nodegroup \
  --node-type g6e.4xlarge \
  --nodes 2 \
  --nodes-min 2 \
  --nodes-max 2 \
  --managed

# Create EFS (need to be in same VPC as EKS)
bash "$SCRIPT_DIR/set_up_efs.sh" "$CLUSTER_NAME" "$AWS_REGION"

# Create CSI driver
eksctl utils associate-iam-oidc-provider --region "$AWS_REGION" --cluster "$CLUSTER_NAME" --approve

kubectl apply -k "github.com/kubernetes-sigs/aws-efs-csi-driver/deploy/kubernetes/overlays/stable/ecr/?ref=release-1.6"
# Informational listing only: grep exits non-zero when nothing matches, which
# must not abort the script under `set -e` / `pipefail`.
kubectl get pods -n kube-system | grep efs || true
eksctl create iamserviceaccount \
  --region "$AWS_REGION" \
  --name efs-csi-controller-sa \
  --namespace kube-system \
  --cluster "$CLUSTER_NAME" \
  --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \
  --approve

# Create the PV pointing at the file system ID recorded by set_up_efs.sh.
# Storage needed is based on model weights.
if [[ ! -s temp.text ]]; then
  echo "temp.text is missing or empty; EFS file system ID unknown" >&2
  exit 1
fi
EFS_ID=$(<temp.text)

cat <<EOF > efs-pv.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-pv
spec:
  capacity:
    storage: 40Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  csi:
    driver: efs.csi.aws.com
    volumeHandle: $EFS_ID
EOF

kubectl apply -f efs-pv.yaml
kubectl get pv

# Now we start the production stack.
helm repo add vllm https://vllm-project.github.io/production-stack
helm install vllm vllm/vllm-stack -f "$SETUP_YAML"
18 changes: 18 additions & 0 deletions
18
deployment_on_cloud/aws/production_stack_specification.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Helm values for the vLLM production stack on EKS; passed to
# `helm install ... -f` by entry_point.sh.
servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
    - name: "llama8b"
      repository: "vllm/vllm-openai"
      tag: "latest"
      modelURL: "meta-llama/Llama-3.1-8B"

      replicaCount: 2

      requestCPU: 6
      requestMemory: "16Gi"
      requestGPU: 1
      # Placeholder: replace with your own Hugging Face access token.
      hf_token: HUGGINGFACE_TOKEN
      pvcStorage: "40Gi"
      pvcAccessMode:
        - ReadWriteMany
      # Sentinel value that lets the chart's PVC template know the storage
      # class should be rendered as an empty string.
      storageClass: "efs_static_storage_class_indicator"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#!/bin/bash
#
# Create an EFS file system for an existing EKS cluster: a dedicated security
# group ("efs-sg") that admits NFS traffic from the cluster's nodes, the file
# system itself, and mount targets in the cluster's subnets.
#
# Usage: bash set_up_efs.sh CLUSTER_NAME AWS_REGION
#
# Output: the new file system ID is written to temp.text in the current
# directory, where entry_point.sh and clean_up.sh pick it up.
set -e          # Exit on error
set -o pipefail # Exit on first command failure in a pipeline

USAGE="usage: bash set_up_efs.sh CLUSTER_NAME AWS_REGION"
CLUSTER_NAME=${1:?$USAGE}
REGION=${2:?$USAGE}
EFS_NAME="efs_for_eks"
EFS_ID_FILE="temp.text"

# Fetch VPC ID
VPC_ID=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$REGION" \
  --query "cluster.resourcesVpcConfig.vpcId" --output text)

# Fetch Subnet IDs. The command substitution is kept on its own line so that
# a failing aws call is caught by `set -e` rather than hidden inside a
# here-string.
SUBNET_TEXT=$(aws eks describe-cluster --name "$CLUSTER_NAME" --region "$REGION" \
  --query "cluster.resourcesVpcConfig.subnetIds" --output text)
read -r -a SUBNET_IDS <<< "$SUBNET_TEXT"

# Fetch Security Group(s) used by EKS nodes, taken from the first running
# instance tagged with this cluster's name. `| [0]` selects exactly one ID, so
# the value is always safe to pass as a single argument.
INSTANCE_ID=$(aws ec2 describe-instances --region "$REGION" \
  --filters "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" \
            "Name=instance-state-name,Values=running" \
  --query "Reservations[].Instances[].InstanceId | [0]" --output text)
if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "None" ]]; then
  echo "No running instance found for cluster $CLUSTER_NAME in region $REGION" >&2
  exit 1
fi

EKS_SG_TEXT=$(aws ec2 describe-instances --region "$REGION" \
  --instance-ids "$INSTANCE_ID" \
  --query "Reservations[0].Instances[0].SecurityGroups[*].GroupId" --output text)
read -r -a EKS_SG_IDS <<< "$EKS_SG_TEXT"
echo "EKS Security Group ID: ${EKS_SG_IDS[*]}"

# Create Security Group for EFS
EFS_SG_ID=$(aws ec2 create-security-group \
  --group-name efs-sg \
  --description "Allow NFS from EKS" \
  --vpc-id "$VPC_ID" \
  --query "GroupId" --output text --region "$REGION")
echo "Created Security Group for EFS: $EFS_SG_ID"

# Allow NFS traffic (port 2049) from EKS nodes. An instance can carry several
# security groups; each one is authorized separately because --source-group
# takes a single group.
for EKS_SG_ID in "${EKS_SG_IDS[@]}"; do
  aws ec2 authorize-security-group-ingress \
    --region "$REGION" \
    --group-id "$EFS_SG_ID" \
    --protocol tcp \
    --port 2049 \
    --source-group "$EKS_SG_ID"
done

echo "Security group updated to allow NFS traffic."

# Create EFS File System
EFS_ID=$(aws efs create-file-system \
  --region "$REGION" \
  --performance-mode generalPurpose \
  --throughput-mode bursting \
  --tags Key=Name,Value="$EFS_NAME" \
  --query "FileSystemId" --output text)

echo "Created EFS File System: $EFS_ID"

# Record the ID immediately so clean_up.sh can still find and delete the file
# system if a later step in this script fails.
echo "$EFS_ID" > "$EFS_ID_FILE"

# Wait for EFS to be available
echo "Waiting for EFS to become available..."
while true; do
  STATE=$(aws efs describe-file-systems --region "$REGION" \
    --file-system-id "$EFS_ID" \
    --query "FileSystems[0].LifeCycleState" --output text)
  echo "Current EFS state: $STATE"

  if [[ "$STATE" == "available" ]]; then
    echo "EFS is now available!"
    break
  fi

  echo "Waiting for EFS to become available..."
  sleep 10 # Wait 10 seconds before checking again
done

# Create Mount Targets in each subnet. A failure for one subnet is reported
# and skipped instead of aborting the script: EFS accepts only one mount
# target per Availability Zone, and the cluster's subnet list may contain
# more than one subnet in the same zone.
MOUNT_TARGETS_CREATED=0
for SUBNET_ID in "${SUBNET_IDS[@]}"; do
  echo "Creating mount target in subnet: $SUBNET_ID"
  if aws efs create-mount-target \
    --region "$REGION" \
    --file-system-id "$EFS_ID" \
    --subnet-id "$SUBNET_ID" \
    --security-groups "$EFS_SG_ID"; then
    MOUNT_TARGETS_CREATED=$((MOUNT_TARGETS_CREATED + 1))
  else
    echo "Warning: could not create mount target in subnet $SUBNET_ID, continuing..." >&2
  fi
done

# Without at least one mount target the file system cannot be mounted at all.
if [[ "$MOUNT_TARGETS_CREATED" -eq 0 ]]; then
  echo "No mount target could be created for $EFS_ID" >&2
  exit 1
fi

echo "EFS setup complete!"
echo "File System ID: $EFS_ID"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.