adding fine tune example with s3 as the dataset store #2006
Changes from all commits: e9b736f, 612cbba, 0e7c7bf, 5193af8, e4ec55c, be05f8e
Review thread (via ReviewNB):

Reviewer: @deepanker13 Could you set keys instead of dummy keys, like this:

    # Need to set S3 ACCESS_KEY
    S3_ACCESS_KEY = ""

deepanker13: done

Reviewer: @deepanker13 I meant that we should define the vars here, or in a dedicated block, like this:

    # Need to set S3 credentials
    s3_access_key = ""
    s3_secret_key = ""

And then, we should use those vars here:

    # it is assumed for text related tasks, you have 'text' column in the dataset.
    # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py
    dataset_provider_parameters=S3DatasetParams(
        {
            "endpoint_url": "http://10.117.63.3",
            "bucket_name": "test",
            "file_key": "imdatta0___ultrachat_1k",
            "region_name": "us-east-1",
   +        "access_key": s3_access_key,
   +        "secret_key": s3_secret_key,
        }
    ),

deepanker13: oh ok, done
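One way to follow the reviewer's suggestion without ever hardcoding secrets is to read the credentials from environment variables; a minimal sketch (the variable names below are illustrative, not from the PR):

    import os

    # Read S3 credentials from the environment instead of editing the notebook.
    # S3_ACCESS_KEY / S3_SECRET_KEY are illustrative names, not part of the PR.
    s3_access_key = os.environ.get("S3_ACCESS_KEY", "")
    s3_secret_key = os.environ.get("S3_SECRET_KEY", "")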
The first file in the diff is the new example notebook:

@@ -0,0 +1,164 @@
{
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# install kubeflow-training extra 'huggingface'\n", | ||
"!pip install -U 'kubeflow-training[huggingface]'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# import the libraries\n", | ||
"from kubeflow.training.api.training_client import TrainingClient\n", | ||
"from kubeflow.storage_initializer.hugging_face import (\n", | ||
" HuggingFaceModelParams,\n", | ||
" HuggingFaceTrainParams,\n", | ||
" HfDatasetParams,\n", | ||
")\n", | ||
"from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n", | ||
"from peft import LoraConfig\n", | ||
"import transformers\n", | ||
"from transformers import TrainingArguments\n", | ||
"from kubeflow.training import constants" | ||
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a training client; pass the config_file parameter to use a kubeconfig other than \"~/.kube/config\"\n",
    "client = TrainingClient()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "USING S3 AS THE DATASET SOURCE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Need to set S3 credentials\n",
    "s3_access_key = \"\"\n",
    "s3_secret_key = \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify the model, dataset, and training parameters\n",
    "client.train(\n",
    "    name=\"s3-test\",\n",
    "    num_workers=2,\n",
    "    num_procs_per_worker=1,\n",
    "    # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n",
    "    # storage_config={\n",
    "    #     \"size\": \"10Gi\",\n",
    "    #     \"storage_class\": \"<your storage class>\",\n",
    "    # },\n",
    "    model_provider_parameters=HuggingFaceModelParams(\n",
    "        model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
    "        transformer_type=transformers.AutoModelForCausalLM,\n",
    "    ),\n",
" # it is assumed for text related tasks, you have 'text' column in the dataset.\n", | ||
" # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n", | ||
" dataset_provider_parameters=S3DatasetParams(\n", | ||
" {\n", | ||
" \"endpoint_url\": \"http://10.117.63.3\",\n", | ||
" \"bucket_name\": \"test\",\n", | ||
" \"file_key\": \"imdatta0___ultrachat_1k\",\n", | ||
" \"region_name\": \"us-east-1\",\n", | ||
" \"access_key\": s3_access_key,\n", | ||
" \"secret_key\": s3_secret_key,\n", | ||
" }\n", | ||
" ),\n", | ||
" train_parameters=HuggingFaceTrainParams(\n", | ||
" lora_config=LoraConfig(\n", | ||
" r=8,\n", | ||
" lora_alpha=8,\n", | ||
" lora_dropout=0.2,\n", | ||
" bias=\"none\",\n", | ||
" task_type=\"CAUSAL_LM\",\n", | ||
" ),\n", | ||
" training_parameters=TrainingArguments(\n", | ||
" num_train_epochs=1,\n", | ||
" per_device_train_batch_size=1,\n", | ||
" gradient_accumulation_steps=1,\n", | ||
" gradient_checkpointing=True,\n", | ||
" gradient_checkpointing_kwargs={\n", | ||
" \"use_reentrant\": False\n", | ||
" }, # this is mandatory if checkpointng is enabled\n", | ||
" warmup_steps=0.02,\n", | ||
" learning_rate=1,\n", | ||
" lr_scheduler_type=\"cosine\",\n", | ||
" bf16=False,\n", | ||
" logging_steps=0.01,\n", | ||
" output_dir=INIT_CONTAINER_MOUNT_PATH,\n", | ||
" optim=f\"sgd\",\n", | ||
" save_steps=0.01,\n", | ||
" save_total_limit=3,\n", | ||
" disable_tqdm=False,\n", | ||
" resume_from_checkpoint=True,\n", | ||
" remove_unused_columns=True,\n", | ||
" ddp_backend=\"nccl\", # change the backend to gloo if you want cpu based training and remove the gpu key in resources_per_worker\n", | ||
" ),\n", | ||
" ),\n", | ||
" resources_per_worker={\n", | ||
" \"gpu\": 1,\n", | ||
" \"cpu\": 8,\n", | ||
" \"memory\": \"8Gi\",\n", | ||
" }, # remove the gpu key if you don't want to attach gpus to the pods\n", | ||
")" | ||
] | ||
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check the logs of the job\n",
    "client.get_job_logs(name=\"s3-test\", job_kind=constants.PYTORCHJOB_KIND)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv3.11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
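Once the notebook has submitted the job, it can be polled and cleaned up with the same client. A minimal sketch, assuming the is_job_succeeded, is_job_failed, and delete_job helpers exposed by recent kubeflow-training SDK releases (method availability may vary by version):

    import time

    from kubeflow.training import constants
    from kubeflow.training.api.training_client import TrainingClient

    client = TrainingClient()

    # Poll the PyTorchJob created by client.train() until it finishes.
    while True:
        if client.is_job_succeeded(name="s3-test", job_kind=constants.PYTORCHJOB_KIND):
            print("Job succeeded")
            break
        if client.is_job_failed(name="s3-test", job_kind=constants.PYTORCHJOB_KIND):
            print("Job failed; check the logs")
            break
        time.sleep(30)

    # Delete the finished job; the storage-initializer PVC is managed separately.
    client.delete_job(name="s3-test")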
The second file in the diff updates download_dataset in the S3 storage initializer:
@@ -42,18 +42,31 @@ def download_dataset(self):
         import boto3
 
         # Create an S3 client for Nutanix Object Store/S3
-        s3_client = boto3.client(
-            "s3",
+        s3_client = boto3.Session(
             aws_access_key_id=self.config.access_key,
             aws_secret_access_key=self.config.secret_key,
-            endpoint_url=self.config.endpoint_url,
             region_name=self.config.region_name,
         )
+        s3_resource = s3_client.resource("s3", endpoint_url=self.config.endpoint_url)
+        # Get the bucket object
+        bucket = s3_resource.Bucket(self.config.bucket_name)
 
-        # Download the file
-        s3_client.download_file(
-            self.config.bucket_name,
-            self.config.file_key,
-            os.path.join(VOLUME_PATH_DATASET, self.config.file_key),
-        )
-        print(f"File downloaded to: {VOLUME_PATH_DATASET}")
+        # Filter objects with the specified prefix
+        objects = bucket.objects.filter(Prefix=self.config.file_key)
+        # Iterate over the filtered objects
+        for obj in objects:
+            # Extract the object key (filename)
+            obj_key = obj.key
+            path_components = obj_key.split(os.path.sep)
+            path_excluded_first_last_parts = os.path.sep.join(path_components[1:-1])
+
+            # Create directories if they don't exist
+            os.makedirs(
+                os.path.join(VOLUME_PATH_DATASET, path_excluded_first_last_parts),
+                exist_ok=True,
+            )
+
+            # Download the file
+            file_path = os.path.sep.join(path_components[1:])
+            bucket.download_file(obj_key, os.path.join(VOLUME_PATH_DATASET, file_path))
+        print("Files downloaded")

Review thread:

tenzen-y: What is the reason that we should pass the endpoint_url?

deepanker13: @tenzen-y With endpoint URLs, it can work with any S3-protocol-compliant implementation.

tenzen-y: That makes sense.
Review thread (via ReviewNB):

Reviewer: The access_key and secret_key are still remaining.

deepanker13: they are invalid keys
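For readers skimming the diff above: the key change is that file_key is now treated as an object prefix, so multi-file datasets (for example, a dataset saved as a directory) are mirrored locally instead of fetching a single object. A rough standalone sketch of the same pattern; the endpoint, bucket, and prefix values are placeholders, not the PR's test setup:

    import os

    import boto3

    # Placeholder values; not from the PR.
    LOCAL_DATASET_DIR = "/tmp/dataset"
    S3_ENDPOINT = "http://s3.example.com"
    BUCKET = "my-bucket"
    PREFIX = "my-dataset"

    session = boto3.Session(
        aws_access_key_id="<access key>",
        aws_secret_access_key="<secret key>",
        region_name="us-east-1",
    )
    # endpoint_url makes this work against any S3-compatible object store.
    bucket = session.resource("s3", endpoint_url=S3_ENDPOINT).Bucket(BUCKET)

    # Treat PREFIX like a directory and mirror every matching object locally.
    for obj in bucket.objects.filter(Prefix=PREFIX):
        parts = obj.key.split("/")  # S3 keys always use "/" as the separator
        target = os.path.join(LOCAL_DATASET_DIR, *parts[1:])  # drop the prefix component
        os.makedirs(os.path.dirname(target), exist_ok=True)
        bucket.download_file(obj.key, target)
    print("Files downloaded")

Passing endpoint_url to resource() rather than relying on AWS defaults is what lets the same code talk to MinIO, Nutanix Objects, or any other S3-compatible store, which is the point tenzen-y raised above. One caveat: the PR splits object keys on os.path.sep, which equals "/" only on POSIX hosts; the sketch hardcodes "/", since that is what S3 keys use regardless of platform.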