|
68 | 68 | " min_memory=8,\n",
|
69 | 69 | " max_memory=8,\n",
|
70 | 70 | " num_gpus=1,\n",
|
| 71 | + " head_gpus=1,\n", |
71 | 72 | " image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
|
72 | 73 | " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
|
73 | 74 | " # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
|
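For orientation, the newly added head_gpus parameter requests GPUs for the Ray head node, while num_gpus applies to each worker. A hedged sketch of the full ClusterConfiguration cell these lines belong to is shown below; the import path and every value not visible above (name, namespace, worker count, CPU requests) are illustrative assumptions, not contents of this notebook.

    from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration  # import path is an assumption

    cluster = Cluster(ClusterConfiguration(
        name="hfgputest",       # hypothetical name
        namespace="default",    # assumption: use your own namespace
        num_workers=1,          # assumption: the worker count is not visible in this hunk
        min_cpus=2,             # assumption
        max_cpus=2,             # assumption
        min_memory=8,
        max_memory=8,
        num_gpus=1,             # GPUs requested per worker
        head_gpus=1,            # GPUs requested for the head node (the parameter added here)
        image="quay.io/project-codeflare/ray:latest-py39-cu118",
        write_to_file=False,
    ))
    cluster.up()                # submit the Ray cluster
    cluster.wait_ready()        # block until it is running
    ray_cluster_uri = cluster.cluster_uri()  # the URI asserted on later in the notebook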
|
147 | 148 | "metadata": {},
|
148 | 149 | "outputs": [],
|
149 | 150 | "source": [
|
150 |
| - "#before proceeding make sure the cluster exists and the uri is not empty\n", |
| 151 | + "# before proceeding make sure the cluster exists and the uri is not empty\n", |
151 | 152 | "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n",
|
152 | 153 | "\n",
|
153 | 154 | "import ray\n",
|
154 |
| - "from ray.air.config import ScalingConfig\n", |
155 | 155 | "\n",
|
156 | 156 | "# reset the ray context in case there's already one. \n",
|
157 | 157 | "ray.shutdown()\n",
|
158 | 158 | "# establish connection to ray cluster\n",
|
159 | 159 | "\n",
|
160 |
| - "#install additional libraries that will be required for model training\n", |
161 |
| - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", |
162 |
| - "\n", |
| 160 | + "# install additional libraries that will be required for model training\n", |
| 161 | + "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"accelerate\", \"scikit-learn\"]}\n", |
163 | 162 | "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n",
|
164 | 163 | "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n",
|
165 | 164 | "ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n",
|
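Once ray.init() returns without error, a quick sanity check is to confirm that the client really is attached to the remote cluster and can see its resources. This is only a hedged sketch; ray.is_initialized() and ray.cluster_resources() are standard Ray APIs, and the exact output depends on the cluster configured above.

    # Optional sanity check after connecting
    assert ray.is_initialized()
    print(ray.cluster_resources())  # should report the remote cluster's CPUs, memory, and GPUs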
|
172 | 171 | "id": "9711030b",
|
173 | 172 | "metadata": {},
|
174 | 173 | "source": [
|
175 |
| - "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" |
| 174 | + "Now that we are connected (and have passed in some package requirements), let's try writing some training code:" |
176 | 175 | ]
|
177 | 176 | },
|
178 | 177 | {
|
|
184 | 183 | "source": [
|
185 | 184 | "@ray.remote\n",
|
186 | 185 | "def train_fn():\n",
|
187 |
| - " from datasets import load_dataset\n", |
188 |
| - " import transformers\n", |
189 |
| - " from transformers import AutoTokenizer, TrainingArguments\n", |
190 |
| - " from transformers import AutoModelForSequenceClassification\n", |
| 186 | + " import os\n", |
191 | 187 | " import numpy as np\n",
|
192 |
| - " from datasets import load_metric\n", |
193 |
| - " import ray\n", |
194 |
| - " from ray import tune\n", |
195 |
| - " from ray.train.huggingface import HuggingFaceTrainer\n", |
196 |
| - "\n", |
197 |
| - " dataset = load_dataset(\"imdb\")\n", |
198 |
| - " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", |
199 |
| - "\n", |
200 |
| - " def tokenize_function(examples):\n", |
201 |
| - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", |
| 188 | + " import evaluate\n", |
| 189 | + "    from datasets import load_dataset\n", |
| 190 | + " import transformers\n", |
| 191 | + " from transformers import (\n", |
| 192 | + " Trainer,\n", |
| 193 | + " TrainingArguments,\n", |
| 194 | + " AutoTokenizer,\n", |
| 195 | + " AutoModelForSequenceClassification,\n", |
| 196 | + " )\n", |
| 197 | + " import ray.train.huggingface.transformers\n", |
| 198 | + " from ray.train import ScalingConfig\n", |
| 199 | + " from ray.train.torch import TorchTrainer\n", |
202 | 200 | "\n",
|
203 |
| - " tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", |
| 201 | + "    # For S3 persistent storage, replace the following environment variables with your AWS credentials.\n", |
| 202 | + "    # See https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html for information on how to set up an S3 bucket.\n", |
| 203 | + " os.environ[\"AWS_ACCESS_KEY_ID\"] = \"XXXXXXXX\"\n", |
| 204 | + " os.environ[\"AWS_SECRET_ACCESS_KEY\"] = \"XXXXXXXX\"\n", |
| 205 | + " os.environ[\"AWS_DEFAULT_REGION\"] = \"XXXXXXXX\"\n", |
| 206 | + " \n", |
| 207 | + " def train_func():\n", |
| 208 | + " # Datasets\n", |
| 209 | + " dataset = load_dataset(\"imdb\")\n", |
| 210 | + " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", |
204 | 211 | "\n",
|
205 |
| - " #using a fraction of dataset but you can run with the full dataset\n", |
206 |
| - " small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n", |
207 |
| - " small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n", |
| 212 | + " def tokenize_function(examples):\n", |
| 213 | + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", |
208 | 214 | "\n",
|
209 |
| - " print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n", |
| 215 | + " small_train_dataset = (\n", |
| 216 | + " dataset[\"train\"].select(range(100)).map(tokenize_function, batched=True)\n", |
| 217 | + " )\n", |
| 218 | + " small_eval_dataset = (\n", |
| 219 | + " dataset[\"test\"].select(range(100)).map(tokenize_function, batched=True)\n", |
| 220 | + " )\n", |
210 | 221 | "\n",
|
211 |
| - " ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n", |
212 |
| - " ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n", |
| 222 | + " # Model\n", |
| 223 | + " model = AutoModelForSequenceClassification.from_pretrained(\n", |
| 224 | + " \"distilbert-base-uncased\", num_labels=2\n", |
| 225 | + " )\n", |
213 | 226 | "\n",
|
214 |
| - " def compute_metrics(eval_pred):\n", |
215 |
| - " metric = load_metric(\"accuracy\")\n", |
216 |
| - " logits, labels = eval_pred\n", |
217 |
| - " predictions = np.argmax(logits, axis=-1)\n", |
218 |
| - " return metric.compute(predictions=predictions, references=labels)\n", |
| 227 | + " def compute_metrics(eval_pred):\n", |
| 228 | + "            metric = evaluate.load(\"accuracy\")\n", |
| 229 | + " logits, labels = eval_pred\n", |
| 230 | + " predictions = np.argmax(logits, axis=-1)\n", |
| 231 | + " return metric.compute(predictions=predictions, references=labels)\n", |
219 | 232 | "\n",
|
220 |
| - " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", |
221 |
| - " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n", |
| 233 | + " # Hugging Face Trainer\n", |
| 234 | + " training_args = TrainingArguments(\n", |
| 235 | + " output_dir=\"test_trainer\",\n", |
| 236 | + " evaluation_strategy=\"epoch\",\n", |
| 237 | + " save_strategy=\"epoch\",\n", |
| 238 | + " report_to=\"none\",\n", |
| 239 | + " )\n", |
222 | 240 | "\n",
|
223 |
| - " training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n", |
224 |
| - " num_train_epochs=1, skip_memory_metrics=True,\n", |
225 |
| - " learning_rate=2e-5,\n", |
226 |
| - " per_device_train_batch_size=16,\n", |
227 |
| - " per_device_eval_batch_size=16, \n", |
228 |
| - " weight_decay=0.01,)\n", |
229 |
| - " return transformers.Trainer(\n", |
| 241 | + " trainer = Trainer(\n", |
230 | 242 | " model=model,\n",
|
231 | 243 | " args=training_args,\n",
|
232 |
| - " train_dataset=train_dataset,\n", |
233 |
| - " eval_dataset=eval_dataset,\n", |
234 |
| - " compute_metrics=compute_metrics\n", |
| 244 | + " train_dataset=small_train_dataset,\n", |
| 245 | + " eval_dataset=small_eval_dataset,\n", |
| 246 | + " compute_metrics=compute_metrics,\n", |
235 | 247 | " )\n",
|
236 | 248 | "\n",
|
237 |
| - " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", |
238 | 249 | "\n",
|
239 |
| - " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. \n", |
240 |
| - " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", |
241 |
| - " trainer = HuggingFaceTrainer(\n", |
242 |
| - " trainer_init_per_worker=trainer_init_per_worker,\n", |
243 |
| - " scaling_config=scaling_config,\n", |
244 |
| - " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", |
| 250 | + " callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n", |
| 251 | + " trainer.add_callback(callback)\n", |
| 252 | + "\n", |
| 253 | + " trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n", |
| 254 | + "\n", |
| 255 | + " trainer.train()\n", |
| 256 | + "\n", |
| 257 | + "\n", |
| 258 | + " ray_trainer = TorchTrainer(\n", |
| 259 | + " train_func,\n", |
| 260 | + " scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n", |
| 261 | + " # Configure the run's persistent storage that is accessible across \n", |
| 262 | + " # all worker nodes\n", |
| 263 | + "        # Update the RunConfig below with your S3 bucket details\n", |
| 264 | + " run_config=ray.train.RunConfig(storage_path=\"s3://BUCKET_NAME/SUB_PATH/\", name=\"unique_run_name\"),\n", |
245 | 265 | " )\n",
|
246 |
| - " result = trainer.fit()" |
| 266 | + " result: ray.train.Result = ray_trainer.fit()" |
247 | 267 | ]
|
248 | 268 | },
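Once ray_trainer.fit() returns, the ray.train.Result it produces can be inspected in a follow-up cell. The sketch below uses the standard Ray Train 2.x Result attributes (metrics and checkpoint) and ends by disconnecting from the cluster; none of it is taken from the notebook itself.

    # Inspect the outcome of the training run
    print(result.metrics)     # final reported metrics, e.g. loss and eval accuracy
    print(result.checkpoint)  # last checkpoint, persisted under the S3 storage_path
    ray.shutdown()            # disconnect from the Ray cluster when finished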
|
249 | 269 | {
|
|