2 changes: 1 addition & 1 deletion .gitignore
@@ -29,7 +29,7 @@ python/dist
python/FedML.egg-info

doc/deploy
doc/en/_build
doc/en/_build/doctrees


*.h5
3 changes: 1 addition & 2 deletions app/fedcv/image_classification/config/fedml_config.yaml
@@ -22,9 +22,8 @@ train_args:
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
weight_decay: 0.001
metric: "prc-auc"

validation_args:
frequency_of_the_test: 5
@@ -21,14 +21,15 @@ def train(self, train_data, device, args):

criterion = nn.CrossEntropyLoss().to(device)
if args.client_optimizer == "sgd":
optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
else:
optimizer = torch.optim.Adam(
filter(lambda p: p.requires_grad, model.parameters()),
lr=args.learning_rate,
lr=args.lr,
weight_decay=args.weight_decay,
amsgrad=True,
)

epoch_loss = []
for epoch in range(args.epochs):
batch_loss = []
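The config rename and the trainer change above have to land together: the example trainers read hyperparameters as attributes on `args`, so once the trainer asks for `args.lr`, the YAML must define `lr` rather than `learning_rate`. A minimal sketch of that assumption follows; the loader below is illustrative only, not the framework's actual argument parser.

```python
# Illustrative only: mimics how a YAML train_args block ends up as attributes
# on an args object, so the key name must match what the trainer reads.
import torch
import yaml


class Args:
    pass


def load_train_args(path):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    args = Args()
    for key, value in cfg["train_args"].items():
        setattr(args, key, value)  # "lr: 0.03" becomes args.lr == 0.03
    return args


args = load_train_args("config/fedml_config.yaml")
model = torch.nn.Linear(10, 2)
# After this PR the trainer uses args.lr, so a config that still said
# "learning_rate" would raise AttributeError on the next line.
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
```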
8 changes: 5 additions & 3 deletions app/fedcv/image_segmentation/config/fedml_config.yaml
@@ -21,9 +21,11 @@ train_args:
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
momentum: 0.9
nesterov: true
weight_decay: 0.001
metric: "prc-auc"
loss_type: "ce"

validation_args:
frequency_of_the_test: 5
@@ -43,4 +45,4 @@ tracking_args:
enable_wandb: false
wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
wandb_project: fedml
wandb_name: fedml_torch_image_classification
wandb_name: fedml_torch_image_segmentation
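The segmentation config now carries `momentum`, `nesterov`, and `loss_type` alongside `lr` and `weight_decay`. How the segmentation trainer consumes these keys is not shown in this diff; the sketch below is only a plausible mapping onto a PyTorch optimizer and loss, not the repository's actual code.

```python
# Hypothetical mapping of the new YAML keys onto PyTorch objects; the real
# image_segmentation trainer may construct its optimizer and loss differently.
import torch
from torch import nn


def build_optimizer_and_loss(model, args):
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=args.lr,                     # renamed from learning_rate in this PR
        momentum=args.momentum,         # new key: 0.9
        nesterov=args.nesterov,         # new key: true
        weight_decay=args.weight_decay,
    )
    # loss_type "ce" selects plain cross-entropy in this sketch; anything else
    # falls back to a binary-segmentation loss.
    criterion = nn.CrossEntropyLoss() if args.loss_type == "ce" else nn.BCEWithLogitsLoss()
    return optimizer, criterion
```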
@@ -10,7 +10,7 @@ data_args:

model_args:
model: "densenet"
class_num: 1000
class_num: 14

train_args:
federated_optimizer: "FedAvg"
@@ -21,9 +21,8 @@ train_args:
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
weight_decay: 0.001
metric: "prc-auc"

validation_args:
frequency_of_the_test: 5
7 changes: 3 additions & 4 deletions app/fedcv/object_detection/config/fedml_config.yaml
@@ -9,7 +9,7 @@
partition_alpha: 0.5

model_args:
model: "unet"
model: "yolo"
class_num: 1000

train_args:
@@ -21,9 +21,8 @@
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
weight_decay: 0.001
metric: "prc-auc"

validation_args:
frequency_of_the_test: 5
@@ -43,4 +42,4 @@
enable_wandb: false
wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
wandb_project: fedml
wandb_name: fedml_torch_image_classification
wandb_name: fedml_torch_object_detection
25 changes: 15 additions & 10 deletions app/fednlp/README.md
@@ -35,62 +35,67 @@ For each of these make sure the datapaths and the gpu config paths are given cor

**TEXT CLASSIFICATION**

Read data/README.md for more details of datasets available
Read `data/README.md` for more details of datasets available

Adjust the hyperparameters in fednlp/text_classification/config/fedml_config.yaml

To run text classification using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/text_classification/run_step_by_step_example.sh

```

**SEQ TAGGING**

Read data/README.md for more details of datasets available
Read `data/README.md` for more details of datasets available

Adjust the hyperparameters in fednlp/seq_tagging/config/fedml_config.yaml

To run sequence tagging on Onto dataset using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/seq_tagging/run_step_by_step_example.sh

```

**SPAN EXTRACTION**

Adjust the hyperparameters in fednlp/span_extraction/config/fedml_config.yaml and make sure data file paths are correct
Adjust the hyperparameters in `fednlp/span_extraction/config/fedml_config.yaml` and make sure data file paths are correct

To run span extraction on MRQA dataset using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/span_extraction/run_step_by_step_example.sh
```


**SEQ2SEQ**

Read data/README.md for more details of datasets available
Read `data/README.md` for more details of datasets available

Adjust the hyperparameters in fednlp/seq2seq/config/fedml_config.yaml
Adjust the hyperparameters in `fednlp/seq2seq/config/fedml_config.yaml` and make sure data file paths are correct

To run seq2seq using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/seq2seq/run_step_by_step_example.sh
```


We have provided examples of trainers in each example. For running custom trainers feel free to follow the folder {application_name}/trainer/ and write your own custom trainer. To include this trainer please follow the create_model function in the python executable in the folder {application_name}/ and replace the current trainer with your own trainer. Every trainer should inherit the Client Trainer class and should contain a train and a test function.
We have provided examples of trainers in each example. For running custom trainers feel free to follow the folder `{application_name}/trainer/` and write your own custom trainer. To include this trainer please follow the create_model function in the python executable in the folder `{application_name}/` and replace the current trainer with your own trainer. Every trainer should inherit the Client Trainer class and should contain a train and a test function.
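A bare-bones custom trainer along those lines might look like the sketch below. It assumes the `ClientTrainer` base class is importable from `fedml.core` and that it expects `get_model_params`/`set_model_params` in addition to `train` and `test`, as the bundled example trainers do; check the trainers under `{application_name}/trainer/` for the exact interface.

```python
# Hypothetical custom trainer; the import path, required methods, and batch
# format (dicts of tensors) are assumptions based on the bundled examples.
import torch

from fedml.core import ClientTrainer  # import path assumed


class MyCustomTrainer(ClientTrainer):
    def get_model_params(self):
        return self.model.cpu().state_dict()

    def set_model_params(self, model_parameters):
        self.model.load_state_dict(model_parameters)

    def train(self, train_data, device, args):
        model = self.model.to(device)
        model.train()
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        for epoch in range(args.epochs):
            for batch in train_data:
                inputs = {k: v.to(device) for k, v in batch.items()}
                optimizer.zero_grad()
                loss = model(**inputs)[0]  # HF models return the loss first when labels are passed
                loss.backward()
                optimizer.step()

    def test(self, test_data, device, args):
        model = self.model.to(device)
        model.eval()
        total_loss, n_batches = 0.0, 0
        with torch.no_grad():
            for batch in test_data:
                inputs = {k: v.to(device) for k, v in batch.items()}
                total_loss += model(**inputs)[0].item()
                n_batches += 1
        return {"eval_loss": total_loss / max(n_batches, 1)}
```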


We have provided examples with BERT and DistilBert for text classification, seq tagging and span extraction and BART for Seq2Seq. For using any other model from Huggingface Transformers please look at the create_model function in the python executable in the folder {application_name}/. Also please ensure that you are using the correct tokenizer in {application_name}/data/data_loader.py
We have provided examples with BERT and DistilBert for text classification, seq tagging and span extraction and BART for Seq2Seq. For using any other model from Huggingface Transformers please look at the create_model function in the python executable in the folder `{application_name}/`. Also please ensure that you are using the correct tokenizer in `{application_name}/data/data_loader.py`
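If you swap in a different Hugging Face checkpoint, the rough shape of the change is sketched below; the actual `create_model` signature differs per application, so treat the function name, arguments, and return values here as placeholders.

```python
# Hypothetical create_model variant using the transformers Auto* classes; the
# real per-application create_model functions have different signatures.
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer


def create_model(model_name, num_labels):
    # e.g. model_name = "roberta-base" instead of "bert-base-uncased"
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    # Keep the tokenizer in {application_name}/data/data_loader.py consistent
    # with the checkpoint, otherwise token ids will not match the model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return config, model, tokenizer
```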


* Here {application_name} refers to any one of text_classification, span_extraction, seq_tagging or seq2seq.
29 changes: 15 additions & 14 deletions app/fednlp/seq2seq/config/fedml_config.yaml
@@ -19,22 +19,23 @@ model_args:
train_args:
federated_optimizer: "FedAvg"
client_id_list: "[]"
client_num_in_total: 10
client_num_per_round: 4
client_num_in_total: 100
client_num_per_round: 1
comm_round: 1
epochs: 1
batch_size: 8
eval_batch_size: 8
max_seq_length: 128
epochs: 5
batch_size: 4
eval_batch_size: 4
max_seq_length: 256
fp16: false
output_dir: "home/ubuntu/output_dir"
client_optimizer: AdamW
client_optimizer: sgd
server_optimizer: sgd
server_lr: 0.01
learning_rate: 0.03
server_lr: 0.1
learning_rate: 0.0001
weight_decay: 0.001
gradient_accumulation_steps: 1
clip_grad_norm: 0
clip_grad_norm: true
max_grad_norm: 1
fedprox_mu: 1
evaluate_during_training: false
evaluate_during_training_steps: 10
@@ -45,10 +46,10 @@
frequency_of_the_test: 5

device_args:
worker_num: 4
using_gpu: false
gpu_mapping_file: config/gpu_mapping.yaml
gpu_mapping_key: mapping_default
worker_num: 1
using_gpu: true
gpu_mapping_file: /home/ubuntu/FedML/app/fednlp/seq2seq/config/gpu_mapping.yaml
gpu_mapping_key: mapping_fednlp_sp

comm_args:
backend: "MPI"
5 changes: 3 additions & 2 deletions app/fednlp/seq2seq/config/gpu_mapping.yaml
@@ -56,6 +56,7 @@ mapping_FedML_gRPC:
mapping_FedML_tRPC:
lambda-server1: [0, 0, 0, 0, 2, 2, 1, 1]
lambda-server2: [2, 1, 1, 1, 0, 0, 0, 0]

mapping_fednlp_sp:
hostname_node_1: [2]
#mapping_FedML_tRPC:
# lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
# lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
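Judging from the entries in this file, each mapping key names a host and gives one integer per GPU: the number of MPI processes (the `worker_num` clients plus the aggregation server) to pin to that GPU, so the list should sum to `worker_num + 1`. A hypothetical two-GPU variant of the new entry, with placeholder hostname:

```yaml
# Hypothetical example only; hostname and GPU split are placeholders.
mapping_fednlp_sp_2gpu:
  node-a: [1, 1]   # 2 processes total (1 server + 1 client), one per GPU
```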
2 changes: 1 addition & 1 deletion app/fednlp/seq2seq/run_step_by_step_example.sh
@@ -8,4 +8,4 @@ hostname > mpi_host_file

$(which mpirun) -np $PROCESS_NUM \
-hostfile mpi_host_file \
/home/ubuntu/fednlp_migration/bin/python3.8 -m fednlp.seq2seq.torch_fedavg_20news_bert_step_by_step_example --cf fednlp/seq2seq/config/fedml_config.yaml
/home/ubuntu/fedml/bin/python3.8 -m fednlp.seq2seq.torch_fedavg_20news_bert_step_by_step_example --cf fednlp/seq2seq/config/fedml_config.yaml
@@ -58,6 +58,7 @@ def create_model(args, device, output_dim=1):
"output_dir": args.output_dir,
"is_debug_mode": args.is_debug_mode,
"fedprox_mu": args.fedprox_mu,
"optimizer": args.client_optimizer,
}
)

@@ -69,6 +70,7 @@ def create_model(args, device, output_dim=1):
model_config = {}
config = config_class.from_pretrained(args.model, **model_config)
model = model_class.from_pretrained(args.model, config=config)
print("reached_here")
trainer = MySSTrainer(model_args, device, model, tokenizer=tokenizer)
return model, trainer

61 changes: 32 additions & 29 deletions app/fednlp/seq2seq/trainer/seq2seq_trainer.py
@@ -19,6 +19,7 @@
AdamW,
get_linear_schedule_with_warmup,
)
from tqdm import tqdm


class MyModelTrainer(ClientTrainer):
@@ -133,10 +134,10 @@ def train(self, train_data, device, args, test_data=None):

if self.args.fl_algorithm == "FedProx":
global_model = copy.deepcopy(self.model)

epoch_loss = []
for epoch in range(0, args.epochs):

for batch_idx, batch in enumerate(train_data):
batch_loss = []
for batch_idx, batch in tqdm(enumerate(train_data)):
self.model.train()
# batch = tuple(t.to(device) for t in batch)
# dataset = TensorDataset(all_guid, all_input_ids, all_attention_masks, all_token_type_ids, all_cls_index,
@@ -146,17 +147,14 @@

if args.fp16:
with amp.autocast():
print("reached here")
outputs = self.model(**inputs)
# model outputs are always tuple in pytorch-transformers (see doc)
loss = outputs[0]
print("reached here")
else:
print("reached here")
outputs = self.model(**inputs)
# model outputs are always tuple in pytorch-transformers (see doc)
loss = outputs[0]
print("reached here")

if args.n_gpu > 1:
loss = (
@@ -173,22 +171,19 @@
loss += fed_prox_reg

current_loss = loss.item()
print("reached here")

if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
print("reached here")
if args.fp16:
scaler.scale(loss).backward()
else:
loss.backward()
print("reached here")
tr_loss += loss.item()

logging.info(
"epoch = %d, batch_idx = %d/%d, loss = %s"
% (epoch, batch_idx, len(train_data), current_loss)
)
# logging.info(
# "epoch = %d, batch_idx = %d/%d, loss = %s"
# % (epoch, batch_idx, len(train_data), current_loss)
# )

if (batch_idx + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
@@ -205,17 +200,25 @@
scheduler.step() # Update learning rate schedule
self.model.zero_grad()
global_step += 1
batch_loss.append(tr_loss)
tr_loss = 0

if (
self.args.evaluate_during_training
and (
self.args.evaluate_during_training_steps > 0
and global_step % self.args.evaluate_during_training_steps == 0
)
and test_data is not None
):
results, _, _ = self.test(test_data, device, args)
logging.info(results)
# epoch_loss.append(sum(batch_loss) / len(batch_loss))
logging.info(
"Client Index = {}\tEpoch: {}\tLoss: {:.6f}".format(
self.id, epoch, sum(batch_loss) / len(batch_loss)
)
)
if (
self.args.evaluate_during_training
and (
self.args.evaluate_during_training_steps > 0
and global_step % self.args.evaluate_during_training_steps == 0
)
and test_data is not None
):
results, _, _ = self.test(test_data, device, args)
logging.info(results)

def test(self, test_data, device, args):

@@ -244,7 +247,7 @@ def test(self, test_data, device, args):
logging.info("len(test_dl) = %d, n_batches = %d" % (len(test_data), n_batches))
for i, batch in enumerate(test_data):
# batch = tuple(t for t in batch)
inputs = self._get_inputs_dict(batch)
inputs = self._get_inputs_dict(batch, device)
with torch.no_grad():
outputs = self.model(**inputs)
tmp_eval_loss = outputs[0]
@@ -285,10 +288,10 @@
if i != (n_batches - 1)
else test_sample_len
)
logging.info(
"batch index = %d, start_index = %d, end_index = %d"
% (i, start_index, end_index)
)
# logging.info(
# "batch index = %d, start_index = %d, end_index = %d"
# % (i, start_index, end_index)
# )

eval_loss = eval_loss / nb_eval_steps
rouge_score = rouge_score / nb_eval_steps
@@ -315,7 +318,7 @@
# result = self.compute_metrics(references, model_preds)
# self.results.update(result)

logging.info(self.results)
# logging.info(self.results)

return result, model_preds, None

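The training loop above combines gradient accumulation, optional fp16 scaling, and gradient clipping driven by the `gradient_accumulation_steps`, `clip_grad_norm`, and `max_grad_norm` keys in `fedml_config.yaml`. The standalone sketch below restates that accumulate-then-step pattern outside the trainer class; it is a simplified illustration under the assumption of dict-style batches, not a drop-in replacement for `MyModelTrainer.train`.

```python
# Simplified restatement of the accumulate-then-step pattern used above.
import torch


def run_epoch(model, train_data, optimizer, args, device):
    model.train()
    batch_loss, running = [], 0.0
    for batch_idx, batch in enumerate(train_data):
        inputs = {k: v.to(device) for k, v in batch.items()}  # assumes dict batches
        loss = model(**inputs)[0]
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps  # keep gradient scale comparable
        loss.backward()
        running += loss.item()
        if (batch_idx + 1) % args.gradient_accumulation_steps == 0:
            if args.clip_grad_norm:  # now a boolean in the seq2seq config
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            model.zero_grad()
            batch_loss.append(running)  # per-optimizer-step loss, as in the new logging
            running = 0.0
    return sum(batch_loss) / max(len(batch_loss), 1)
```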