2 changes: 1 addition & 1 deletion .gitignore
@@ -29,7 +29,7 @@ python/dist
python/FedML.egg-info

doc/deploy
doc/en/_build
doc/en/_build/doctrees


*.h5
3 changes: 1 addition & 2 deletions app/fedcv/image_classification/config/fedml_config.yaml
@@ -22,9 +22,8 @@ train_args:
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
weight_decay: 0.001
metric: "prc-auc"

validation_args:
frequency_of_the_test: 5
@@ -21,14 +21,15 @@ def train(self, train_data, device, args):

criterion = nn.CrossEntropyLoss().to(device)
if args.client_optimizer == "sgd":
optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
else:
optimizer = torch.optim.Adam(
filter(lambda p: p.requires_grad, model.parameters()),
lr=args.learning_rate,
lr=args.lr,
weight_decay=args.weight_decay,
amsgrad=True,
)

epoch_loss = []
for epoch in range(args.epochs):
batch_loss = []
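The config rename and the trainer change above have to land together: the example trainers read hyperparameters as attributes on `args`, so once the trainer asks for `args.lr`, the YAML must define `lr` rather than `learning_rate`. A minimal sketch of that assumption follows; the loader below is illustrative only, not the framework's actual argument parser.

```python
# Illustrative only: mimics how a YAML train_args block ends up as attributes
# on an args object, so the key name must match what the trainer reads.
import torch
import yaml


class Args:
    pass


def load_train_args(path):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    args = Args()
    for key, value in cfg["train_args"].items():
        setattr(args, key, value)  # "lr: 0.03" becomes args.lr == 0.03
    return args


args = load_train_args("config/fedml_config.yaml")
model = torch.nn.Linear(10, 2)
# After this PR the trainer uses args.lr, so a config that still said
# "learning_rate" would raise AttributeError on the next line.
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
```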
8 changes: 5 additions & 3 deletions app/fedcv/image_segmentation/config/fedml_config.yaml
@@ -21,9 +21,11 @@ train_args:
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
momentum: 0.9
nesterov: true
weight_decay: 0.001
metric: "prc-auc"
loss_type: "ce"

validation_args:
frequency_of_the_test: 5
@@ -43,4 +45,4 @@ tracking_args:
enable_wandb: false
wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
wandb_project: fedml
wandb_name: fedml_torch_image_classification
wandb_name: fedml_torch_image_segmentation
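The segmentation config now carries `momentum`, `nesterov`, and `loss_type` alongside `lr` and `weight_decay`. How the segmentation trainer consumes these keys is not shown in this diff; the sketch below is only a plausible mapping onto a PyTorch optimizer and loss, not the repository's actual code.

```python
# Hypothetical mapping of the new YAML keys onto PyTorch objects; the real
# image_segmentation trainer may construct its optimizer and loss differently.
import torch
from torch import nn


def build_optimizer_and_loss(model, args):
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=args.lr,                     # renamed from learning_rate in this PR
        momentum=args.momentum,         # new key: 0.9
        nesterov=args.nesterov,         # new key: true
        weight_decay=args.weight_decay,
    )
    # loss_type "ce" selects plain cross-entropy in this sketch; anything else
    # falls back to a binary-segmentation loss.
    criterion = nn.CrossEntropyLoss() if args.loss_type == "ce" else nn.BCEWithLogitsLoss()
    return optimizer, criterion
```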
@@ -10,7 +10,7 @@ data_args:

model_args:
model: "densenet"
class_num: 1000
class_num: 14

train_args:
federated_optimizer: "FedAvg"
@@ -21,9 +21,8 @@ train_args:
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
weight_decay: 0.001
metric: "prc-auc"

validation_args:
frequency_of_the_test: 5
7 changes: 3 additions & 4 deletions app/fedcv/object_detection/config/fedml_config.yaml
@@ -9,7 +9,7 @@
partition_alpha: 0.5

model_args:
model: "unet"
model: "yolo"
class_num: 1000

train_args:
@@ -21,9 +21,8 @@
epochs: 5
batch_size: 64
client_optimizer: sgd
learning_rate: 0.03
lr: 0.03
weight_decay: 0.001
metric: "prc-auc"

validation_args:
frequency_of_the_test: 5
@@ -43,4 +42,4 @@
enable_wandb: false
wandb_key: ee0b5f53d949c84cee7decbe7a629e63fb2f8408
wandb_project: fedml
wandb_name: fedml_torch_image_classification
wandb_name: fedml_torch_object_detection
25 changes: 15 additions & 10 deletions app/fednlp/README.md
@@ -35,62 +35,67 @@ For each of these make sure the datapaths and the gpu config paths are given cor

**TEXT CLASSIFICATION**

Read data/README.md for more details of datasets available
Read `data/README.md` for more details of datasets available

Adjust the hyperparameters in fednlp/text_classification/config/fedml_config.yaml

To run text classification using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/text_classification/run_step_by_step_example.sh

```

**SEQ TAGGING**

Read data/README.md for more details of datasets available
Read `data/README.md` for more details of datasets available

Adjust the hyperparameters in fednlp/seq_tagging/config/fedml_config.yaml

To run sequence tagging on Onto dataset using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/seq_tagging/run_step_by_step_example.sh

```

**SPAN EXTRACTION**

Adjust the hyperparameters in fednlp/span_extraction/config/fedml_config.yaml and make sure data file paths are correct
Adjust the hyperparameters in `fednlp/span_extraction/config/fedml_config.yaml` and make sure data file paths are correct

To run span extraction on MRQA dataset using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/span_extraction/run_step_by_step_example.sh
```


**SEQ2SEQ**

Read data/README.md for more details of datasets available
Read `data/README.md` for more details of datasets available

Adjust the hyperparameters in fednlp/seq2seq/config/fedml_config.yaml
Adjust the hyperparameters in `fednlp/seq2seq/config/fedml_config.yaml` and make sure data file paths are correct

To run seq2seq using MPI simulator follow the following steps:

```bash
1. cd ../
2. bash fednlp/data/download_data.sh
3. bash fednlp/data/download_partition.sh
4. bash fednlp/seq2seq/run_step_by_step_example.sh
```


We have provided examples of trainers in each example. For running custom trainers feel free to follow the folder {application_name}/trainer/ and write your own custom trainer. To include this trainer please follow the create_model function in the python executable in the folder {application_name}/ and replace the current trainer with your own trainer. Every trainer should inherit the Client Trainer class and should contain a train and a test function.
We have provided examples of trainers in each example. For running custom trainers feel free to follow the folder `{application_name}/trainer/` and write your own custom trainer. To include this trainer please follow the create_model function in the python executable in the folder `{application_name}/` and replace the current trainer with your own trainer. Every trainer should inherit the Client Trainer class and should contain a train and a test function.
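A bare-bones custom trainer along those lines might look like the sketch below. It assumes the `ClientTrainer` base class is importable from `fedml.core` and that it expects `get_model_params`/`set_model_params` in addition to `train` and `test`, as the bundled example trainers do; check the trainers under `{application_name}/trainer/` for the exact interface.

```python
# Hypothetical custom trainer; the import path, required methods, and batch
# format (dicts of tensors) are assumptions based on the bundled examples.
import torch

from fedml.core import ClientTrainer  # import path assumed


class MyCustomTrainer(ClientTrainer):
    def get_model_params(self):
        return self.model.cpu().state_dict()

    def set_model_params(self, model_parameters):
        self.model.load_state_dict(model_parameters)

    def train(self, train_data, device, args):
        model = self.model.to(device)
        model.train()
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
        for epoch in range(args.epochs):
            for batch in train_data:
                inputs = {k: v.to(device) for k, v in batch.items()}
                optimizer.zero_grad()
                loss = model(**inputs)[0]  # HF models return the loss first when labels are passed
                loss.backward()
                optimizer.step()

    def test(self, test_data, device, args):
        model = self.model.to(device)
        model.eval()
        total_loss, n_batches = 0.0, 0
        with torch.no_grad():
            for batch in test_data:
                inputs = {k: v.to(device) for k, v in batch.items()}
                total_loss += model(**inputs)[0].item()
                n_batches += 1
        return {"eval_loss": total_loss / max(n_batches, 1)}
```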


We have provided examples with BERT and DistilBert for text classification, seq tagging and span extraction and BART for Seq2Seq. For using any other model from Huggingface Transformers please look at the create_model function in the python executable in the folder {application_name}/. Also please ensure that you are using the correct tokenizer in {application_name}/data/data_loader.py
We have provided examples with BERT and DistilBert for text classification, seq tagging and span extraction and BART for Seq2Seq. For using any other model from Huggingface Transformers please look at the create_model function in the python executable in the folder `{application_name}/`. Also please ensure that you are using the correct tokenizer in `{application_name}/data/data_loader.py`
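If you swap in a different Hugging Face checkpoint, the rough shape of the change is sketched below; the actual `create_model` signature differs per application, so treat the function name, arguments, and return values here as placeholders.

```python
# Hypothetical create_model variant using the transformers Auto* classes; the
# real per-application create_model functions have different signatures.
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer


def create_model(model_name, num_labels):
    # e.g. model_name = "roberta-base" instead of "bert-base-uncased"
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    # Keep the tokenizer in {application_name}/data/data_loader.py consistent
    # with the checkpoint, otherwise token ids will not match the model.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return config, model, tokenizer
```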


* Here {application_name} refers to any one of text_classification, span_extraction, seq_tagging or seq2seq.
29 changes: 15 additions & 14 deletions app/fednlp/seq2seq/config/fedml_config.yaml
@@ -19,22 +19,23 @@ model_args:
train_args:
federated_optimizer: "FedAvg"
client_id_list: "[]"
client_num_in_total: 10
client_num_per_round: 4
client_num_in_total: 100
client_num_per_round: 1
comm_round: 1
epochs: 1
batch_size: 8
eval_batch_size: 8
max_seq_length: 128
epochs: 5
batch_size: 4
eval_batch_size: 4
max_seq_length: 256
fp16: false
output_dir: "home/ubuntu/output_dir"
client_optimizer: AdamW
client_optimizer: sgd
server_optimizer: sgd
server_lr: 0.01
learning_rate: 0.03
server_lr: 0.1
learning_rate: 0.0001
weight_decay: 0.001
gradient_accumulation_steps: 1
clip_grad_norm: 0
clip_grad_norm: true
max_grad_norm: 1
fedprox_mu: 1
evaluate_during_training: false
evaluate_during_training_steps: 10
@@ -45,10 +46,10 @@
frequency_of_the_test: 5

device_args:
worker_num: 4
using_gpu: false
gpu_mapping_file: config/gpu_mapping.yaml
gpu_mapping_key: mapping_default
worker_num: 1
using_gpu: true
gpu_mapping_file: /home/ubuntu/FedML/app/fednlp/seq2seq/config/gpu_mapping.yaml
gpu_mapping_key: mapping_fednlp_sp

comm_args:
backend: "MPI"
5 changes: 3 additions & 2 deletions app/fednlp/seq2seq/config/gpu_mapping.yaml
@@ -56,6 +56,7 @@ mapping_FedML_gRPC:
mapping_FedML_tRPC:
lambda-server1: [0, 0, 0, 0, 2, 2, 1, 1]
lambda-server2: [2, 1, 1, 1, 0, 0, 0, 0]

mapping_fednlp_sp:
hostname_node_1: [2]
#mapping_FedML_tRPC:
# lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
# lambda-server1: [0, 0, 0, 0, 3, 3, 3, 2]
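Judging from the entries in this file, each mapping key names a host and gives one integer per GPU: the number of MPI processes (the `worker_num` clients plus the aggregation server) to pin to that GPU, so the list should sum to `worker_num + 1`. A hypothetical two-GPU variant of the new entry, with placeholder hostname:

```yaml
# Hypothetical example only; hostname and GPU split are placeholders.
mapping_fednlp_sp_2gpu:
  node-a: [1, 1]   # 2 processes total (1 server + 1 client), one per GPU
```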
2 changes: 1 addition & 1 deletion app/fednlp/seq2seq/run_step_by_step_example.sh
@@ -8,4 +8,4 @@ hostname > mpi_host_file

$(which mpirun) -np $PROCESS_NUM \
-hostfile mpi_host_file \
/home/ubuntu/fednlp_migration/bin/python3.8 -m fednlp.seq2seq.torch_fedavg_20news_bert_step_by_step_example --cf fednlp/seq2seq/config/fedml_config.yaml
/home/ubuntu/fedml/bin/python3.8 -m fednlp.seq2seq.torch_fedavg_20news_bert_step_by_step_example --cf fednlp/seq2seq/config/fedml_config.yaml
@@ -58,6 +58,7 @@ def create_model(args, device, output_dim=1):
"output_dir": args.output_dir,
"is_debug_mode": args.is_debug_mode,
"fedprox_mu": args.fedprox_mu,
"optimizer": args.client_optimizer,
}
)

@@ -69,6 +70,7 @@ def create_model(args, device, output_dim=1):
model_config = {}
config = config_class.from_pretrained(args.model, **model_config)
model = model_class.from_pretrained(args.model, config=config)
print("reached_here")
trainer = MySSTrainer(model_args, device, model, tokenizer=tokenizer)
return model, trainer

61 changes: 32 additions & 29 deletions app/fednlp/seq2seq/trainer/seq2seq_trainer.py
@@ -19,6 +19,7 @@
AdamW,
get_linear_schedule_with_warmup,
)
from tqdm import tqdm


class MyModelTrainer(ClientTrainer):
@@ -133,10 +134,10 @@ def train(self, train_data, device, args, test_data=None):

if self.args.fl_algorithm == "FedProx":
global_model = copy.deepcopy(self.model)

epoch_loss = []
for epoch in range(0, args.epochs):

for batch_idx, batch in enumerate(train_data):
batch_loss = []
for batch_idx, batch in tqdm(enumerate(train_data)):
self.model.train()
# batch = tuple(t.to(device) for t in batch)
# dataset = TensorDataset(all_guid, all_input_ids, all_attention_masks, all_token_type_ids, all_cls_index,
@@ -146,17 +147,14 @@

if args.fp16:
with amp.autocast():
print("reached here")
outputs = self.model(**inputs)
# model outputs are always tuple in pytorch-transformers (see doc)
loss = outputs[0]
print("reached here")
else:
print("reached here")
outputs = self.model(**inputs)
# model outputs are always tuple in pytorch-transformers (see doc)
loss = outputs[0]
print("reached here")

if args.n_gpu > 1:
loss = (
@@ -173,22 +171,19 @@
loss += fed_prox_reg

current_loss = loss.item()
print("reached here")

if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
print("reached here")
if args.fp16:
scaler.scale(loss).backward()
else:
loss.backward()
print("reached here")
tr_loss += loss.item()

logging.info(
"epoch = %d, batch_idx = %d/%d, loss = %s"
% (epoch, batch_idx, len(train_data), current_loss)
)
# logging.info(
# "epoch = %d, batch_idx = %d/%d, loss = %s"
# % (epoch, batch_idx, len(train_data), current_loss)
# )

if (batch_idx + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
@@ -205,17 +200,25 @@
scheduler.step() # Update learning rate schedule
self.model.zero_grad()
global_step += 1
batch_loss.append(tr_loss)
tr_loss = 0

if (
self.args.evaluate_during_training
and (
self.args.evaluate_during_training_steps > 0
and global_step % self.args.evaluate_during_training_steps == 0
)
and test_data is not None
):
results, _, _ = self.test(test_data, device, args)
logging.info(results)
# epoch_loss.append(sum(batch_loss) / len(batch_loss))
logging.info(
"Client Index = {}\tEpoch: {}\tLoss: {:.6f}".format(
self.id, epoch, sum(batch_loss) / len(batch_loss)
)
)
if (
self.args.evaluate_during_training
and (
self.args.evaluate_during_training_steps > 0
and global_step % self.args.evaluate_during_training_steps == 0
)
and test_data is not None
):
results, _, _ = self.test(test_data, device, args)
logging.info(results)

def test(self, test_data, device, args):

@@ -244,7 +247,7 @@ def test(self, test_data, device, args):
logging.info("len(test_dl) = %d, n_batches = %d" % (len(test_data), n_batches))
for i, batch in enumerate(test_data):
# batch = tuple(t for t in batch)
inputs = self._get_inputs_dict(batch)
inputs = self._get_inputs_dict(batch, device)
with torch.no_grad():
outputs = self.model(**inputs)
tmp_eval_loss = outputs[0]
@@ -285,10 +288,10 @@
if i != (n_batches - 1)
else test_sample_len
)
logging.info(
"batch index = %d, start_index = %d, end_index = %d"
% (i, start_index, end_index)
)
# logging.info(
# "batch index = %d, start_index = %d, end_index = %d"
# % (i, start_index, end_index)
# )

eval_loss = eval_loss / nb_eval_steps
rouge_score = rouge_score / nb_eval_steps
@@ -315,7 +318,7 @@
# result = self.compute_metrics(references, model_preds)
# self.results.update(result)

logging.info(self.results)
# logging.info(self.results)

return result, model_preds, None

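The training loop above combines gradient accumulation, optional fp16 scaling, and gradient clipping driven by the `gradient_accumulation_steps`, `clip_grad_norm`, and `max_grad_norm` keys in `fedml_config.yaml`. The standalone sketch below restates that accumulate-then-step pattern outside the trainer class; it is a simplified illustration under the assumption of dict-style batches, not a drop-in replacement for `MyModelTrainer.train`.

```python
# Simplified restatement of the accumulate-then-step pattern used above.
import torch


def run_epoch(model, train_data, optimizer, args, device):
    model.train()
    batch_loss, running = [], 0.0
    for batch_idx, batch in enumerate(train_data):
        inputs = {k: v.to(device) for k, v in batch.items()}  # assumes dict batches
        loss = model(**inputs)[0]
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps  # keep gradient scale comparable
        loss.backward()
        running += loss.item()
        if (batch_idx + 1) % args.gradient_accumulation_steps == 0:
            if args.clip_grad_norm:  # now a boolean in the seq2seq config
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            model.zero_grad()
            batch_loss.append(running)  # per-optimizer-step loss, as in the new logging
            running = 0.0
    return sum(batch_loss) / max(len(batch_loss), 1)
```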