From 39a22ce6a19dac8f6c79761c6dd41442babfea76 Mon Sep 17 00:00:00 2001
From: wizard1203 <956234606@qq.com>
Date: Fri, 8 Jan 2021 12:09:39 +0800
Subject: [PATCH 1/2] add one more example of gpu_util usage

---
 fedml_experiments/distributed/fed_launch/README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fedml_experiments/distributed/fed_launch/README.md b/fedml_experiments/distributed/fed_launch/README.md
index 59e59af55e..0573984e6c 100644
--- a/fedml_experiments/distributed/fed_launch/README.md
+++ b/fedml_experiments/distributed/fed_launch/README.md
@@ -47,6 +47,17 @@ config_11:
 This example is also used for 11 process. But the mapping is different:
 Server process -- host1:GPU:0, client 1 -- host1:GPU:0, client 2 -- host1:GPU:1, client 3 -- host1:GPU:1, client 4 -- host2:GPU:0, client 5 -- host2:GPU:1, client 6 -- host2:GPU:2, client 7 -- host3:GPU:0, client 8 -- host3:GPU:1, client 9 -- host3:GPU:2, client 10 -- host3:GPU:3
 
+Sometimes one may want to use only some of the GPUs on a machine, instead of all of them. In that case, you can use a mapping like this:
+```
+config_11:
+    host1: [0, 2]
+    host2: [1, 0, 1]
+    host3: [1, 1, 0, 1]
+    host4: [0, 1, 0, 0, 0, 1, 0, 2]
+```
+Now the mapping becomes: Server process -- host1:GPU:1, client 1 -- host1:GPU:1, client 2 -- host2:GPU:0, client 3 -- host2:GPU:2, client 4 -- host3:GPU:0, client 5 -- host3:GPU:1, client 6 -- host3:GPU:3, client 7 -- host4:GPU:1, client 8 -- host4:GPU:5, client 9 -- host4:GPU:7, client 10 -- host4:GPU:7.
+
+
 And you also can add many mappings in one yaml file like this:
 ```
 config_11:

From 4359c0f4b1d70c2622638f40079a48c89d6c2ad9 Mon Sep 17 00:00:00 2001
From: wizard1203 <956234606@qq.com>
Date: Fri, 8 Jan 2021 12:12:02 +0800
Subject: [PATCH 2/2] fix launch/main.py

---
 .../distributed/fed_launch/main.py | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/fedml_experiments/distributed/fed_launch/main.py b/fedml_experiments/distributed/fed_launch/main.py
index 7e151aa6d0..d3b6cc300e 100644
--- a/fedml_experiments/distributed/fed_launch/main.py
+++ b/fedml_experiments/distributed/fed_launch/main.py
@@ -467,31 +467,6 @@ def init_training_device_from_gpu_util_file(process_id, worker_number, gpu_util_
         FedML_FedAvg_distributed(process_id, worker_number, device, comm,
                                  model, train_data_num, train_data_global, test_data_global,
                                  train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args)
-    elif args.algorithm == 'PSGD':
-        from fedml_api.distributed.PSGD.PSGD_API import FedML_init, FedML_PSGD_distributed
-        FedML_PSGD_distributed(process_id, worker_number, device, comm,
-                               model, train_data_num, train_data_global, test_data_global,
-                               train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args)
-    elif args.algorithm == 'DPSGD':
-        from fedml_api.distributed.DPSGD.DPSGD_API import FedML_init, FedML_DPSGD
-        FedML_DPSGD(process_id, worker_number, device, comm,
-                    model, train_data_num, train_data_global, test_data_global,
-                    train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args)
-    elif args.algorithm == 'DCD_PSGD':
-        from fedml_api.distributed.DCD_PSGD.DCD_PSGD_API import FedML_init, FedML_DCD_PSGD
-        FedML_DCD_PSGD(process_id, worker_number, device, comm,
-                       model, train_data_num, train_data_global, test_data_global,
-                       train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args)
-    elif args.algorithm == 'CHOCO_SGD':
-        from fedml_api.distributed.CHOCO_SGD.CHOCO_SGD_API import FedML_init, FedML_CHOCO_SGD
-        FedML_CHOCO_SGD(process_id, worker_number, device, comm,
-                        model, train_data_num, train_data_global, test_data_global,
-                        train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args)
-    elif args.algorithm == 'CHOCO_SGD':
-        from fedml_api.distributed.SAPS_FL.SAPS_FL_API import FedML_init, FedML_SAPS_FL
-        FedML_SAPS_FL(process_id, worker_number, device, comm,
-                      model, train_data_num, train_data_global, test_data_global,
-                      train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args)
     else:
         raise NotImplementedError
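
The gpu_util mapping documented in PATCH 1/2 can be sanity-checked with a short sketch. This is only an illustration of the list semantics shown in the README examples (each host maps to a list whose i-th entry is the number of processes placed on GPU i, with ranks assigned host by host in the order listed); the helper name `expand_gpu_util_mapping` is hypothetical and is not part of FedML or of this patch.
```
# Minimal sketch (hypothetical, not part of this patch): expand a gpu_util
# mapping of the form {host: [procs_on_gpu_0, procs_on_gpu_1, ...]} into a
# rank -> (host, gpu_id) assignment, following the README examples above.
def expand_gpu_util_mapping(gpu_util):
    assignment = []
    for host, procs_per_gpu in gpu_util.items():
        for gpu_id, num_procs in enumerate(procs_per_gpu):
            # GPU i on this host receives the next `num_procs` ranks.
            assignment.extend([(host, gpu_id)] * num_procs)
    return assignment


if __name__ == "__main__":
    config_11 = {
        "host1": [0, 2],
        "host2": [1, 0, 1],
        "host3": [1, 1, 0, 1],
        "host4": [0, 1, 0, 0, 0, 1, 0, 2],
    }
    # Rank 0 is the server process; ranks 1..10 are clients 1..10.
    for rank, (host, gpu_id) in enumerate(expand_gpu_util_mapping(config_11)):
        role = "Server process" if rank == 0 else "client %d" % rank
        print("%s -- %s:GPU:%d" % (role, host, gpu_id))
```
Running it prints the same rank-to-device assignment described in the README hunk, which is a quick way to check a new mapping before launching the processes.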